lib/unicase/u-ct-totitle.h

   1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   2    Copyright (C) 2009-2014 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 /* Quoting the Unicode standard, section "Default Case Algorithms":
  19      Find the word boundaries in X according to Unicode Standard Annex #29,
  20      “Text Boundaries.” For each word boundary, find the first cased character
  21      F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
  22      then map all characters C between F and the following word boundary to
  23      Lowercase_Mapping(C).  */
  24
  25 UNIT *
  26 FUNC (const UNIT *s, size_t n,
  27       casing_prefix_context_t prefix_context,
  28       casing_suffix_context_t suffix_context,
  29       const char *iso639_language,
  30       uninorm_t nf,
  31       UNIT *resultbuf, size_t *lengthp)
  32 {
  33   /* The result being accumulated.  */
  34   UNIT *result;
  35   size_t length;
  36   size_t allocated;
  37   /* An array containing the word break positions.  */
  38   char *wordbreaks;
  39
  40   /* Initialize the accumulator.  */
  41   if (nf != NULL || resultbuf == NULL)
  42     {
  43       result = NULL;
  44       allocated = 0;
  45     }
  46   else
  47     {
  48       result = resultbuf;
  49       allocated = *lengthp;
  50     }
  51   length = 0;
  52
  53   /* Initialize the word breaks array.  */
  54   if (n > 0)
  55     {
  56       wordbreaks = (char *) malloc (n);
  57       if (wordbreaks == NULL)
  58         {
  59           errno = ENOMEM;
  60           goto fail2;
  61         }
  62       U_WORDBREAKS (s, n, wordbreaks);
  63     }
  64   else
  65     wordbreaks = NULL;
  66
  67   {
  68     const UNIT *s_end = s + n;
  69     const char *wp = wordbreaks;
  70
  71     /* When considering the string as segmented by word boundaries: For each
  72        such segment:
  73         - In the first part, we are searching for the first cased character.
  74           In this state, in_word_first_part = true, and no conversion takes
  75           place.
  76         - In the second part, we are converting every character: the first
  77           among these characters to title case, the other ones to lower case.
  78           In this state, in_word_first_part = false.  */
  79     bool in_word_first_part = true;
  80
  81     /* Helper for evaluating the FINAL_SIGMA condition:
  82        Last character that was not case-ignorable.  */
  83     ucs4_t last_char_except_ignorable =
  84       prefix_context.last_char_except_ignorable;
  85
  86     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
  87        Last character that was of combining class 230 ("Above") or 0.  */
  88     ucs4_t last_char_normal_or_above =
  89       prefix_context.last_char_normal_or_above;
  90
  91     while (s < s_end)
  92       {
  93         /* Fetch the next character.  */
  94         ucs4_t uc;
  95         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
  96
  97         ucs4_t (*single_character_map) (ucs4_t);
  98         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
  99
 100         ucs4_t mapped_uc[3];
 101         unsigned int mapped_count;
 102
 103         if (*wp)
 104           /* Crossing a word boundary.  */
 105           in_word_first_part = true;
 106
 107         /* Determine single_character_map, offset_in_rule.
 108            There are three possibilities:
 109              - uc should not be converted.
 110              - uc should be titlecased.
 111              - uc should be lowercased.  */
 112         if (in_word_first_part)
 113           {
 114             if (uc_is_cased (uc))
 115               {
 116                 /* uc is to be titlecased.  */
 117                 single_character_map = uc_totitle;
 118                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
 119                 in_word_first_part = false;
 120               }
 121             else
 122               {
 123                 /* uc is not converted.  */
 124                 single_character_map = NULL;
 125                 offset_in_rule = 0;
 126               }
 127           }
 128         else
 129           {
 130             /* uc is to be lowercased.  */
 131             single_character_map = uc_tolower;
 132             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
 133           }
 134
 135         /* Actually map uc.  */
 136         if (single_character_map == NULL)
 137           {
 138             mapped_uc[0] = uc;
 139             mapped_count = 1;
 140             goto found_mapping;
 141           }
 142
 143         if (uc < 0x10000)
 144           {
 145             /* Look first in the special-casing table.  */
 146             char code[3];
 147
 148             code[0] = (uc >> 8) & 0xff;
 149             code[1] = uc & 0xff;
 150
 151             for (code[2] = 0; ; code[2]++)
 152               {
 153                 const struct special_casing_rule *rule =
 154                   gl_unicase_special_lookup (code, 3);
 155
 156                 if (rule == NULL)
 157                   break;
 158
 159                 /* Test if the condition applies.  */
 160                 /* Does the language apply?  */
 161                 if (rule->language[0] == '\0'
 162                     || (iso639_language != NULL
 163                         && iso639_language[0] == rule->language[0]
 164                         && iso639_language[1] == rule->language[1]))
 165                   {
 166                     /* Does the context apply?  */
 167                     int context = rule->context;
 168                     bool applies;
 169
 170                     if (context < 0)
 171                       context = - context;
 172                     switch (context)
 173                       {
 174                       case SCC_ALWAYS:
 175                         applies = true;
 176                         break;
 177
 178                       case SCC_FINAL_SIGMA:
 179                         /* "Before" condition: preceded by a sequence
 180                            consisting of a cased letter and a case-ignorable
 181                            sequence.
 182                            "After" condition: not followed by a sequence
 183                            consisting of a case-ignorable sequence and then a
 184                            cased letter.  */
 185                         /* Test the "before" condition.  */
 186                         applies = uc_is_cased (last_char_except_ignorable);
 187                         /* Test the "after" condition.  */
 188                         if (applies)
 189                           {
 190                             const UNIT *s2 = s + count;
 191                             for (;;)
 192                               {
 193                                 if (s2 < s_end)
 194                                   {
 195                                     ucs4_t uc2;
 196                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 197                                     /* Our uc_is_case_ignorable function is
 198                                        known to return false for all cased
 199                                        characters.  So we can call
 200                                        uc_is_case_ignorable first.  */
 201                                     if (!uc_is_case_ignorable (uc2))
 202                                       {
 203                                         applies = ! uc_is_cased (uc2);
 204                                         break;
 205                                       }
 206                                     s2 += count2;
 207                                   }
 208                                 else
 209                                   {
 210                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
 211                                     break;
 212                                   }
 213                               }
 214                           }
 215                         break;
 216
 217                       case SCC_AFTER_SOFT_DOTTED:
 218                         /* "Before" condition: There is a Soft_Dotted character
 219                            before it, with no intervening character of
 220                            combining class 0 or 230 (Above).  */
 221                         /* Test the "before" condition.  */
 222                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
 223                         break;
 224
 225                       case SCC_MORE_ABOVE:
 226                         /* "After" condition: followed by a character of
 227                            combining class 230 (Above) with no intervening
 228                            character of combining class 0 or 230 (Above).  */
 229                         /* Test the "after" condition.  */
 230                         {
 231                           const UNIT *s2 = s + count;
 232                           applies = false;
 233                           for (;;)
 234                             {
 235                               if (s2 < s_end)
 236                                 {
 237                                   ucs4_t uc2;
 238                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 239                                   int ccc = uc_combining_class (uc2);
 240                                   if (ccc == UC_CCC_A)
 241                                     {
 242                                       applies = true;
 243                                       break;
 244                                     }
 245                                   if (ccc == UC_CCC_NR)
 246                                     break;
 247                                   s2 += count2;
 248                                 }
 249                               else
 250                                 {
 251                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
 252                                   break;
 253                                 }
 254                             }
 255                         }
 256                         break;
 257
 258                       case SCC_BEFORE_DOT:
 259                         /* "After" condition: followed by COMBINING DOT ABOVE
 260                            (U+0307). Any sequence of characters with a
 261                            combining class that is neither 0 nor 230 may
 262                            intervene between the current character and the
 263                            combining dot above.  */
 264                         /* Test the "after" condition.  */
 265                         {
 266                           const UNIT *s2 = s + count;
 267                           applies = false;
 268                           for (;;)
 269                             {
 270                               if (s2 < s_end)
 271                                 {
 272                                   ucs4_t uc2;
 273                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 274                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
 275                                     {
 276                                       applies = true;
 277                                       break;
 278                                     }
 279                                   {
 280                                     int ccc = uc_combining_class (uc2);
 281                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 282                                       break;
 283                                   }
 284                                   s2 += count2;
 285                                 }
 286                               else
 287                                 {
 288                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
 289                                   break;
 290                                 }
 291                             }
 292                         }
 293                         break;
 294
 295                       case SCC_AFTER_I:
 296                         /* "Before" condition: There is an uppercase I before
 297                            it, and there is no intervening character of
 298                            combining class 0 or 230 (Above).  */
 299                         /* Test the "before" condition.  */
 300                         applies = (last_char_normal_or_above == 'I');
 301                         break;
 302
 303                       default:
 304                         abort ();
 305                       }
 306                     if (rule->context < 0)
 307                       applies = !applies;
 308
 309                     if (applies)
 310                       {
 311                         /* The rule applies.
 312                            Look up the mapping (0 to 3 characters).  */
 313                         const unsigned short *mapped_in_rule =
 314                           (const unsigned short *)((const char *)rule + offset_in_rule);
 315
 316                         if (mapped_in_rule[0] == 0)
 317                           mapped_count = 0;
 318                         else
 319                           {
 320                             mapped_uc[0] = mapped_in_rule[0];
 321                             if (mapped_in_rule[1] == 0)
 322                               mapped_count = 1;
 323                             else
 324                               {
 325                                 mapped_uc[1] = mapped_in_rule[1];
 326                                 if (mapped_in_rule[2] == 0)
 327                                   mapped_count = 2;
 328                                 else
 329                                   {
 330                                     mapped_uc[2] = mapped_in_rule[2];
 331                                     mapped_count = 3;
 332                                   }
 333                               }
 334                           }
 335                         goto found_mapping;
 336                       }
 337                   }
 338
 339                 /* Optimization: Save a hash table lookup in the next round.  */
 340                 if (!rule->has_next)
 341                   break;
 342               }
 343           }
 344
 345         /* No special-cased mapping.  So use the locale and context independent
 346            mapping.  */
 347         mapped_uc[0] = single_character_map (uc);
 348         mapped_count = 1;
 349
 350        found_mapping:
 351         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
 352         {
 353           unsigned int i;
 354
 355           for (i = 0; i < mapped_count; i++)
 356             {
 357               ucs4_t muc = mapped_uc[i];
 358
 359               /* Append muc to the result accumulator.  */
 360               if (length < allocated)
 361                 {
 362                   int ret = U_UCTOMB (result + length, muc, allocated - length);
 363                   if (ret == -1)
 364                     {
 365                       errno = EINVAL;
 366                       goto fail1;
 367                     }
 368                   if (ret >= 0)
 369                     {
 370                       length += ret;
 371                       goto done_appending;
 372                     }
 373                 }
 374               {
 375                 size_t old_allocated = allocated;
 376                 size_t new_allocated = 2 * old_allocated;
 377                 if (new_allocated < 64)
 378                   new_allocated = 64;
 379                 if (new_allocated < old_allocated) /* integer overflow? */
 380                   abort ();
 381                 {
 382                   UNIT *larger_result;
 383                   if (result == NULL)
 384                     {
 385                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 386                       if (larger_result == NULL)
 387                         {
 388                           errno = ENOMEM;
 389                           goto fail1;
 390                         }
 391                     }
 392                   else if (result == resultbuf)
 393                     {
 394                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 395                       if (larger_result == NULL)
 396                         {
 397                           errno = ENOMEM;
 398                           goto fail1;
 399                         }
 400                       U_CPY (larger_result, resultbuf, length);
 401                     }
 402                   else
 403                     {
 404                       larger_result =
 405                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
 406                       if (larger_result == NULL)
 407                         {
 408                           errno = ENOMEM;
 409                           goto fail1;
 410                         }
 411                     }
 412                   result = larger_result;
 413                   allocated = new_allocated;
 414                   {
 415                     int ret = U_UCTOMB (result + length, muc, allocated - length);
 416                     if (ret == -1)
 417                       {
 418                         errno = EINVAL;
 419                         goto fail1;
 420                       }
 421                     if (ret < 0)
 422                       abort ();
 423                     length += ret;
 424                     goto done_appending;
 425                   }
 426                 }
 427               }
 428              done_appending: ;
 429             }
 430         }
 431
 432         if (!uc_is_case_ignorable (uc))
 433           last_char_except_ignorable = uc;
 434
 435         {
 436           int ccc = uc_combining_class (uc);
 437           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 438             last_char_normal_or_above = uc;
 439         }
 440
 441         s += count;
 442         wp += count;
 443       }
 444   }
 445
 446   free (wordbreaks);
 447
 448   if (nf != NULL)
 449     {
 450       /* Finally, normalize the result.  */
 451       UNIT *normalized_result;
 452
 453       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
 454       if (normalized_result == NULL)
 455         goto fail2;
 456
 457       free (result);
 458       return normalized_result;
 459     }
 460
 461   if (length == 0)
 462     {
 463       if (result == NULL)
 464         {
 465           /* Return a non-NULL value.  NULL means error.  */
 466           result = (UNIT *) malloc (1);
 467           if (result == NULL)
 468             {
 469               errno = ENOMEM;
 470               goto fail2;
 471             }
 472         }
 473     }
 474   else if (result != resultbuf && length < allocated)
 475     {
 476       /* Shrink the allocated memory if possible.  */
 477       UNIT *memory;
 478
 479       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
 480       if (memory != NULL)
 481         result = memory;
 482     }
 483
 484   *lengthp = length;
 485   return result;
 486
 487  fail1:
 488   {
 489     int saved_errno = errno;
 490     free (wordbreaks);
 491     errno = saved_errno;
 492   }
 493  fail2:
 494   if (result != resultbuf)
 495     {
 496       int saved_errno = errno;
 497       free (result);
 498       errno = saved_errno;
 499     }
 500   return NULL;
 501 }