lib/unicase/u-totitle.h

   1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
   2    Copyright (C) 2009 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 /* Quoting the Unicode standard:
  19      Definition: A character is defined to be "cased" if it has the Lowercase or
  20      Uppercase property or has a General_Category value of Titlecase_Letter.  */
  21 static inline bool
  22 is_cased (ucs4_t uc)
  23 {
  24   return (uc_is_property_lowercase (uc)
  25           || uc_is_property_uppercase (uc)
  26           || uc_is_general_category (uc, UC_TITLECASE_LETTER));
  27 }
  28
  29 /* Quoting the Unicode standard:
  30      Definition: A character is defined to be "case-ignorable" if it has the
  31      value MidLetter {or the value MidNumLet} for the Word_Break property or
  32      its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
  33      Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
  34    The text marked in braces was added in Unicode 5.1.0, see
  35    <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
  36    Definition of case-ignorable".   */
  37 static inline bool
  38 is_case_ignorable (ucs4_t uc)
  39 {
  40   int wbp = uc_wordbreak_property (uc);
  41
  42   return (wbp == WBP_MIDLETTER || wbp == WBP_MIDNUMLET
  43           || uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn
  44                                                    | UC_CATEGORY_MASK_Me
  45                                                    | UC_CATEGORY_MASK_Cf
  46                                                    | UC_CATEGORY_MASK_Lm
  47                                                    | UC_CATEGORY_MASK_Sk));
  48 }
  49
  50 /* Quoting the Unicode standard, section "Default Case Algorithms":
  51      Find the word boundaries in X according to Unicode Standard Annex #29,
  52      “Text Boundaries.” For each word boundary, find the first cased character
  53      F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
  54      then map all characters C between F and the following word boundary to
  55      Lowercase_Mapping(C).  */
  56
  57 UNIT *
  58 FUNC (const UNIT *s, size_t n, const char *iso639_language,
  59       uninorm_t nf,
  60       UNIT *resultbuf, size_t *lengthp)
  61 {
  62   /* The result being accumulated.  */
  63   UNIT *result;
  64   size_t length;
  65   size_t allocated;
  66   /* An array containing the word break positions.  */
  67   char *wordbreaks;
  68
  69   /* Initialize the accumulator.  */
  70   if (nf != NULL || resultbuf == NULL)
  71     {
  72       result = NULL;
  73       allocated = 0;
  74     }
  75   else
  76     {
  77       result = resultbuf;
  78       allocated = *lengthp;
  79     }
  80   length = 0;
  81
  82   /* Initialize the word breaks array.  */
  83   if (n > 0)
  84     {
  85       wordbreaks = (char *) malloc (n);
  86       if (wordbreaks == NULL)
  87         {
  88           errno = ENOMEM;
  89           goto fail2;
  90         }
  91       U_WORDBREAKS (s, n, wordbreaks);
  92     }
  93   else
  94     wordbreaks = NULL;
  95
  96   {
  97     const UNIT *s_end = s + n;
  98     const char *wp = wordbreaks;
  99
 100     /* When considering the string as segmented by word boundaries: For each
 101        such segment:
 102         - In the first part, we are searching for the first cased character.
 103           In this state, in_word_first_part = true, and no conversion takes
 104           place.
 105         - In the second part, we are converting every character: the first
 106           among these characters to title case, the other ones to lower case.
 107           In this state, in_word_first_part = false.  */
 108     bool in_word_first_part = true;
 109
 110     /* Helper for evaluating the FINAL_SIGMA condition:
 111        Last character that was not case-ignorable.  */
 112     ucs4_t last_char_except_ignorable = 0xFFFD;
 113
 114     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
 115        Last character that was of combining class 230 ("Above") or 0.  */
 116     ucs4_t last_char_normal_or_above = 0xFFFD;
 117
 118     while (s < s_end)
 119       {
 120         /* Fetch the next character.  */
 121         ucs4_t uc;
 122         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
 123
 124         ucs4_t (*single_character_map) (ucs4_t);
 125         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
 126
 127         ucs4_t mapped_uc[3];
 128         unsigned int mapped_count;
 129
 130         if (*wp)
 131           /* Crossing a word boundary.  */
 132           in_word_first_part = true;
 133
 134         /* Determine single_character_map, offset_in_rule.
 135            There are three possibilities:
 136              - uc should not be converted.
 137              - uc should be titlecased.
 138              - uc should be lowercased.  */
 139         if (in_word_first_part)
 140           {
 141             if (is_cased (uc))
 142               {
 143                 /* uc is to be titlecased.  */
 144                 single_character_map = uc_totitle;
 145                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
 146                 in_word_first_part = false;
 147               }
 148             else
 149               {
 150                 /* uc is not converted.  */
 151                 single_character_map = NULL;
 152                 offset_in_rule = 0;
 153               }
 154           }
 155         else
 156           {
 157             /* uc is to be lowercased.  */
 158             single_character_map = uc_tolower;
 159             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
 160           }
 161
 162         /* Actually map uc.  */
 163         if (single_character_map == NULL)
 164           {
 165             mapped_uc[0] = uc;
 166             mapped_count = 1;
 167             goto found_mapping;
 168           }
 169
 170         if (uc < 0x10000)
 171           {
 172             /* Look first in the special-casing table.  */
 173             char code[3];
 174
 175             code[0] = (uc >> 8) & 0xff;
 176             code[1] = uc & 0xff;
 177
 178             for (code[2] = 0; ; code[2]++)
 179               {
 180                 const struct special_casing_rule *rule =
 181                   gl_unicase_special_lookup (code, 3);
 182
 183                 if (rule == NULL)
 184                   break;
 185
 186                 /* Test if the condition applies.  */
 187                 /* Does the language apply?  */
 188                 if (rule->language[0] == '\0'
 189                     || (iso639_language != NULL
 190                         && iso639_language[0] == rule->language[0]
 191                         && iso639_language[1] == rule->language[1]))
 192                   {
 193                     /* Does the context apply?  */
 194                     int context = rule->context;
 195                     bool applies;
 196
 197                     if (context < 0)
 198                       context = - context;
 199                     switch (context)
 200                       {
 201                       case SCC_ALWAYS:
 202                         applies = true;
 203                         break;
 204
 205                       case SCC_FINAL_SIGMA:
 206                         /* "Before" condition: preceded by a sequence
 207                            consisting of a cased letter and a case-ignorable
 208                            sequence.
 209                            "After" condition: not followed by a sequence
 210                            consisting of a case-ignorable sequence and then a
 211                            cased letter.  */
 212                         /* Test the "before" condition.  */
 213                         applies = is_cased (last_char_except_ignorable);
 214                         /* Test the "after" condition.  */
 215                         if (applies)
 216                           {
 217                             const UNIT *s2 = s + count;
 218                             while (s2 < s_end)
 219                               {
 220                                 ucs4_t uc2;
 221                                 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 222                                 if (is_cased (uc2))
 223                                   {
 224                                     applies = false;
 225                                     break;
 226                                   }
 227                                 if (!is_case_ignorable (uc2))
 228                                   break;
 229                                 s2 += count2;
 230                               }
 231                           }
 232                         break;
 233
 234                       case SCC_AFTER_SOFT_DOTTED:
 235                         /* "Before" condition: There is a Soft_Dotted character
 236                            before it, with no intervening character of
 237                            combining class 0 or 230 (Above).  */
 238                         /* Test the "before" condition.  */
 239                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
 240                         break;
 241
 242                       case SCC_MORE_ABOVE:
 243                         /* "After" condition: followed by a character of
 244                            combining class 230 (Above) with no intervening
 245                            character of combining class 0 or 230 (Above).  */
 246                         /* Test the "after" condition.  */
 247                         {
 248                           const UNIT *s2 = s + count;
 249                           applies = false;
 250                           while (s2 < s_end)
 251                             {
 252                               ucs4_t uc2;
 253                               int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 254                               int ccc = uc_combining_class (uc2);
 255                               if (ccc == UC_CCC_A)
 256                                 {
 257                                   applies = true;
 258                                   break;
 259                                 }
 260                               if (ccc == UC_CCC_NR)
 261                                 break;
 262                               s2 += count2;
 263                             }
 264                         }
 265                         break;
 266
 267                       case SCC_BEFORE_DOT:
 268                         /* "After" condition: followed by COMBINING DOT ABOVE
 269                            (U+0307). Any sequence of characters with a
 270                            combining class that is neither 0 nor 230 may
 271                            intervene between the current character and the
 272                            combining dot above.  */
 273                         /* Test the "after" condition.  */
 274                         {
 275                           const UNIT *s2 = s + count;
 276                           applies = false;
 277                           while (s2 < s_end)
 278                             {
 279                               ucs4_t uc2;
 280                               int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 281                               if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
 282                                 {
 283                                   applies = true;
 284                                   break;
 285                                 }
 286                               {
 287                                 int ccc = uc_combining_class (uc2);
 288                                 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 289                                   break;
 290                               }
 291                               s2 += count2;
 292                             }
 293                         }
 294                         break;
 295
 296                       case SCC_AFTER_I:
 297                         /* "Before" condition: There is an uppercase I before
 298                            it, and there is no intervening character of
 299                            combining class 0 or 230 (Above).  */
 300                         /* Test the "before" condition.  */
 301                         applies = (last_char_normal_or_above == 'I');
 302                         break;
 303
 304                       default:
 305                         abort ();
 306                       }
 307                     if (rule->context < 0)
 308                       applies = !applies;
 309
 310                     if (applies)
 311                       {
 312                         /* The rule applies.
 313                            Look up the mapping (0 to 3 characters).  */
 314                         const unsigned short *mapped_in_rule =
 315                           (const unsigned short *)((const char *)rule + offset_in_rule);
 316
 317                         if (mapped_in_rule[0] == 0)
 318                           mapped_count = 0;
 319                         else
 320                           {
 321                             mapped_uc[0] = mapped_in_rule[0];
 322                             if (mapped_in_rule[1] == 0)
 323                               mapped_count = 1;
 324                             else
 325                               {
 326                                 mapped_uc[1] = mapped_in_rule[1];
 327                                 if (mapped_in_rule[2] == 0)
 328                                   mapped_count = 2;
 329                                 else
 330                                   {
 331                                     mapped_uc[2] = mapped_in_rule[2];
 332                                     mapped_count = 3;
 333                                   }
 334                               }
 335                           }
 336                         goto found_mapping;
 337                       }
 338                   }
 339
 340                 /* Optimization: Save a hash table lookup in the next round.  */
 341                 if (!rule->has_next)
 342                   break;
 343               }
 344           }
 345
 346         /* No special-cased mapping.  So use the locale and context independent
 347            mapping.  */
 348         mapped_uc[0] = single_character_map (uc);
 349         mapped_count = 1;
 350
 351        found_mapping:
 352         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
 353         {
 354           unsigned int i;
 355
 356           for (i = 0; i < mapped_count; i++)
 357             {
 358               ucs4_t muc = mapped_uc[i];
 359
 360               /* Append muc to the result accumulator.  */
 361               if (length < allocated)
 362                 {
 363                   int ret = U_UCTOMB (result + length, muc, allocated - length);
 364                   if (ret == -1)
 365                     {
 366                       errno = EINVAL;
 367                       goto fail1;
 368                     }
 369                   if (ret >= 0)
 370                     {
 371                       length += ret;
 372                       goto done_appending;
 373                     }
 374                 }
 375               {
 376                 size_t old_allocated = allocated;
 377                 size_t new_allocated = 2 * old_allocated;
 378                 if (new_allocated < 64)
 379                   new_allocated = 64;
 380                 if (new_allocated < old_allocated) /* integer overflow? */
 381                   abort ();
 382                 {
 383                   UNIT *larger_result;
 384                   if (result == NULL)
 385                     {
 386                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 387                       if (larger_result == NULL)
 388                         {
 389                           errno = ENOMEM;
 390                           goto fail1;
 391                         }
 392                     }
 393                   else if (result == resultbuf)
 394                     {
 395                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 396                       if (larger_result == NULL)
 397                         {
 398                           errno = ENOMEM;
 399                           goto fail1;
 400                         }
 401                       U_CPY (larger_result, resultbuf, length);
 402                     }
 403                   else
 404                     {
 405                       larger_result =
 406                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
 407                       if (larger_result == NULL)
 408                         {
 409                           errno = ENOMEM;
 410                           goto fail1;
 411                         }
 412                     }
 413                   result = larger_result;
 414                   allocated = new_allocated;
 415                   {
 416                     int ret = U_UCTOMB (result + length, muc, allocated - length);
 417                     if (ret == -1)
 418                       {
 419                         errno = EINVAL;
 420                         goto fail1;
 421                       }
 422                     if (ret < 0)
 423                       abort ();
 424                     length += ret;
 425                     goto done_appending;
 426                   }
 427                 }
 428               }
 429              done_appending: ;
 430             }
 431         }
 432
 433         if (!is_case_ignorable (uc))
 434           last_char_except_ignorable = uc;
 435
 436         {
 437           int ccc = uc_combining_class (uc);
 438           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 439             last_char_normal_or_above = uc;
 440         }
 441
 442         s += count;
 443         wp += count;
 444       }
 445   }
 446
 447   free (wordbreaks);
 448
 449   if (nf != NULL)
 450     {
 451       /* Finally, normalize the result.  */
 452       UNIT *normalized_result;
 453
 454       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
 455       if (normalized_result == NULL)
 456         goto fail2;
 457
 458       free (result);
 459       return normalized_result;
 460     }
 461
 462   if (length == 0)
 463     {
 464       if (result == NULL)
 465         {
 466           /* Return a non-NULL value.  NULL means error.  */
 467           result = (UNIT *) malloc (1);
 468           if (result == NULL)
 469             {
 470               errno = ENOMEM;
 471               goto fail2;
 472             }
 473         }
 474     }
 475   else if (result != resultbuf && length < allocated)
 476     {
 477       /* Shrink the allocated memory if possible.  */
 478       UNIT *memory;
 479
 480       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
 481       if (memory != NULL)
 482         result = memory;
 483     }
 484
 485   *lengthp = length;
 486   return result;
 487
 488  fail1:
 489   {
 490     int saved_errno = errno;
 491     free (wordbreaks);
 492     errno = saved_errno;
 493   }
 494  fail2:
 495   if (result != resultbuf)
 496     {
 497       int saved_errno = errno;
 498       free (result);
 499       errno = saved_errno;
 500     }
 501   return NULL;
 502 }