181e569bc0662779da727aef805740560aec6e5c
[gnulib.git] / lib / unicase / u-ct-totitle.h
1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
2    Copyright (C) 2009 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU Lesser General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 /* Quoting the Unicode standard, section "Default Case Algorithms":
19      Find the word boundaries in X according to Unicode Standard Annex #29,
20      “Text Boundaries.” For each word boundary, find the first cased character
21      F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
22      then map all characters C between F and the following word boundary to
23      Lowercase_Mapping(C).  */
24
/* Converts the string S of N units to title case and returns the result.

   For each word segment (as delimited by the array that U_WORDBREAKS fills
   in): characters before the first cased character are emitted unchanged,
   the first cased character is mapped through the title mapping, and every
   character after it up to the next word boundary is mapped through the
   lower mapping.  For characters below 0x10000 the special-casing table is
   consulted first (filtered by ISO639_LANGUAGE and by the rule's context
   condition); when no rule applies, uc_totitle / uc_tolower is used.

   PREFIX_CONTEXT seeds the two "last character seen" trackers, and
   SUFFIX_CONTEXT is consulted when a look-ahead condition reaches the end
   of S without being decided.
   If NF is non-NULL, the mapped string is passed through U_NORMALIZE, whose
   return value is returned.
   RESULTBUF / LENGTHP: when NF is NULL and RESULTBUF is non-NULL, *LENGTHP
   is taken as RESULTBUF's capacity in units and the result is built there
   as long as it fits; otherwise the result lives in malloc'ed memory.  On
   success in the NF == NULL case, *LENGTHP receives the result length in
   units.  In the NF != NULL case RESULTBUF and LENGTHP are handed to
   U_NORMALIZE (presumably with the same meaning -- confirm against its
   documentation).

   Returns NULL on failure, with errno set: ENOMEM when an allocation here
   fails, EINVAL when U_UCTOMB returns -1; when U_NORMALIZE fails, errno is
   whatever it left behind.

   NOTE(review): FUNC, UNIT and the U_* names are not defined in this file;
   presumably the including source file defines them as macros for one of
   the UTF-8/UTF-16/UTF-32 variants -- confirm.  */
25 UNIT *
26 FUNC (const UNIT *s, size_t n,
27       casing_prefix_context_t prefix_context,
28       casing_suffix_context_t suffix_context,
29       const char *iso639_language,
30       uninorm_t nf,
31       UNIT *resultbuf, size_t *lengthp)
32 {
33   /* The result being accumulated.  */
34   UNIT *result;
35   size_t length;
36   size_t allocated;
37   /* An array containing the word break positions.  */
38   char *wordbreaks;
39
40   /* Initialize the accumulator.  */
     /* With normalization requested, resultbuf is handed to U_NORMALIZE
        further down, so the intermediate string is never built in it.  */
41   if (nf != NULL || resultbuf == NULL)
42     {
43       result = NULL;
44       allocated = 0;
45     }
46   else
47     {
48       result = resultbuf;
49       allocated = *lengthp;
50     }
51   length = 0;
52
53   /* Initialize the word breaks array.  */
     /* One byte per unit of s.  For the empty string nothing is allocated;
        the main loop below does not run then, so wp is never dereferenced.  */
54   if (n > 0)
55     {
56       wordbreaks = (char *) malloc (n);
57       if (wordbreaks == NULL)
58         {
59           errno = ENOMEM;
             /* Nothing to free in wordbreaks, hence fail2 rather than
                fail1.  */
60           goto fail2;
61         }
62       U_WORDBREAKS (s, n, wordbreaks);
63     }
64   else
65     wordbreaks = NULL;
66
67   {
68     const UNIT *s_end = s + n;
       /* wp walks wordbreaks in lock-step with s: both advance by count at
          the bottom of the loop.  */
69     const char *wp = wordbreaks;
70
71     /* When considering the string as segmented by word boundaries: For each
72        such segment:
73         - In the first part, we are searching for the first cased character.
74           In this state, in_word_first_part = true, and no conversion takes
75           place.
76         - In the second part, we are converting every character: the first
77           among these characters to title case, the other ones to lower case.
78           In this state, in_word_first_part = false.  */
79     bool in_word_first_part = true;
80
81     /* Helper for evaluating the FINAL_SIGMA condition:
82        Last character that was not case-ignorable.  */
83     ucs4_t last_char_except_ignorable =
84       prefix_context.last_char_except_ignorable;
85
86     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87        Last character that was of combining class 230 ("Above") or 0.  */
88     ucs4_t last_char_normal_or_above =
89       prefix_context.last_char_normal_or_above;
90
91     while (s < s_end)
92       {
93         /* Fetch the next character.  */
94         ucs4_t uc;
95         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
96
97         ucs4_t (*single_character_map) (ucs4_t);
98         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
99
           /* Receives the mapping of uc: 0 to 3 characters.  */
100         ucs4_t mapped_uc[3];
101         unsigned int mapped_count;
102
            /* NOTE(review): a nonzero entry presumably marks a word boundary
               immediately before the current unit -- confirm against
               U_WORDBREAKS.  */
103         if (*wp)
104           /* Crossing a word boundary.  */
105           in_word_first_part = true;
106
107         /* Determine single_character_map, offset_in_rule.
108            There are three possibilities:
109              - uc should not be converted.
110              - uc should be titlecased.
111              - uc should be lowercased.  */
112         if (in_word_first_part)
113           {
114             if (uc_is_cased (uc))
115               {
116                 /* uc is to be titlecased.  */
117                 single_character_map = uc_totitle;
118                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
                    /* Every following character, up to the next word
                       boundary, takes the lowercase branch.  */
119                 in_word_first_part = false;
120               }
121             else
122               {
123                 /* uc is not converted.  */
124                 single_character_map = NULL;
                    /* Not read on this path: the NULL map jumps to
                       found_mapping before any rule lookup.  */
125                 offset_in_rule = 0;
126               }
127           }
128         else
129           {
130             /* uc is to be lowercased.  */
131             single_character_map = uc_tolower;
132             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
133           }
134
135         /* Actually map uc.  */
136         if (single_character_map == NULL)
137           {
138             mapped_uc[0] = uc;
139             mapped_count = 1;
140             goto found_mapping;
141           }
142
            /* The lookup key below holds only two bytes of the code point, so
               characters at or above 0x10000 skip the table entirely.  */
143         if (uc < 0x10000)
144           {
145             /* Look first in the special-casing table.  */
146             char code[3];
147
148             code[0] = (uc >> 8) & 0xff;
149             code[1] = uc & 0xff;
150
                /* code[2] is a running index: successive lookups fetch
                   successive rules for the same character, until a lookup
                   fails or a rule has has_next clear.  */
151             for (code[2] = 0; ; code[2]++)
152               {
153                 const struct special_casing_rule *rule =
154                   gl_unicase_special_lookup (code, 3);
155
156                 if (rule == NULL)
157                   break;
158
159                 /* Test if the condition applies.  */
160                 /* Does the language apply?  */
                    /* A rule with an empty language matches regardless of
                       iso639_language; otherwise the first two characters
                       must agree.  */
161                 if (rule->language[0] == '\0'
162                     || (iso639_language != NULL
163                         && iso639_language[0] == rule->language[0]
164                         && iso639_language[1] == rule->language[1]))
165                   {
166                     /* Does the context apply?  */
167                     int context = rule->context;
168                     bool applies;
169
                        /* A negative context denotes the negated condition:
                           evaluate the positive form here and invert the
                           outcome after the switch.  */
170                     if (context < 0)
171                       context = - context;
172                     switch (context)
173                       {
174                       case SCC_ALWAYS:
175                         applies = true;
176                         break;
177
178                       case SCC_FINAL_SIGMA:
179                         /* "Before" condition: preceded by a sequence
180                            consisting of a cased letter and a case-ignorable
181                            sequence.
182                            "After" condition: not followed by a sequence
183                            consisting of a case-ignorable sequence and then a
184                            cased letter.  */
185                         /* Test the "before" condition.  */
186                         applies = uc_is_cased (last_char_except_ignorable);
187                         /* Test the "after" condition.  */
188                         if (applies)
189                           {
                                /* Look ahead, starting right after the
                                   current character.  */
190                             const UNIT *s2 = s + count;
191                             for (;;)
192                               {
193                                 if (s2 < s_end)
194                                   {
195                                     ucs4_t uc2;
196                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
197                                     if (uc_is_cased (uc2))
198                                       {
199                                         applies = false;
200                                         break;
201                                       }
202                                     if (!uc_is_case_ignorable (uc2))
203                                       break;
204                                     s2 += count2;
205                                   }
206                                 else
207                                   {
                                        /* Reached the end of s undecided:
                                           the suffix context decides; the
                                           condition holds when the mask bit
                                           is clear.  */
208                                     applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
209                                     break;
210                                   }
211                               }
212                           }
213                         break;
214
215                       case SCC_AFTER_SOFT_DOTTED:
216                         /* "Before" condition: There is a Soft_Dotted character
217                            before it, with no intervening character of
218                            combining class 0 or 230 (Above).  */
219                         /* Test the "before" condition.  */
220                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
221                         break;
222
223                       case SCC_MORE_ABOVE:
224                         /* "After" condition: followed by a character of
225                            combining class 230 (Above) with no intervening
226                            character of combining class 0 or 230 (Above).  */
227                         /* Test the "after" condition.  */
228                         {
229                           const UNIT *s2 = s + count;
230                           applies = false;
231                           for (;;)
232                             {
233                               if (s2 < s_end)
234                                 {
235                                   ucs4_t uc2;
236                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
237                                   int ccc = uc_combining_class (uc2);
238                                   if (ccc == UC_CCC_A)
239                                     {
240                                       applies = true;
241                                       break;
242                                     }
243                                   if (ccc == UC_CCC_NR)
244                                     break;
245                                   s2 += count2;
246                                 }
247                               else
248                                 {
                                      /* Reached the end of s undecided: the
                                         condition holds when the suffix
                                         context has the mask bit set.  */
249                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
250                                   break;
251                                 }
252                             }
253                         }
254                         break;
255
256                       case SCC_BEFORE_DOT:
257                         /* "After" condition: followed by COMBINING DOT ABOVE
258                            (U+0307). Any sequence of characters with a
259                            combining class that is neither 0 nor 230 may
260                            intervene between the current character and the
261                            combining dot above.  */
262                         /* Test the "after" condition.  */
263                         {
264                           const UNIT *s2 = s + count;
265                           applies = false;
266                           for (;;)
267                             {
268                               if (s2 < s_end)
269                                 {
270                                   ucs4_t uc2;
271                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
272                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
273                                     {
274                                       applies = true;
275                                       break;
276                                     }
277                                   {
278                                     int ccc = uc_combining_class (uc2);
279                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
280                                       break;
281                                   }
282                                   s2 += count2;
283                                 }
284                               else
285                                 {
                                      /* Reached the end of s undecided: the
                                         condition holds when the suffix
                                         context has the mask bit set.  */
286                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
287                                   break;
288                                 }
289                             }
290                         }
291                         break;
292
293                       case SCC_AFTER_I:
294                         /* "Before" condition: There is an uppercase I before
295                            it, and there is no intervening character of
296                            combining class 0 or 230 (Above).  */
297                         /* Test the "before" condition.  */
298                         applies = (last_char_normal_or_above == 'I');
299                         break;
300
301                       default:
                            /* A context value outside the known set is
                               treated as a programming error.  */
302                         abort ();
303                       }
304                     if (rule->context < 0)
305                       applies = !applies;
306
307                     if (applies)
308                       {
309                         /* The rule applies.
310                            Look up the mapping (0 to 3 characters).  */
                            /* offset_in_rule selects the title[] or lower[]
                               member of the rule; its entries are read as
                               unsigned short, and a zero entry ends the
                               list.  */
311                         const unsigned short *mapped_in_rule =
312                           (const unsigned short *)((const char *)rule + offset_in_rule);
313
                            /* A leading zero means an empty mapping: nothing
                               is emitted for uc.  */
314                         if (mapped_in_rule[0] == 0)
315                           mapped_count = 0;
316                         else
317                           {
318                             mapped_uc[0] = mapped_in_rule[0];
319                             if (mapped_in_rule[1] == 0)
320                               mapped_count = 1;
321                             else
322                               {
323                                 mapped_uc[1] = mapped_in_rule[1];
324                                 if (mapped_in_rule[2] == 0)
325                                   mapped_count = 2;
326                                 else
327                                   {
328                                     mapped_uc[2] = mapped_in_rule[2];
329                                     mapped_count = 3;
330                                   }
331                               }
332                           }
333                         goto found_mapping;
334                       }
335                   }
336
337                 /* Optimization: Save a hash table lookup in the next round.  */
338                 if (!rule->has_next)
339                   break;
340               }
341           }
342
343         /* No special-cased mapping.  So use the locale and context independent
344            mapping.  */
345         mapped_uc[0] = single_character_map (uc);
346         mapped_count = 1;
347
348        found_mapping:
349         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
350         {
351           unsigned int i;
352
353           for (i = 0; i < mapped_count; i++)
354             {
355               ucs4_t muc = mapped_uc[i];
356
357               /* Append muc to the result accumulator.  */
                  /* First try the space that is already there.  */
358               if (length < allocated)
359                 {
360                   int ret = U_UCTOMB (result + length, muc, allocated - length);
361                   if (ret == -1)
362                     {
363                       errno = EINVAL;
364                       goto fail1;
365                     }
366                   if (ret >= 0)
367                     {
368                       length += ret;
369                       goto done_appending;
370                     }
                      /* Any other negative return falls through to the
                         growth path (presumably it means "not enough room"
                         -- confirm against U_UCTOMB).  */
371                 }
                  /* Grow the accumulator: double it, with a floor of 64
                     units.  */
372               {
373                 size_t old_allocated = allocated;
374                 size_t new_allocated = 2 * old_allocated;
375                 if (new_allocated < 64)
376                   new_allocated = 64;
377                 if (new_allocated < old_allocated) /* integer overflow? */
378                   abort ();
379                 {
380                   UNIT *larger_result;
381                   if (result == NULL)
382                     {
                          /* Nothing accumulated yet: plain allocation.  */
383                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
384                       if (larger_result == NULL)
385                         {
386                           errno = ENOMEM;
387                           goto fail1;
388                         }
389                     }
390                   else if (result == resultbuf)
391                     {
                          /* Outgrowing the caller's buffer: it must not be
                             passed to realloc, so allocate fresh memory and
                             copy what has been written so far.  */
392                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
393                       if (larger_result == NULL)
394                         {
395                           errno = ENOMEM;
396                           goto fail1;
397                         }
398                       U_CPY (larger_result, resultbuf, length);
399                     }
400                   else
401                     {
                          /* Already on the heap.  On failure result is left
                             untouched and is released at fail2.  */
402                       larger_result =
403                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
404                       if (larger_result == NULL)
405                         {
406                           errno = ENOMEM;
407                           goto fail1;
408                         }
409                     }
410                   result = larger_result;
411                   allocated = new_allocated;
412                   {
                        /* Retry in the enlarged buffer; a negative return
                           other than -1 is not expected here and aborts.  */
413                     int ret = U_UCTOMB (result + length, muc, allocated - length);
414                     if (ret == -1)
415                       {
416                         errno = EINVAL;
417                         goto fail1;
418                       }
419                     if (ret < 0)
420                       abort ();
421                     length += ret;
422                     goto done_appending;
423                   }
424                 }
425               }
426              done_appending: ;
427             }
428         }
429
            /* Update the two context trackers with the original character uc
               (not its mapping) for use by the following characters.  */
430         if (!uc_is_case_ignorable (uc))
431           last_char_except_ignorable = uc;
432
433         {
434           int ccc = uc_combining_class (uc);
435           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
436             last_char_normal_or_above = uc;
437         }
438
439         s += count;
440         wp += count;
441       }
442   }
443
      /* wordbreaks is released here, so every failure below goes to fail2
         rather than fail1.  */
444   free (wordbreaks);
445
446   if (nf != NULL)
447     {
448       /* Finally, normalize the result.  */
449       UNIT *normalized_result;
450
451       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
452       if (normalized_result == NULL)
453         goto fail2;
454
          /* The intermediate string is no longer needed; in this branch it
             is always heap memory or NULL, never resultbuf.  */
455       free (result);
456       return normalized_result;
457     }
458
459   if (length == 0)
460     {
461       if (result == NULL)
462         {
463           /* Return a non-NULL value.  NULL means error.  */
464           result = (UNIT *) malloc (1);
465           if (result == NULL)
466             {
467               errno = ENOMEM;
468               goto fail2;
469             }
470         }
471     }
472   else if (result != resultbuf && length < allocated)
473     {
474       /* Shrink the allocated memory if possible.  */
475       UNIT *memory;
476
          /* A failed shrink is harmless: the larger block stays in use.  */
477       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
478       if (memory != NULL)
479         result = memory;
480     }
481
482   *lengthp = length;
483   return result;
484
      /* fail1: wordbreaks is still allocated.  fail2: it is NULL or already
         freed.  Both paths keep errno intact across the free calls.  */
485  fail1:
486   {
487     int saved_errno = errno;
488     free (wordbreaks);
489     errno = saved_errno;
490   }
491  fail2:
      /* Release the accumulator unless it is the caller's own buffer.  */
492   if (result != resultbuf)
493     {
494       int saved_errno = errno;
495       free (result);
496       errno = saved_errno;
497     }
498   return NULL;
499 }