maint: update copyright
[gnulib.git] / lib / unicase / u-casemap.h
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
   Copyright (C) 2009-2014 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 UNIT *
19 FUNC (const UNIT *s, size_t n,
20       casing_prefix_context_t prefix_context,
21       casing_suffix_context_t suffix_context,
22       const char *iso639_language,
23       ucs4_t (*single_character_map) (ucs4_t),
24       size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
25       uninorm_t nf,
26       UNIT *resultbuf, size_t *lengthp)
27 {
28   /* The result being accumulated.  */
29   UNIT *result;
30   size_t length;
31   size_t allocated;
32
33   /* Initialize the accumulator.  */
34   if (nf != NULL || resultbuf == NULL)
35     {
36       result = NULL;
37       allocated = 0;
38     }
39   else
40     {
41       result = resultbuf;
42       allocated = *lengthp;
43     }
44   length = 0;
45
46   {
47     const UNIT *s_end = s + n;
48
49     /* Helper for evaluating the FINAL_SIGMA condition:
50        Last character that was not case-ignorable.  */
51     ucs4_t last_char_except_ignorable =
52       prefix_context.last_char_except_ignorable;
53
54     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
55        Last character that was of combining class 230 ("Above") or 0.  */
56     ucs4_t last_char_normal_or_above =
57       prefix_context.last_char_normal_or_above;
58
59     while (s < s_end)
60       {
61         ucs4_t uc;
62         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
63
64         ucs4_t mapped_uc[3];
65         unsigned int mapped_count;
66
67         if (uc < 0x10000)
68           {
69             /* Look first in the special-casing table.  */
70             char code[3];
71
72             code[0] = (uc >> 8) & 0xff;
73             code[1] = uc & 0xff;
74
75             for (code[2] = 0; ; code[2]++)
76               {
77                 const struct special_casing_rule *rule =
78                   gl_unicase_special_lookup (code, 3);
79
80                 if (rule == NULL)
81                   break;
82
83                 /* Test if the condition applies.  */
84                 /* Does the language apply?  */
85                 if (rule->language[0] == '\0'
86                     || (iso639_language != NULL
87                         && iso639_language[0] == rule->language[0]
88                         && iso639_language[1] == rule->language[1]))
89                   {
90                     /* Does the context apply?  */
91                     int context = rule->context;
92                     bool applies;
93
94                     if (context < 0)
95                       context = - context;
96                     switch (context)
97                       {
98                       case SCC_ALWAYS:
99                         applies = true;
100                         break;
101
102                       case SCC_FINAL_SIGMA:
103                         /* "Before" condition: preceded by a sequence
104                            consisting of a cased letter and a case-ignorable
105                            sequence.
106                            "After" condition: not followed by a sequence
107                            consisting of a case-ignorable sequence and then a
108                            cased letter.  */
109                         /* Test the "before" condition.  */
110                         applies = uc_is_cased (last_char_except_ignorable);
111                         /* Test the "after" condition.  */
112                         if (applies)
113                           {
114                             const UNIT *s2 = s + count;
115                             for (;;)
116                               {
117                                 if (s2 < s_end)
118                                   {
119                                     ucs4_t uc2;
120                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
121                                     /* Our uc_is_case_ignorable function is
122                                        known to return false for all cased
123                                        characters.  So we can call
124                                        uc_is_case_ignorable first.  */
125                                     if (!uc_is_case_ignorable (uc2))
126                                       {
127                                         applies = ! uc_is_cased (uc2);
128                                         break;
129                                       }
130                                     s2 += count2;
131                                   }
132                                 else
133                                   {
134                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
135                                     break;
136                                   }
137                               }
138                           }
139                         break;
140
141                       case SCC_AFTER_SOFT_DOTTED:
142                         /* "Before" condition: There is a Soft_Dotted character
143                            before it, with no intervening character of
144                            combining class 0 or 230 (Above).  */
145                         /* Test the "before" condition.  */
146                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
147                         break;
148
149                       case SCC_MORE_ABOVE:
150                         /* "After" condition: followed by a character of
151                            combining class 230 (Above) with no intervening
152                            character of combining class 0 or 230 (Above).  */
153                         /* Test the "after" condition.  */
154                         {
155                           const UNIT *s2 = s + count;
156                           applies = false;
157                           for (;;)
158                             {
159                               if (s2 < s_end)
160                                 {
161                                   ucs4_t uc2;
162                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
163                                   int ccc = uc_combining_class (uc2);
164                                   if (ccc == UC_CCC_A)
165                                     {
166                                       applies = true;
167                                       break;
168                                     }
169                                   if (ccc == UC_CCC_NR)
170                                     break;
171                                   s2 += count2;
172                                 }
173                               else
174                                 {
175                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
176                                   break;
177                                 }
178                             }
179                         }
180                         break;
181
182                       case SCC_BEFORE_DOT:
183                         /* "After" condition: followed by COMBINING DOT ABOVE
184                            (U+0307). Any sequence of characters with a
185                            combining class that is neither 0 nor 230 may
186                            intervene between the current character and the
187                            combining dot above.  */
188                         /* Test the "after" condition.  */
189                         {
190                           const UNIT *s2 = s + count;
191                           applies = false;
192                           for (;;)
193                             {
194                               if (s2 < s_end)
195                                 {
196                                   ucs4_t uc2;
197                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
198                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
199                                     {
200                                       applies = true;
201                                       break;
202                                     }
203                                   {
204                                     int ccc = uc_combining_class (uc2);
205                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
206                                       break;
207                                   }
208                                   s2 += count2;
209                                 }
210                               else
211                                 {
212                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
213                                   break;
214                                 }
215                             }
216                         }
217                         break;
218
219                       case SCC_AFTER_I:
220                         /* "Before" condition: There is an uppercase I before
221                            it, and there is no intervening character of
222                            combining class 0 or 230 (Above).  */
223                         /* Test the "before" condition.  */
224                         applies = (last_char_normal_or_above == 'I');
225                         break;
226
227                       default:
228                         abort ();
229                       }
230                     if (rule->context < 0)
231                       applies = !applies;
232
233                     if (applies)
234                       {
235                         /* The rule applies.
236                            Look up the mapping (0 to 3 characters).  */
237                         const unsigned short *mapped_in_rule =
238                           (const unsigned short *)((const char *)rule + offset_in_rule);
239
240                         if (mapped_in_rule[0] == 0)
241                           mapped_count = 0;
242                         else
243                           {
244                             mapped_uc[0] = mapped_in_rule[0];
245                             if (mapped_in_rule[1] == 0)
246                               mapped_count = 1;
247                             else
248                               {
249                                 mapped_uc[1] = mapped_in_rule[1];
250                                 if (mapped_in_rule[2] == 0)
251                                   mapped_count = 2;
252                                 else
253                                   {
254                                     mapped_uc[2] = mapped_in_rule[2];
255                                     mapped_count = 3;
256                                   }
257                               }
258                           }
259                         goto found_mapping;
260                       }
261                   }
262
263                 /* Optimization: Save a hash table lookup in the next round.  */
264                 if (!rule->has_next)
265                   break;
266               }
267           }
268
269         /* No special-cased mapping.  So use the locale and context independent
270            mapping.  */
271         mapped_uc[0] = single_character_map (uc);
272         mapped_count = 1;
273
274        found_mapping:
275         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
276         {
277           unsigned int i;
278
279           for (i = 0; i < mapped_count; i++)
280             {
281               ucs4_t muc = mapped_uc[i];
282
283               /* Append muc to the result accumulator.  */
284               if (length < allocated)
285                 {
286                   int ret = U_UCTOMB (result + length, muc, allocated - length);
287                   if (ret == -1)
288                     {
289                       errno = EINVAL;
290                       goto fail;
291                     }
292                   if (ret >= 0)
293                     {
294                       length += ret;
295                       goto done_appending;
296                     }
297                 }
298               {
299                 size_t old_allocated = allocated;
300                 size_t new_allocated = 2 * old_allocated;
301                 if (new_allocated < 64)
302                   new_allocated = 64;
303                 if (new_allocated < old_allocated) /* integer overflow? */
304                   abort ();
305                 {
306                   UNIT *larger_result;
307                   if (result == NULL)
308                     {
309                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
310                       if (larger_result == NULL)
311                         {
312                           errno = ENOMEM;
313                           goto fail;
314                         }
315                     }
316                   else if (result == resultbuf)
317                     {
318                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
319                       if (larger_result == NULL)
320                         {
321                           errno = ENOMEM;
322                           goto fail;
323                         }
324                       U_CPY (larger_result, resultbuf, length);
325                     }
326                   else
327                     {
328                       larger_result =
329                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
330                       if (larger_result == NULL)
331                         {
332                           errno = ENOMEM;
333                           goto fail;
334                         }
335                     }
336                   result = larger_result;
337                   allocated = new_allocated;
338                   {
339                     int ret = U_UCTOMB (result + length, muc, allocated - length);
340                     if (ret == -1)
341                       {
342                         errno = EINVAL;
343                         goto fail;
344                       }
345                     if (ret < 0)
346                       abort ();
347                     length += ret;
348                     goto done_appending;
349                   }
350                 }
351               }
352              done_appending: ;
353             }
354         }
355
356         if (!uc_is_case_ignorable (uc))
357           last_char_except_ignorable = uc;
358
359         {
360           int ccc = uc_combining_class (uc);
361           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
362             last_char_normal_or_above = uc;
363         }
364
365         s += count;
366       }
367   }
368
369   if (nf != NULL)
370     {
371       /* Finally, normalize the result.  */
372       UNIT *normalized_result;
373
374       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
375       if (normalized_result == NULL)
376         goto fail;
377
378       free (result);
379       return normalized_result;
380     }
381
382   if (length == 0)
383     {
384       if (result == NULL)
385         {
386           /* Return a non-NULL value.  NULL means error.  */
387           result = (UNIT *) malloc (1);
388           if (result == NULL)
389             {
390               errno = ENOMEM;
391               goto fail;
392             }
393         }
394     }
395   else if (result != resultbuf && length < allocated)
396     {
397       /* Shrink the allocated memory if possible.  */
398       UNIT *memory;
399
400       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
401       if (memory != NULL)
402         result = memory;
403     }
404
405   *lengthp = length;
406   return result;
407
408  fail:
409   if (result != resultbuf)
410     {
411       int saved_errno = errno;
412       free (result);
413       errno = saved_errno;
414     }
415   return NULL;
416 }