Add context arguments to u*_casemap functions.
[gnulib.git] / lib / unicase / u-casemap.h
1 /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2    Copyright (C) 2009 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU Lesser General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 UNIT *
19 FUNC (const UNIT *s, size_t n,
20       casing_prefix_context_t prefix_context,
21       casing_suffix_context_t suffix_context,
22       const char *iso639_language,
23       ucs4_t (*single_character_map) (ucs4_t),
24       size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
25       uninorm_t nf,
26       UNIT *resultbuf, size_t *lengthp)
27 {
28   /* The result being accumulated.  */
29   UNIT *result;
30   size_t length;
31   size_t allocated;
32
33   /* Initialize the accumulator.  */
34   if (nf != NULL || resultbuf == NULL)
35     {
36       result = NULL;
37       allocated = 0;
38     }
39   else
40     {
41       result = resultbuf;
42       allocated = *lengthp;
43     }
44   length = 0;
45
46   {
47     const UNIT *s_end = s + n;
48
49     /* Helper for evaluating the FINAL_SIGMA condition:
50        Last character that was not case-ignorable.  */
51     ucs4_t last_char_except_ignorable =
52       prefix_context.last_char_except_ignorable;
53
54     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
55        Last character that was of combining class 230 ("Above") or 0.  */
56     ucs4_t last_char_normal_or_above =
57       prefix_context.last_char_normal_or_above;
58
59     while (s < s_end)
60       {
61         ucs4_t uc;
62         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
63
64         ucs4_t mapped_uc[3];
65         unsigned int mapped_count;
66
67         if (uc < 0x10000)
68           {
69             /* Look first in the special-casing table.  */
70             char code[3];
71
72             code[0] = (uc >> 8) & 0xff;
73             code[1] = uc & 0xff;
74
75             for (code[2] = 0; ; code[2]++)
76               {
77                 const struct special_casing_rule *rule =
78                   gl_unicase_special_lookup (code, 3);
79
80                 if (rule == NULL)
81                   break;
82
83                 /* Test if the condition applies.  */
84                 /* Does the language apply?  */
85                 if (rule->language[0] == '\0'
86                     || (iso639_language != NULL
87                         && iso639_language[0] == rule->language[0]
88                         && iso639_language[1] == rule->language[1]))
89                   {
90                     /* Does the context apply?  */
91                     int context = rule->context;
92                     bool applies;
93
94                     if (context < 0)
95                       context = - context;
96                     switch (context)
97                       {
98                       case SCC_ALWAYS:
99                         applies = true;
100                         break;
101
102                       case SCC_FINAL_SIGMA:
103                         /* "Before" condition: preceded by a sequence
104                            consisting of a cased letter and a case-ignorable
105                            sequence.
106                            "After" condition: not followed by a sequence
107                            consisting of a case-ignorable sequence and then a
108                            cased letter.  */
109                         /* Test the "before" condition.  */
110                         applies = uc_is_cased (last_char_except_ignorable);
111                         /* Test the "after" condition.  */
112                         if (applies)
113                           {
114                             const UNIT *s2 = s + count;
115                             for (;;)
116                               {
117                                 if (s2 < s_end)
118                                   {
119                                     ucs4_t uc2;
120                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
121                                     if (uc_is_cased (uc2))
122                                       {
123                                         applies = false;
124                                         break;
125                                       }
126                                     if (!uc_is_case_ignorable (uc2))
127                                       break;
128                                     s2 += count2;
129                                   }
130                                 else
131                                   {
132                                     applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
133                                     break;
134                                   }
135                               }
136                           }
137                         break;
138
139                       case SCC_AFTER_SOFT_DOTTED:
140                         /* "Before" condition: There is a Soft_Dotted character
141                            before it, with no intervening character of
142                            combining class 0 or 230 (Above).  */
143                         /* Test the "before" condition.  */
144                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
145                         break;
146
147                       case SCC_MORE_ABOVE:
148                         /* "After" condition: followed by a character of
149                            combining class 230 (Above) with no intervening
150                            character of combining class 0 or 230 (Above).  */
151                         /* Test the "after" condition.  */
152                         {
153                           const UNIT *s2 = s + count;
154                           applies = false;
155                           for (;;)
156                             {
157                               if (s2 < s_end)
158                                 {
159                                   ucs4_t uc2;
160                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
161                                   int ccc = uc_combining_class (uc2);
162                                   if (ccc == UC_CCC_A)
163                                     {
164                                       applies = true;
165                                       break;
166                                     }
167                                   if (ccc == UC_CCC_NR)
168                                     break;
169                                   s2 += count2;
170                                 }
171                               else
172                                 {
173                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
174                                   break;
175                                 }
176                             }
177                         }
178                         break;
179
180                       case SCC_BEFORE_DOT:
181                         /* "After" condition: followed by COMBINING DOT ABOVE
182                            (U+0307). Any sequence of characters with a
183                            combining class that is neither 0 nor 230 may
184                            intervene between the current character and the
185                            combining dot above.  */
186                         /* Test the "after" condition.  */
187                         {
188                           const UNIT *s2 = s + count;
189                           applies = false;
190                           for (;;)
191                             {
192                               if (s2 < s_end)
193                                 {
194                                   ucs4_t uc2;
195                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
196                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
197                                     {
198                                       applies = true;
199                                       break;
200                                     }
201                                   {
202                                     int ccc = uc_combining_class (uc2);
203                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
204                                       break;
205                                   }
206                                   s2 += count2;
207                                 }
208                               else
209                                 {
210                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
211                                   break;
212                                 }
213                             }
214                         }
215                         break;
216
217                       case SCC_AFTER_I:
218                         /* "Before" condition: There is an uppercase I before
219                            it, and there is no intervening character of
220                            combining class 0 or 230 (Above).  */
221                         /* Test the "before" condition.  */
222                         applies = (last_char_normal_or_above == 'I');
223                         break;
224
225                       default:
226                         abort ();
227                       }
228                     if (rule->context < 0)
229                       applies = !applies;
230
231                     if (applies)
232                       {
233                         /* The rule applies.
234                            Look up the mapping (0 to 3 characters).  */
235                         const unsigned short *mapped_in_rule =
236                           (const unsigned short *)((const char *)rule + offset_in_rule);
237
238                         if (mapped_in_rule[0] == 0)
239                           mapped_count = 0;
240                         else
241                           {
242                             mapped_uc[0] = mapped_in_rule[0];
243                             if (mapped_in_rule[1] == 0)
244                               mapped_count = 1;
245                             else
246                               {
247                                 mapped_uc[1] = mapped_in_rule[1];
248                                 if (mapped_in_rule[2] == 0)
249                                   mapped_count = 2;
250                                 else
251                                   {
252                                     mapped_uc[2] = mapped_in_rule[2];
253                                     mapped_count = 3;
254                                   }
255                               }
256                           }
257                         goto found_mapping;
258                       }
259                   }
260
261                 /* Optimization: Save a hash table lookup in the next round.  */
262                 if (!rule->has_next)
263                   break;
264               }
265           }
266
267         /* No special-cased mapping.  So use the locale and context independent
268            mapping.  */
269         mapped_uc[0] = single_character_map (uc);
270         mapped_count = 1;
271
272        found_mapping:
273         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
274         {
275           unsigned int i;
276
277           for (i = 0; i < mapped_count; i++)
278             {
279               ucs4_t muc = mapped_uc[i];
280
281               /* Append muc to the result accumulator.  */
282               if (length < allocated)
283                 {
284                   int ret = U_UCTOMB (result + length, muc, allocated - length);
285                   if (ret == -1)
286                     {
287                       errno = EINVAL;
288                       goto fail;
289                     }
290                   if (ret >= 0)
291                     {
292                       length += ret;
293                       goto done_appending;
294                     }
295                 }
296               {
297                 size_t old_allocated = allocated;
298                 size_t new_allocated = 2 * old_allocated;
299                 if (new_allocated < 64)
300                   new_allocated = 64;
301                 if (new_allocated < old_allocated) /* integer overflow? */
302                   abort ();
303                 {
304                   UNIT *larger_result;
305                   if (result == NULL)
306                     {
307                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
308                       if (larger_result == NULL)
309                         {
310                           errno = ENOMEM;
311                           goto fail;
312                         }
313                     }
314                   else if (result == resultbuf)
315                     {
316                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
317                       if (larger_result == NULL)
318                         {
319                           errno = ENOMEM;
320                           goto fail;
321                         }
322                       U_CPY (larger_result, resultbuf, length);
323                     }
324                   else
325                     {
326                       larger_result =
327                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
328                       if (larger_result == NULL)
329                         {
330                           errno = ENOMEM;
331                           goto fail;
332                         }
333                     }
334                   result = larger_result;
335                   allocated = new_allocated;
336                   {
337                     int ret = U_UCTOMB (result + length, muc, allocated - length);
338                     if (ret == -1)
339                       {
340                         errno = EINVAL;
341                         goto fail;
342                       }
343                     if (ret < 0)
344                       abort ();
345                     length += ret;
346                     goto done_appending;
347                   }
348                 }
349               }
350              done_appending: ;
351             }
352         }
353
354         if (!uc_is_case_ignorable (uc))
355           last_char_except_ignorable = uc;
356
357         {
358           int ccc = uc_combining_class (uc);
359           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
360             last_char_normal_or_above = uc;
361         }
362
363         s += count;
364       }
365   }
366
367   if (nf != NULL)
368     {
369       /* Finally, normalize the result.  */
370       UNIT *normalized_result;
371
372       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
373       if (normalized_result == NULL)
374         goto fail;
375
376       free (result);
377       return normalized_result;
378     }
379
380   if (length == 0)
381     {
382       if (result == NULL)
383         {
384           /* Return a non-NULL value.  NULL means error.  */
385           result = (UNIT *) malloc (1);
386           if (result == NULL)
387             {
388               errno = ENOMEM;
389               goto fail;
390             }
391         }
392     }
393   else if (result != resultbuf && length < allocated)
394     {
395       /* Shrink the allocated memory if possible.  */
396       UNIT *memory;
397
398       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
399       if (memory != NULL)
400         result = memory;
401     }
402
403   *lengthp = length;
404   return result;
405
406  fail:
407   if (result != resultbuf)
408     {
409       int saved_errno = errno;
410       free (result);
411       errno = saved_errno;
412     }
413   return NULL;
414 }