bb54b300454a4e790e979d95e0551e67c52487fa
[gnulib.git] / lib / uninorm / u-normalize-internal.h
1 /* Decomposition and composition of Unicode strings.
2    Copyright (C) 2009-2012 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU Lesser General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 UNIT *
19 FUNC (uninorm_t nf, const UNIT *s, size_t n,
20       UNIT *resultbuf, size_t *lengthp)
21 {
22   int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23   ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25   /* The result being accumulated.  */
26   UNIT *result;
27   size_t length;
28   size_t allocated;
29   /* The buffer for sorting.  */
30   #define SORTBUF_PREALLOCATED 64
31   struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32   struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33   size_t sortbuf_allocated;
34   size_t sortbuf_count;
35
36   /* Initialize the accumulator.  */
37   if (resultbuf == NULL)
38     {
39       result = NULL;
40       allocated = 0;
41     }
42   else
43     {
44       result = resultbuf;
45       allocated = *lengthp;
46     }
47   length = 0;
48
49   /* Initialize the buffer for sorting.  */
50   sortbuf = sortbuf_preallocated;
51   sortbuf_allocated = SORTBUF_PREALLOCATED;
52   sortbuf_count = 0;
53
54   {
55     const UNIT *s_end = s + n;
56
57     for (;;)
58       {
59         int count;
60         ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61         int decomposed_count;
62         int i;
63
64         if (s < s_end)
65           {
66             /* Fetch the next character.  */
67             count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68             decomposed_count = 1;
69
70             /* Decompose it, recursively.
71                It would be possible to precompute the recursive decomposition
72                and store it in a table.  But this would significantly increase
73                the size of the decomposition tables, because for example for
74                U+1FC1 the recursive canonical decomposition and the recursive
75                compatibility decomposition are different.  */
76             {
77               int curr;
78
79               for (curr = 0; curr < decomposed_count; )
80                 {
81                   /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82                      all elements are atomic.  */
83                   ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84                   int curr_decomposed_count;
85
86                   curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87                   if (curr_decomposed_count >= 0)
88                     {
89                       /* Move curr_decomposed[0..curr_decomposed_count-1] over
90                          decomposed[curr], making room.  It's not worth using
91                          memcpy() here, since the counts are so small.  */
92                       int shift = curr_decomposed_count - 1;
93
94                       if (shift < 0)
95                         abort ();
96                       if (shift > 0)
97                         {
98                           int j;
99
100                           decomposed_count += shift;
101                           if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102                             abort ();
103                           for (j = decomposed_count - 1 - shift; j > curr; j--)
104                             decomposed[j + shift] = decomposed[j];
105                         }
106                       for (; shift >= 0; shift--)
107                         decomposed[curr + shift] = curr_decomposed[shift];
108                     }
109                   else
110                     {
111                       /* decomposed[curr] is atomic.  */
112                       curr++;
113                     }
114                 }
115             }
116           }
117         else
118           {
119             count = 0;
120             decomposed_count = 0;
121           }
122
123         i = 0;
124         for (;;)
125           {
126             ucs4_t uc;
127             int ccc;
128
129             if (s < s_end)
130               {
131                 /* Fetch the next character from the decomposition.  */
132                 if (i == decomposed_count)
133                   break;
134                 uc = decomposed[i];
135                 ccc = uc_combining_class (uc);
136               }
137             else
138               {
139                 /* End of string reached.  */
140                 uc = 0;
141                 ccc = 0;
142               }
143
144             if (ccc == 0)
145               {
146                 size_t j;
147
148                 /* Apply the canonical ordering algorithm to the accumulated
149                    sequence of characters.  */
150                 if (sortbuf_count > 1)
151                   gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152                                                            sortbuf + sortbuf_count);
153
154                 if (composer != NULL)
155                   {
156                     /* Attempt to combine decomposed characters, as specified
157                        in the Unicode Standard Annex #15 "Unicode Normalization
158                        Forms".  We need to check
159                          1. whether the first accumulated character is a
160                             "starter" (i.e. has ccc = 0).  This is usually the
161                             case.  But when the string starts with a
162                             non-starter, the sortbuf also starts with a
163                             non-starter.  Btw, this check could also be
164                             omitted, because the composition table has only
165                             entries (code1, code2) for which code1 is a
166                             starter; if the first accumulated character is not
167                             a starter, no lookup will succeed.
168                          2. If the sortbuf has more than one character, check
169                             for each of these characters that are not "blocked"
170                             from the starter (i.e. have a ccc that is higher
171                             than the ccc of the previous character) whether it
172                             can be combined with the first character.
173                          3. If only one character is left in sortbuf, check
174                             whether it can be combined with the next character
175                             (also a starter).  */
176                     if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177                       {
178                         for (j = 1; j < sortbuf_count; )
179                           {
180                             if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181                               {
182                                 ucs4_t combined =
183                                   composer (sortbuf[0].code, sortbuf[j].code);
184                                 if (combined)
185                                   {
186                                     size_t k;
187
188                                     sortbuf[0].code = combined;
189                                     /* sortbuf[0].ccc = 0, still valid.  */
190                                     for (k = j + 1; k < sortbuf_count; k++)
191                                       sortbuf[k - 1] = sortbuf[k];
192                                     sortbuf_count--;
193                                     continue;
194                                   }
195                               }
196                             j++;
197                           }
198                         if (s < s_end && sortbuf_count == 1)
199                           {
200                             ucs4_t combined =
201                               composer (sortbuf[0].code, uc);
202                             if (combined)
203                               {
204                                 uc = combined;
205                                 ccc = 0;
206                                 /* uc could be further combined with subsequent
207                                    characters.  So don't put it into sortbuf[0] in
208                                    this round, only in the next round.  */
209                                 sortbuf_count = 0;
210                               }
211                           }
212                       }
213                   }
214
215                 for (j = 0; j < sortbuf_count; j++)
216                   {
217                     ucs4_t muc = sortbuf[j].code;
218
219                     /* Append muc to the result accumulator.  */
220                     if (length < allocated)
221                       {
222                         int ret =
223                           U_UCTOMB (result + length, muc, allocated - length);
224                         if (ret == -1)
225                           {
226                             errno = EINVAL;
227                             goto fail;
228                           }
229                         if (ret >= 0)
230                           {
231                             length += ret;
232                             goto done_appending;
233                           }
234                       }
235                     {
236                       size_t old_allocated = allocated;
237                       size_t new_allocated = 2 * old_allocated;
238                       if (new_allocated < 64)
239                         new_allocated = 64;
240                       if (new_allocated < old_allocated) /* integer overflow? */
241                         abort ();
242                       {
243                         UNIT *larger_result;
244                         if (result == NULL)
245                           {
246                             larger_result =
247                               (UNIT *) malloc (new_allocated * sizeof (UNIT));
248                             if (larger_result == NULL)
249                               {
250                                 errno = ENOMEM;
251                                 goto fail;
252                               }
253                           }
254                         else if (result == resultbuf)
255                           {
256                             larger_result =
257                               (UNIT *) malloc (new_allocated * sizeof (UNIT));
258                             if (larger_result == NULL)
259                               {
260                                 errno = ENOMEM;
261                                 goto fail;
262                               }
263                             U_CPY (larger_result, resultbuf, length);
264                           }
265                         else
266                           {
267                             larger_result =
268                               (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269                             if (larger_result == NULL)
270                               {
271                                 errno = ENOMEM;
272                                 goto fail;
273                               }
274                           }
275                         result = larger_result;
276                         allocated = new_allocated;
277                         {
278                           int ret =
279                             U_UCTOMB (result + length, muc, allocated - length);
280                           if (ret == -1)
281                             {
282                               errno = EINVAL;
283                               goto fail;
284                             }
285                           if (ret < 0)
286                             abort ();
287                           length += ret;
288                           goto done_appending;
289                         }
290                       }
291                     }
292                    done_appending: ;
293                   }
294
295                 /* sortbuf is now empty.  */
296                 sortbuf_count = 0;
297               }
298
299             if (!(s < s_end))
300               /* End of string reached.  */
301               break;
302
303             /* Append (uc, ccc) to sortbuf.  */
304             if (sortbuf_count == sortbuf_allocated)
305               {
306                 struct ucs4_with_ccc *new_sortbuf;
307
308                 sortbuf_allocated = 2 * sortbuf_allocated;
309                 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310                   abort ();
311                 new_sortbuf =
312                   (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313                 if (new_sortbuf == NULL)
314                   {
315                     errno = ENOMEM;
316                     goto fail;
317                   }
318                 memcpy (new_sortbuf, sortbuf,
319                         sortbuf_count * sizeof (struct ucs4_with_ccc));
320                 if (sortbuf != sortbuf_preallocated)
321                   free (sortbuf);
322                 sortbuf = new_sortbuf;
323               }
324             sortbuf[sortbuf_count].code = uc;
325             sortbuf[sortbuf_count].ccc = ccc;
326             sortbuf_count++;
327
328             i++;
329           }
330
331         if (!(s < s_end))
332           /* End of string reached.  */
333           break;
334
335         s += count;
336       }
337   }
338
339   if (length == 0)
340     {
341       if (result == NULL)
342         {
343           /* Return a non-NULL value.  NULL means error.  */
344           result = (UNIT *) malloc (1);
345           if (result == NULL)
346             {
347               errno = ENOMEM;
348               goto fail;
349             }
350         }
351     }
352   else if (result != resultbuf && length < allocated)
353     {
354       /* Shrink the allocated memory if possible.  */
355       UNIT *memory;
356
357       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358       if (memory != NULL)
359         result = memory;
360     }
361
362   if (sortbuf_count > 0)
363     abort ();
364   if (sortbuf != sortbuf_preallocated)
365     free (sortbuf);
366
367   *lengthp = length;
368   return result;
369
370  fail:
371   {
372     int saved_errno = errno;
373     if (sortbuf != sortbuf_preallocated)
374       free (sortbuf);
375     if (result != resultbuf)
376       free (result);
377     errno = saved_errno;
378   }
379   return NULL;
380 }