1 /* Decomposition and composition of Unicode strings.
2 Copyright (C) 2009-2013 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 FUNC (uninorm_t nf, const UNIT *s, size_t n,
20 UNIT *resultbuf, size_t *lengthp)
22 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
25 /* The result being accumulated. */
29 /* The buffer for sorting. */
30 #define SORTBUF_PREALLOCATED 64
31 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33 size_t sortbuf_allocated;
36 /* Initialize the accumulator. */
37 if (resultbuf == NULL)
49 /* Initialize the buffer for sorting. */
50 sortbuf = sortbuf_preallocated;
51 sortbuf_allocated = SORTBUF_PREALLOCATED;
55 const UNIT *s_end = s + n;
60 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
66 /* Fetch the next character. */
67 count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
70 /* Decompose it, recursively.
71 It would be possible to precompute the recursive decomposition
72 and store it in a table. But this would significantly increase
73 the size of the decomposition tables, because for example for
74 U+1FC1 the recursive canonical decomposition and the recursive
75 compatibility decomposition are different. */
79 for (curr = 0; curr < decomposed_count; )
81 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82 all elements are atomic. */
83 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84 int curr_decomposed_count;
86 curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87 if (curr_decomposed_count >= 0)
89 /* Move curr_decomposed[0..curr_decomposed_count-1] over
90 decomposed[curr], making room. It's not worth using
91 memcpy() here, since the counts are so small. */
92 int shift = curr_decomposed_count - 1;
100 decomposed_count += shift;
101 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
103 for (j = decomposed_count - 1 - shift; j > curr; j--)
104 decomposed[j + shift] = decomposed[j];
106 for (; shift >= 0; shift--)
107 decomposed[curr + shift] = curr_decomposed[shift];
111 /* decomposed[curr] is atomic. */
120 decomposed_count = 0;
131 /* Fetch the next character from the decomposition. */
132 if (i == decomposed_count)
135 ccc = uc_combining_class (uc);
139 /* End of string reached. */
148 /* Apply the canonical ordering algorithm to the accumulated
149 sequence of characters. */
150 if (sortbuf_count > 1)
151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152 sortbuf + sortbuf_count);
154 if (composer != NULL)
156 /* Attempt to combine decomposed characters, as specified
157 in the Unicode Standard Annex #15 "Unicode Normalization
158 Forms". We need to check
159 1. whether the first accumulated character is a
160 "starter" (i.e. has ccc = 0). This is usually the
161 case. But when the string starts with a
162 non-starter, the sortbuf also starts with a
163 non-starter. Btw, this check could also be
164 omitted, because the composition table has only
165 entries (code1, code2) for which code1 is a
166 starter; if the first accumulated character is not
167 a starter, no lookup will succeed.
168 2. If the sortbuf has more than one character, check
169 for each of these characters that are not "blocked"
170 from the starter (i.e. have a ccc that is higher
171 than the ccc of the previous character) whether it
172 can be combined with the first character.
173 3. If only one character is left in sortbuf, check
174 whether it can be combined with the next character
176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
178 for (j = 1; j < sortbuf_count; )
180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
183 composer (sortbuf[0].code, sortbuf[j].code);
188 sortbuf[0].code = combined;
189 /* sortbuf[0].ccc = 0, still valid. */
190 for (k = j + 1; k < sortbuf_count; k++)
191 sortbuf[k - 1] = sortbuf[k];
198 if (s < s_end && sortbuf_count == 1)
201 composer (sortbuf[0].code, uc);
206 /* uc could be further combined with subsequent
207 characters. So don't put it into sortbuf[0] in
208 this round, only in the next round. */
215 for (j = 0; j < sortbuf_count; j++)
217 ucs4_t muc = sortbuf[j].code;
219 /* Append muc to the result accumulator. */
220 if (length < allocated)
223 U_UCTOMB (result + length, muc, allocated - length);
236 size_t old_allocated = allocated;
237 size_t new_allocated = 2 * old_allocated;
238 if (new_allocated < 64)
240 if (new_allocated < old_allocated) /* integer overflow? */
247 (UNIT *) malloc (new_allocated * sizeof (UNIT));
248 if (larger_result == NULL)
254 else if (result == resultbuf)
257 (UNIT *) malloc (new_allocated * sizeof (UNIT));
258 if (larger_result == NULL)
263 U_CPY (larger_result, resultbuf, length);
268 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269 if (larger_result == NULL)
275 result = larger_result;
276 allocated = new_allocated;
279 U_UCTOMB (result + length, muc, allocated - length);
295 /* sortbuf is now empty. */
300 /* End of string reached. */
303 /* Append (uc, ccc) to sortbuf. */
304 if (sortbuf_count == sortbuf_allocated)
306 struct ucs4_with_ccc *new_sortbuf;
308 sortbuf_allocated = 2 * sortbuf_allocated;
309 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
312 (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313 if (new_sortbuf == NULL)
318 memcpy (new_sortbuf, sortbuf,
319 sortbuf_count * sizeof (struct ucs4_with_ccc));
320 if (sortbuf != sortbuf_preallocated)
322 sortbuf = new_sortbuf;
324 sortbuf[sortbuf_count].code = uc;
325 sortbuf[sortbuf_count].ccc = ccc;
332 /* End of string reached. */
343 /* Return a non-NULL value. NULL means error. */
344 result = (UNIT *) malloc (1);
352 else if (result != resultbuf && length < allocated)
354 /* Shrink the allocated memory if possible. */
357 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
362 if (sortbuf_count > 0)
364 if (sortbuf != sortbuf_preallocated)
372 int saved_errno = errno;
373 if (sortbuf != sortbuf_preallocated)
375 if (result != resultbuf)