1 /* Stream-based normalization of Unicode strings.
2 Copyright (C) 2009-2012 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
29 #include "normalize-internal.h"
30 #include "decompose-internal.h"
35 /* Characteristics of the normalization form. */
36 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
37 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
39 /* The encapsulated stream. */
40 int (*stream_func) (void *stream_data, ucs4_t uc);
43 /* The buffer for sorting. */
44 #define SORTBUF_PREALLOCATED 64
45 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
46 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
47 size_t sortbuf_allocated;
51 struct uninorm_filter *
52 uninorm_filter_create (uninorm_t nf,
53 int (*stream_func) (void *stream_data, ucs4_t uc),
56 struct uninorm_filter *filter =
57 (struct uninorm_filter *) malloc (sizeof (struct uninorm_filter));
60 /* errno is ENOMEM. */
63 filter->decomposer = nf->decomposer;
64 filter->composer = nf->composer;
65 filter->stream_func = stream_func;
66 filter->stream_data = stream_data;
67 filter->sortbuf = filter->sortbuf_preallocated;
68 filter->sortbuf_allocated = SORTBUF_PREALLOCATED;
69 filter->sortbuf_count = 0;
75 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc_arg)
77 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
80 /* Accept the next character. */
81 decomposed[0] = uc_arg;
84 /* Decompose it, recursively.
85 It would be possible to precompute the recursive decomposition
86 and store it in a table. But this would significantly increase
87 the size of the decomposition tables, because for example for
88 U+1FC1 the recursive canonical decomposition and the recursive
89 compatibility decomposition are different. */
93 for (curr = 0; curr < decomposed_count; )
95 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
96 all elements are atomic. */
97 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
98 int curr_decomposed_count;
100 curr_decomposed_count =
101 filter->decomposer (decomposed[curr], curr_decomposed);
102 if (curr_decomposed_count >= 0)
104 /* Move curr_decomposed[0..curr_decomposed_count-1] over
105 decomposed[curr], making room. It's not worth using
106 memcpy() here, since the counts are so small. */
107 int shift = curr_decomposed_count - 1;
115 decomposed_count += shift;
116 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
118 for (j = decomposed_count - 1 - shift; j > curr; j--)
119 decomposed[j + shift] = decomposed[j];
121 for (; shift >= 0; shift--)
122 decomposed[curr + shift] = curr_decomposed[shift];
126 /* decomposed[curr] is atomic. */
133 /* Cache sortbuf and sortbuf_count in local register variables. */
134 struct ucs4_with_ccc * const sortbuf = filter->sortbuf;
135 size_t sortbuf_count = filter->sortbuf_count;
138 for (i = 0; i < decomposed_count; i++)
140 /* Fetch the next character from the decomposition. */
141 ucs4_t uc = decomposed[i];
142 int ccc = uc_combining_class (uc);
148 /* Apply the canonical ordering algorithm to the accumulated
149 sequence of characters. */
150 if (sortbuf_count > 1)
151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152 sortbuf + sortbuf_count);
154 if (filter->composer != NULL)
156 /* Attempt to combine decomposed characters, as specified
157 in the Unicode Standard Annex #15 "Unicode Normalization
158 Forms". We need to check
159 1. whether the first accumulated character is a
160 "starter" (i.e. has ccc = 0). This is usually the
161 case. But when the string starts with a
162 non-starter, the sortbuf also starts with a
163 non-starter. Btw, this check could also be
164 omitted, because the composition table has only
165 entries (code1, code2) for which code1 is a
166 starter; if the first accumulated character is not
167 a starter, no lookup will succeed.
168 2. If the sortbuf has more than one character, check
169 for each of these characters that are not "blocked"
170 from the starter (i.e. have a ccc that is higher
171 than the ccc of the previous character) whether it
172 can be combined with the first character.
173 3. If only one character is left in sortbuf, check
174 whether it can be combined with the next character
176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
178 for (j = 1; j < sortbuf_count; )
180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
183 filter->composer (sortbuf[0].code, sortbuf[j].code);
188 sortbuf[0].code = combined;
189 /* sortbuf[0].ccc = 0, still valid. */
190 for (k = j + 1; k < sortbuf_count; k++)
191 sortbuf[k - 1] = sortbuf[k];
198 if (sortbuf_count == 1)
201 filter->composer (sortbuf[0].code, uc);
206 /* uc could be further combined with subsequent
207 characters. So don't put it into sortbuf[0] in
208 this round, only in the next round. */
215 for (j = 0; j < sortbuf_count; j++)
217 ucs4_t muc = sortbuf[j].code;
219 /* Output muc to the encapsulated stream. */
220 int ret = filter->stream_func (filter->stream_data, muc);
223 /* errno is set here. */
224 filter->sortbuf_count = 0;
229 /* sortbuf is now empty. */
233 /* Append (uc, ccc) to sortbuf. */
234 if (sortbuf_count == filter->sortbuf_allocated)
236 struct ucs4_with_ccc *new_sortbuf;
238 filter->sortbuf_allocated = 2 * filter->sortbuf_allocated;
239 if (filter->sortbuf_allocated < sortbuf_count) /* integer overflow? */
242 (struct ucs4_with_ccc *)
243 malloc (2 * filter->sortbuf_allocated * sizeof (struct ucs4_with_ccc));
244 if (new_sortbuf == NULL)
246 /* errno is ENOMEM. */
247 filter->sortbuf_count = sortbuf_count;
250 memcpy (new_sortbuf, filter->sortbuf,
251 sortbuf_count * sizeof (struct ucs4_with_ccc));
252 if (filter->sortbuf != filter->sortbuf_preallocated)
253 free (filter->sortbuf);
254 filter->sortbuf = new_sortbuf;
256 filter->sortbuf[sortbuf_count].code = uc;
257 filter->sortbuf[sortbuf_count].ccc = ccc;
261 filter->sortbuf_count = sortbuf_count;
267 /* Bring data buffered in the filter to its destination, the encapsulated
269 Return 0 if successful, or -1 with errno set upon failure.
270 Note! If after calling this function, additional characters are written
271 into the filter, the resulting character sequence in the encapsulated stream
272 will not necessarily be normalized. */
274 uninorm_filter_flush (struct uninorm_filter *filter)
276 /* Cache sortbuf and sortbuf_count in local register variables. */
277 struct ucs4_with_ccc * const sortbuf = filter->sortbuf;
278 size_t sortbuf_count = filter->sortbuf_count;
281 /* Apply the canonical ordering algorithm to the accumulated
282 sequence of characters. */
283 if (sortbuf_count > 1)
284 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
285 sortbuf + sortbuf_count);
287 if (filter->composer != NULL)
289 /* Attempt to combine decomposed characters, as specified
290 in the Unicode Standard Annex #15 "Unicode Normalization
291 Forms". We need to check
292 1. whether the first accumulated character is a
293 "starter" (i.e. has ccc = 0). This is usually the
294 case. But when the string starts with a
295 non-starter, the sortbuf also starts with a
296 non-starter. Btw, this check could also be
297 omitted, because the composition table has only
298 entries (code1, code2) for which code1 is a
299 starter; if the first accumulated character is not
300 a starter, no lookup will succeed.
301 2. If the sortbuf has more than one character, check
302 for each of these characters that are not "blocked"
303 from the starter (i.e. have a ccc that is higher
304 than the ccc of the previous character) whether it
305 can be combined with the first character.
306 3. If only one character is left in sortbuf, check
307 whether it can be combined with the next character
309 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
311 for (j = 1; j < sortbuf_count; )
313 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
316 filter->composer (sortbuf[0].code, sortbuf[j].code);
321 sortbuf[0].code = combined;
322 /* sortbuf[0].ccc = 0, still valid. */
323 for (k = j + 1; k < sortbuf_count; k++)
324 sortbuf[k - 1] = sortbuf[k];
334 for (j = 0; j < sortbuf_count; j++)
336 ucs4_t muc = sortbuf[j].code;
338 /* Output muc to the encapsulated stream. */
339 int ret = filter->stream_func (filter->stream_data, muc);
342 /* errno is set here. */
343 filter->sortbuf_count = 0;
348 /* sortbuf is now empty. */
349 filter->sortbuf_count = 0;
354 /* Bring data buffered in the filter to its destination, the encapsulated
355 stream, then close and free the filter.
356 Return 0 if successful, or -1 with errno set upon failure. */
358 uninorm_filter_free (struct uninorm_filter *filter)
360 int ret = uninorm_filter_flush (filter);
363 /* errno is set here. */
366 if (filter->sortbuf_count > 0)
368 if (filter->sortbuf != filter->sortbuf_preallocated)
369 free (filter->sortbuf);