1 /* Test of compatibility normalization of UTF-8 strings.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2009. */
21 #if GNULIB_UNINORM_U8_NORMALIZE
/* Number of elements of ARRAY, computed as total size divided by the size
   of the first element. ARRAY must be a real array object; applied to a
   pointer the division yields a meaningless value. */
32 #define SIZEOF(array) (sizeof (array) / sizeof (array[0]))
/* ASSERT (expr): test-suite assertion. The part of the body shown here
   writes "FILE:LINE: assertion failed" to stderr, using __FILE__ and
   __LINE__ of the expansion site. Presumably this happens only when EXPR
   evaluates to false - TODO confirm against the complete macro body. */
33 #define ASSERT(expr) \
38 fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
/* Normalize INPUT (INPUT_LENGTH bytes of UTF-8) with u8_normalize in
   UNINORM_NFKC mode and compare the outcome with EXPECTED (EXPECTED_LENGTH
   bytes). The call is exercised three ways: without a result buffer, with
   a buffer that is one byte too small (only if EXPECTED_LENGTH > 0), and
   with a buffer of exactly EXPECTED_LENGTH bytes. Callers in this file
   treat a return value of 0 as success.
   NOTE(review): no NULL check of the malloc results is visible in this
   function - confirm whether that is intended for test code. */
46 check (const uint8_t *input, size_t input_length,
47 const uint8_t *expected, size_t expected_length)
52 /* Test return conventions with resultbuf == NULL: the call must yield a
   non-NULL result whose length and contents match EXPECTED. */
53 result = u8_normalize (UNINORM_NFKC, input, input_length, NULL, &length);
54 if (!(result != NULL))
56 if (!(length == expected_length))
58 if (!(u8_cmp (result, expected, expected_length) == 0))
62 /* Test return conventions with resultbuf too small: the buffer offered is
   one byte shorter than EXPECTED, so the result must be non-NULL, must NOT
   be the offered buffer, and must still match EXPECTED in length and
   contents. */
63 if (expected_length > 0)
65 uint8_t *preallocated;
67 length = expected_length - 1;
68 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
69 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
70 if (!(result != NULL))
72 if (!(result != preallocated))
74 if (!(length == expected_length))
76 if (!(u8_cmp (result, expected, expected_length) == 0))
82 /* Test return conventions with resultbuf large enough: the buffer offered
   has exactly EXPECTED_LENGTH bytes, so the result must be that very
   buffer, again with matching length and contents. */
84 uint8_t *preallocated;
86 length = expected_length;
87 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
88 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
89 if (!(result != NULL))
91 if (!(result == preallocated))
93 if (!(length == expected_length))
95 if (!(u8_cmp (result, expected, expected_length) == 0))
/* SPACE (U+0020): an ASCII character is expected to normalize to itself. */
107 static const uint8_t input[] = { 0x20 };
108 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
/* In the next two cases the precomposed character is expected to be kept
   unchanged, and the base letter followed by its combining marks is
   expected to be composed into that same character. */
111 { /* LATIN CAPITAL LETTER A WITH DIAERESIS (U+00C4); decomposed form is U+0041 U+0308 */
112 static const uint8_t input[] = { 0xC3, 0x84 };
113 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
114 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
115 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
118 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON (U+01DE); decomposed form is U+0041 U+0308 U+0304 */
119 static const uint8_t input[] = { 0xC7, 0x9E };
120 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
121 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
122 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
125 { /* ANGSTROM SIGN (U+212B) */
126 static const uint8_t input[] = { 0xE2, 0x84, 0xAB };
127 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
128 static const uint8_t expected[] = { 0xC3, 0x85 };
/* Unlike the cases above, the input character itself is not the expected
   result: both U+212B and the sequence U+0041 U+030A must come out as
   U+00C5 (bytes 0xC3 0x85), and U+00C5 must be left unchanged. */
129 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
130 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
131 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
/* Compatibility mappings. In each of the following cases the input
   character is expected to be replaced by the `decomposed' byte sequence,
   and that sequence is expected to be left unchanged when normalized
   again. */
134 { /* GREEK DIALYTIKA AND PERISPOMENI (U+1FC1) -> U+0020 U+0308 U+0342 */
135 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 };
136 static const uint8_t decomposed[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 };
137 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
138 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
141 { /* SCRIPT SMALL L (U+2113) -> 'l' */
142 static const uint8_t input[] = { 0xE2, 0x84, 0x93 };
143 static const uint8_t decomposed[] = { 0x6C };
144 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
145 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
148 { /* NO-BREAK SPACE (U+00A0) -> U+0020 */
149 static const uint8_t input[] = { 0xC2, 0xA0 };
150 static const uint8_t decomposed[] = { 0x20 };
151 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
152 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
155 { /* ARABIC LETTER VEH INITIAL FORM (U+FB6C) -> U+06A4 */
156 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC };
157 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
158 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
159 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
162 { /* ARABIC LETTER VEH MEDIAL FORM (U+FB6D) -> U+06A4 */
163 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD };
164 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
165 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
166 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
169 { /* ARABIC LETTER VEH FINAL FORM (U+FB6B) -> U+06A4 */
170 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB };
171 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
172 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
173 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
176 { /* ARABIC LETTER VEH ISOLATED FORM (U+FB6A) -> U+06A4 */
177 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA };
178 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
179 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
180 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
183 { /* CIRCLED NUMBER FIFTEEN (U+246E) -> "15" */
184 static const uint8_t input[] = { 0xE2, 0x91, 0xAE };
185 static const uint8_t decomposed[] = { 0x31, 0x35 };
186 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
187 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
190 { /* TRADE MARK SIGN (U+2122) -> "TM" */
191 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 };
192 static const uint8_t decomposed[] = { 0x54, 0x4D };
193 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
194 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
197 { /* LATIN SUBSCRIPT SMALL LETTER I (U+1D62) -> 'i' */
198 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 };
199 static const uint8_t decomposed[] = { 0x69 };
200 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
201 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
204 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS (U+FE35) -> '(' */
205 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 };
206 static const uint8_t decomposed[] = { 0x28 };
207 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
208 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
211 { /* FULLWIDTH LATIN CAPITAL LETTER A (U+FF21) -> 'A' */
212 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 };
213 static const uint8_t decomposed[] = { 0x41 };
214 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
215 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
218 { /* HALFWIDTH IDEOGRAPHIC COMMA (U+FF64) -> U+3001 */
219 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 };
220 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
221 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
222 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
225 { /* SMALL IDEOGRAPHIC COMMA (U+FE51) -> U+3001 */
226 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 };
227 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
228 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
229 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* SQUARE MHZ (U+3392): expected to be replaced by the three ASCII letters
   "MHz", which in turn must be left unchanged. */
233 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 };
234 static const uint8_t decomposed[] = { 0x4D, 0x48, 0x7A };
235 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
236 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
239 { /* VULGAR FRACTION THREE EIGHTHS (U+215C) -> '3' U+2044 '8' */
240 static const uint8_t input[] = { 0xE2, 0x85, 0x9C };
241 static const uint8_t decomposed[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 };
242 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
243 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* MICRO SIGN (U+00B5): expected to be replaced by GREEK SMALL LETTER MU
   (U+03BC, bytes 0xCE 0xBC), which in turn must be left unchanged. */
247 static const uint8_t input[] = { 0xC2, 0xB5 };
248 static const uint8_t decomposed[] = { 0xCE, 0xBC };
249 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
250 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
253 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM (U+FDFA): a single
   character that is expected to expand to the 33-byte, four-word sequence
   below (words separated by 0x20) */
254 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA };
255 static const uint8_t decomposed[] =
256 { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9,
257 0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87,
258 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85
260 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
261 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* Hangul: the precomposed syllable is expected to be kept unchanged, and
   its sequence of conjoining jamo is expected to be composed into it. */
264 { /* HANGUL SYLLABLE GEUL (U+AE00); jamo sequence is U+1100 U+1173 U+11AF */
265 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 };
266 static const uint8_t decomposed[] =
267 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
268 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
269 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
272 { /* HANGUL SYLLABLE GEU (U+ADF8); jamo sequence is U+1100 U+1173 */
273 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 };
274 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
275 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
276 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
/* Mixed-script sentence exercising several mappings at once.
   `input' holds the text with precomposed characters and SUPERSCRIPT TWO
   (0xC2 0xB2). `decomposed' spells the u-diaeresis, the Cyrillic short i
   and the Hangul syllables as base characters plus combining marks / jamo,
   and already has the plain digit 0x32. `expected' matches `input' except
   that SUPERSCRIPT TWO has become the digit 0x32. All three arrays must
   normalize to `expected'. */
279 { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
280 static const uint8_t input[] =
281 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
282 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
283 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
284 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
285 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
286 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
287 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
289 0xEA, 0xB8, 0x80, '\n'
291 static const uint8_t decomposed[] =
292 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
293 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
294 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
295 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
296 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
297 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
298 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
299 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
300 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
302 static const uint8_t expected[] =
303 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
304 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
305 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
306 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
307 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
308 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
309 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
311 0xEA, 0xB8, 0x80, '\n'
313 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
314 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
315 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
319 /* Declare failure if test takes too long, by using default abort
320 caused by SIGALRM. */
/* Restores the default SIGALRM disposition. NOTE(review): the call that
   arms the alarm is not among the lines shown here - confirm the timeout
   value where it is set. */
321 signal (SIGALRM, SIG_DFL);
325 /* Check that the sorting is not O(n²) but O(n log n). */
328 for (pass = 0; pass < 3; pass++)
/* A single allocation of 2 * (2 * m - 1) bytes holds both strings: the
   first 2 * m - 1 bytes are the input, and `expected' points at the
   second half of the same buffer.
   NOTE(review): no NULL check of the malloc result is visible in these
   lines - confirm it exists in the surrounding code. */
332 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
335 uint8_t *expected = input + (2 * m - 1);
337 size_t m2 = (m - 1) / 2;
338 /* NB: m1 + m2 == m - 1. */
347 for (i = 0; i < m1; i++)
352 for (i = 0; i < m2; i++)
360 for (i = 0; i < m2; i++)
365 for (i = 0; i < m1; i++)
373 for (i = 0; i < m2; i++)
394 for (i = 0; i < m1; i++)
399 for (i = 0; i < m2 - 1; i++)
405 for (; repeat > 0; repeat--)
/* The constructed input of 2 * m - 1 bytes must normalize to the
   2 * m - 2 bytes at `expected', and that result must itself be left
   unchanged by a further normalization. */
407 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
408 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);