1 /* Test of compatibility normalization of UTF-8 strings.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2009. */
21 #if GNULIB_UNINORM_U8_NORMALIZE
/* SIZEOF (array): number of elements of ARRAY, computed as the total size
   of ARRAY divided by the size of its first element.  */
32 #define SIZEOF(array) (sizeof (array) / sizeof (array[0]))
/* ASSERT (expr): the body line shown here writes "FILE:LINE: assertion
   failed" to stderr, using __FILE__ and __LINE__ of the expansion site.
   NOTE(review): the test of EXPR and whatever follows the message are not
   among the lines shown here; presumably the message is emitted only when
   EXPR is false -- confirm.  */
33 #define ASSERT(expr) \
38 fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
/* Normalize INPUT (INPUT_LENGTH bytes) with
   u8_normalize (UNINORM_NFKC, ...) and compare the outcome with EXPECTED
   (EXPECTED_LENGTH bytes).  The call is repeated for three ways of
   supplying the result buffer: none at all, a buffer one byte too short,
   and a buffer of exactly the expected size.  Every scenario requires a
   non-NULL result, a reported length equal to EXPECTED_LENGTH, and
   contents equal to EXPECTED according to u8_cmp.
   Callers in this file treat a return value of 0 as success.
   NOTE(review): the statements executed when one of the conditions below
   fails are not among the lines shown here -- confirm what is returned.  */
46 check (const uint8_t *input, size_t input_length,
47 const uint8_t *expected, size_t expected_length)
52 /* Test return conventions with resultbuf == NULL. */
53 result = u8_normalize (UNINORM_NFKC, input, input_length, NULL, &length);
54 if (!(result != NULL))
56 if (!(length == expected_length))
58 if (!(u8_cmp (result, expected, expected_length) == 0))
62 /* Test return conventions with resultbuf too small. */
/* Only meaningful when there is at least one byte of expected output,
   because the buffer offered is expected_length - 1 bytes long.  */
63 if (expected_length > 0)
65 uint8_t *preallocated;
/* On input, length tells u8_normalize how large the offered buffer is;
   the checks below read it back as the length of the result.  */
67 length = expected_length - 1;
68 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
69 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
70 if (!(result != NULL))
/* The offered buffer was too short, so the result is required to be a
   different pointer than the one passed in.  */
72 if (!(result != preallocated))
74 if (!(length == expected_length))
76 if (!(u8_cmp (result, expected, expected_length) == 0))
82 /* Test return conventions with resultbuf large enough. */
84 uint8_t *preallocated;
86 length = expected_length;
87 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
88 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
89 if (!(result != NULL))
/* If malloc returned NULL, any non-NULL result is accepted; otherwise the
   result is required to be the caller's own buffer.  */
91 if (!(preallocated == NULL || result == preallocated))
93 if (!(length == expected_length))
95 if (!(u8_cmp (result, expected, expected_length) == 0))
/* Test vectors whose expected result is either the input itself or a
   composed form of it.  Each ASSERT requires check () to return 0.  */
106 { /* Empty string. */
107 ASSERT (check (NULL, 0, NULL, 0) == 0);
/* U+0020 SPACE: a single ASCII byte, expected to be returned unchanged.  */
110 static const uint8_t input[] = { 0x20 };
111 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
114 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* input is U+00C4; decomposed is 'A' followed by U+0308.  Both are
   expected to yield the single code point U+00C4.  */
115 static const uint8_t input[] = { 0xC3, 0x84 };
116 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
117 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
118 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
121 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
/* input is U+01DE; decomposed is 'A', U+0308, U+0304.  Both are expected
   to yield U+01DE.  */
122 static const uint8_t input[] = { 0xC7, 0x9E };
123 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
124 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
125 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
128 { /* ANGSTROM SIGN */
/* input is U+212B; decomposed is 'A' followed by U+030A.  Neither is the
   expected output: both, and expected itself, must yield U+00C5.  */
129 static const uint8_t input[] = { 0xE2, 0x84, 0xAB };
130 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
131 static const uint8_t expected[] = { 0xC3, 0x85 };
132 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
133 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
134 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
/* In every case of this group the array named decomposed is also the
   expected output: the first ASSERT requires input to be turned into
   decomposed, the second requires decomposed to be returned unchanged.  */
137 { /* GREEK DIALYTIKA AND PERISPOMENI */
/* U+1FC1 is expected to become SPACE, U+0308, U+0342.  */
138 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 };
139 static const uint8_t decomposed[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 };
140 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
141 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
144 { /* SCRIPT SMALL L */
145 static const uint8_t input[] = { 0xE2, 0x84, 0x93 };
146 static const uint8_t decomposed[] = { 0x6C };
147 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
148 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
151 { /* NO-BREAK SPACE */
152 static const uint8_t input[] = { 0xC2, 0xA0 };
153 static const uint8_t decomposed[] = { 0x20 };
154 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
155 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* The four positional forms of VEH (U+FB6A .. U+FB6D) below are all
   expected to become the same two bytes, U+06A4.  */
158 { /* ARABIC LETTER VEH INITIAL FORM */
159 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC };
160 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
161 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
162 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
165 { /* ARABIC LETTER VEH MEDIAL FORM */
166 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD };
167 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
168 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
169 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
172 { /* ARABIC LETTER VEH FINAL FORM */
173 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB };
174 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
175 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
176 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
179 { /* ARABIC LETTER VEH ISOLATED FORM */
180 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA };
181 static const uint8_t decomposed[] = { 0xDA, 0xA4 };
182 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
183 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
186 { /* CIRCLED NUMBER FIFTEEN */
/* One code point is expected to become the two ASCII digits "15".  */
187 static const uint8_t input[] = { 0xE2, 0x91, 0xAE };
188 static const uint8_t decomposed[] = { 0x31, 0x35 };
189 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
190 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
193 { /* TRADE MARK SIGN */
194 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 };
195 static const uint8_t decomposed[] = { 0x54, 0x4D };
196 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
197 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
200 { /* LATIN SUBSCRIPT SMALL LETTER I */
201 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 };
202 static const uint8_t decomposed[] = { 0x69 };
203 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
204 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
207 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
208 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 };
209 static const uint8_t decomposed[] = { 0x28 };
210 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
211 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
214 { /* FULLWIDTH LATIN CAPITAL LETTER A */
215 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 };
216 static const uint8_t decomposed[] = { 0x41 };
217 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
218 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
221 { /* HALFWIDTH IDEOGRAPHIC COMMA */
222 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 };
223 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
224 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
225 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
228 { /* SMALL IDEOGRAPHIC COMMA */
229 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 };
230 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
231 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
232 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* U+3392 SQUARE MHZ (bytes E3 8E 92) is expected to become the three
   ASCII letters "MHz".  */
236 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 };
237 static const uint8_t decomposed[] = { 0x4D, 0x48, 0x7A };
238 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
239 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
242 { /* VULGAR FRACTION THREE EIGHTHS */
/* Expected output is '3', U+2044 (bytes E2 81 84), '8'.  */
243 static const uint8_t input[] = { 0xE2, 0x85, 0x9C };
244 static const uint8_t decomposed[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 };
245 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
246 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* U+00B5 MICRO SIGN (bytes C2 B5) is expected to become U+03BC GREEK
   SMALL LETTER MU (bytes CE BC).  */
250 static const uint8_t input[] = { 0xC2, 0xB5 };
251 static const uint8_t decomposed[] = { 0xCE, 0xBC };
252 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
253 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
256 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
/* A single code point (U+FDFA) is expected to expand to the 33 bytes
   listed in decomposed, which is also required to be returned
   unchanged.  */
257 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA };
258 static const uint8_t decomposed[] =
259 { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9,
260 0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87,
261 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85
263 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
264 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
/* In the two Hangul cases the roles are reversed: input (the syllable) is
   the expected output, and the jamo sequence in decomposed is required to
   be composed into it.  */
267 { /* HANGUL SYLLABLE GEUL */
/* input is U+AE00; decomposed is the three jamo U+1100 U+1173 U+11AF.  */
268 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 };
269 static const uint8_t decomposed[] =
270 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
271 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
272 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
275 { /* HANGUL SYLLABLE GEU */
/* input is U+ADF8; decomposed is the two jamo U+1100 U+1173 (no trailing
   consonant).  */
276 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 };
277 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
278 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
279 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
282 { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
/* Three spellings of one mixed-script line: input (precomposed letters,
   with the superscript two as 0xC2 0xB2), decomposed (base letters plus
   combining marks, Hangul as jamo), and expected.  All three, including
   expected itself, are required to yield expected.
   NOTE(review): the string in the comment above ends in two Hangul
   syllables and decomposed carries jamo for two syllables (U+1112 U+1161
   U+11AB, then U+1100 U+1173 U+11AF), but input and expected as shown
   here contain only 0xEA 0xB8 0x80 (U+AE00).  A row holding
   0xED, 0x95, 0x9C (U+D55C) appears to be missing before the last row of
   both arrays -- confirm against the upstream file.  */
283 static const uint8_t input[] =
284 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
285 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
286 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
287 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
288 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
289 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
290 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
292 0xEA, 0xB8, 0x80, '\n'
294 static const uint8_t decomposed[] =
295 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
296 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
297 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
298 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
299 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
300 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
301 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
302 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
303 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
/* expected matches input byte for byte in the rows shown, except that the
   superscript two (0xC2 0xB2) is replaced by the ASCII digit 0x32.  */
305 static const uint8_t expected[] =
306 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
307 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
308 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
309 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
310 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
311 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
312 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
314 0xEA, 0xB8, 0x80, '\n'
316 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
317 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
318 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
322 /* Declare failure if test takes too long, by using default abort
323 caused by SIGALRM. */
324 signal (SIGALRM, SIG_DFL);
328 /* Check that the sorting is not O(n²) but O(n log n). */
/* NOTE(review): the values of m and repeat, the definition of m1, and the
   bodies of the loops below are not among the lines shown here.  */
331 for (pass = 0; pass < 3; pass++)
/* A single allocation of 2 * (2 * m - 1) bytes holds both strings: the
   first 2 * m - 1 bytes are input, and expected starts right after
   them.  */
335 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
338 uint8_t *expected = input + (2 * m - 1);
340 size_t m2 = (m - 1) / 2;
341 /* NB: m1 + m2 == m - 1. */
350 for (i = 0; i < m1; i++)
355 for (i = 0; i < m2; i++)
363 for (i = 0; i < m2; i++)
368 for (i = 0; i < m1; i++)
376 for (i = 0; i < m2; i++)
397 for (i = 0; i < m1; i++)
/* NOTE(review): m2 is a size_t, so m2 - 1 wraps around to a huge value if
   m2 is 0 (that is, if m < 3) -- confirm m is always large enough.  */
402 for (i = 0; i < m2 - 1; i++)
408 for (; repeat > 0; repeat--)
/* The expected output is one byte shorter (2 * m - 2) than the input
   (2 * m - 1), and is itself required to be returned unchanged.  */
410 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
411 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);