1 /* Test of canonical normalization of UTF-8 strings.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2009. */
21 #if GNULIB_UNINORM_U8_NORMALIZE
/* Number of elements in ARRAY.  ARRAY must be a true array object, not a
   pointer: for a pointer the quotient is meaningless.  The argument is
   parenthesized in both places so that any array-valued expression can be
   passed without precedence surprises.  */
#define SIZEOF(array) (sizeof (array) / sizeof ((array)[0]))
/* ASSERT (expr): test-local assertion macro.  The visible part of the
   expansion writes "FILE:LINE: assertion failed" to stderr, using the
   __FILE__ and __LINE__ of the point of use.
   NOTE(review): the test of EXPR and whatever follows the fprintf are not
   visible in this listing; presumably the message is printed only when EXPR
   is false -- confirm against the complete macro.  */
33 #define ASSERT(expr) \
38 fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
/* Run u8_normalize (UNINORM_NFC, ...) on INPUT[0..INPUT_LENGTH-1] under three
   result-buffer conventions and compare each outcome with
   EXPECTED[0..EXPECTED_LENGTH-1]:
     1. no result buffer (NULL),
     2. a result buffer one unit too small,
     3. a result buffer of exactly the expected size.
   Callers in this file treat a return value of 0 as success.
   NOTE(review): the statement executed when one of the checks below fails is
   not visible in this listing; presumably each failure returns a distinct
   nonzero code -- confirm.  */
46 check (const uint8_t *input, size_t input_length,
47 const uint8_t *expected, size_t expected_length)
52 /* Test return conventions with resultbuf == NULL. */
53 result = u8_normalize (UNINORM_NFC, input, input_length, NULL, &length);
/* The call must succeed ...  */
54 if (!(result != NULL))
/* ... report exactly the expected number of units ...  */
56 if (!(length == expected_length))
/* ... and produce exactly the expected units.  */
58 if (!(u8_cmp (result, expected, expected_length) == 0))
62 /* Test return conventions with resultbuf too small. */
/* Skipped for an empty expected result: expected_length - 1 would wrap
   around, since size_t is unsigned.  */
63 if (expected_length > 0)
65 uint8_t *preallocated;
/* length is in/out: it carries the buffer size in and is compared against
   the real result length after the call.  */
67 length = expected_length - 1;
68 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
69 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
70 if (!(result != NULL))
/* The result cannot fit, so the returned pointer must differ from the
   caller's buffer.  */
72 if (!(result != preallocated))
74 if (!(length == expected_length))
76 if (!(u8_cmp (result, expected, expected_length) == 0))
82 /* Test return conventions with resultbuf large enough. */
84 uint8_t *preallocated;
86 length = expected_length;
87 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
88 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
89 if (!(result != NULL))
/* The result fits, so it must have been stored in the caller's buffer.  A
   NULL preallocated (e.g. if malloc returned NULL) is tolerated: the call
   then behaves like the resultbuf == NULL case.  */
91 if (!(preallocated == NULL || result == preallocated))
93 if (!(length == expected_length))
95 if (!(u8_cmp (result, expected, expected_length) == 0))
/* Single-character NFC test cases.  Each case hands a short UTF-8 byte
   sequence to check () and asserts that it returns 0.  Where a `decomposed'
   array is present, it must normalize to the same bytes as the precomposed
   form; where only `input' is present, the bytes are expected to come out of
   NFC normalization unchanged.  */
106 { /* Empty string. */
/* NULL pointers with length 0 on both the input and the expected side.  */
107 ASSERT (check (NULL, 0, NULL, 0) == 0);
/* SPACE (U+0020): a plain ASCII byte, expected to be unchanged.  */
110 static const uint8_t input[] = { 0x20 };
111 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
114 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
115 static const uint8_t input[] = { 0xC3, 0x84 };
/* 'A' followed by U+0308 (bytes CC 88).  */
116 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
117 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
118 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
121 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
122 static const uint8_t input[] = { 0xC7, 0x9E };
/* 'A' followed by U+0308 and U+0304: two combining marks must compose into
   the single precomposed character.  */
123 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
124 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
125 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
128 { /* ANGSTROM SIGN */
129 static const uint8_t input[] = { 0xE2, 0x84, 0xAB };
130 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
/* Unlike the other cases, the input itself is not expected back: both
   U+212B and 'A' + U+030A must normalize to U+00C5 (bytes C3 85).  */
131 static const uint8_t expected[] = { 0xC3, 0x85 };
132 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
133 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
134 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
137 { /* GREEK DIALYTIKA AND PERISPOMENI */
138 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 };
139 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
142 { /* SCRIPT SMALL L */
143 static const uint8_t input[] = { 0xE2, 0x84, 0x93 };
144 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
147 { /* NO-BREAK SPACE */
148 static const uint8_t input[] = { 0xC2, 0xA0 };
149 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
152 { /* ARABIC LETTER VEH INITIAL FORM */
153 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC };
154 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
157 { /* ARABIC LETTER VEH MEDIAL FORM */
158 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD };
159 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
162 { /* ARABIC LETTER VEH FINAL FORM */
163 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB };
164 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
167 { /* ARABIC LETTER VEH ISOLATED FORM */
168 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA };
169 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
172 { /* CIRCLED NUMBER FIFTEEN */
173 static const uint8_t input[] = { 0xE2, 0x91, 0xAE };
174 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
177 { /* TRADE MARK SIGN */
178 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 };
179 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
182 { /* LATIN SUBSCRIPT SMALL LETTER I */
183 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 };
184 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
187 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
188 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 };
189 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
192 { /* FULLWIDTH LATIN CAPITAL LETTER A */
193 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 };
194 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
197 { /* HALFWIDTH IDEOGRAPHIC COMMA */
198 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 };
199 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
202 { /* SMALL IDEOGRAPHIC COMMA */
203 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 };
204 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
/* SQUARE MHZ (U+3392, bytes E3 8E 92), expected to be unchanged.  */
208 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 };
209 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
212 { /* VULGAR FRACTION THREE EIGHTHS */
213 static const uint8_t input[] = { 0xE2, 0x85, 0x9C };
214 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
/* MICRO SIGN (U+00B5, bytes C2 B5), expected to be unchanged.  */
218 static const uint8_t input[] = { 0xC2, 0xB5 };
219 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
222 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
223 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA };
224 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
227 { /* HANGUL SYLLABLE GEUL */
228 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 };
/* Conjoining jamo U+1100 U+1173 U+11AF (leading consonant, vowel, trailing
   consonant); they must compose into the single syllable U+AE00.  */
229 static const uint8_t decomposed[] =
230 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
231 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
232 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
235 { /* HANGUL SYLLABLE GEU */
236 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 };
/* Same leading consonant and vowel as above, without a trailing consonant.  */
237 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
238 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
239 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
/* Mixed-script sentence: `input' is the precomposed (NFC) form, `decomposed'
   spells the same text with the u-diaeresis, the Cyrillic short i and the
   Hangul syllables broken into base characters plus combining marks or
   conjoining jamo.  Both must normalize to `input'.  */
242 { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
243 static const uint8_t input[] =
244 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
245 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
246 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
247 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
248 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
249 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
250 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
/* NOTE(review): `decomposed' below holds two Hangul syllables in jamo form
   (U+1112 U+1161 U+11AB, then U+1100 U+1173 U+11AF), but only the second
   syllable (EA B8 80) is visible here.  The precomposed first syllable
   U+D55C (bytes 0xED, 0x95, 0x9C) is expected in front of it -- confirm it
   is present in the complete file.  */
252 0xEA, 0xB8, 0x80, '\n'
254 static const uint8_t decomposed[] =
/* 'u' + U+0308 instead of the precomposed u-diaeresis.  */
255 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
256 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
/* U+0438 + U+0306 (D0 B8 CC 86) instead of the precomposed U+0439.  */
257 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
258 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
259 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
260 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
261 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
/* The two Hangul syllables as conjoining jamo, three code points each.  */
262 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
263 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
/* Already-normalized text must be a fixed point; the decomposed spelling
   must compose back to it.  */
265 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
266 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
/* Timing test: builds large inputs and normalizes them, relying on SIGALRM
   to kill the process if normalization is too slow.
   NOTE(review): the loop bodies that fill `input' and `expected', the
   definitions of m, m1 and repeat, and the call that arms the alarm are not
   visible in this listing; the comments below describe only what the
   visible lines establish.  */
270 /* Declare failure if test takes too long, by using default abort
271 caused by SIGALRM. */
/* Restore the default SIGALRM disposition so that an expiring alarm is not
   ignored or caught by an inherited handler.  */
272 signal (SIGALRM, SIG_DFL);
276 /* Check that the sorting is not O(n²) but O(n log n). */
279 for (pass = 0; pass < 3; pass++)
/* One allocation serves both arrays: the first 2*m - 1 units are the input,
   the second 2*m - 1 units hold the expected output.  */
283 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
286 uint8_t *expected = input + (2 * m - 1);
288 size_t m2 = (m - 1) / 2;
289 /* NB: m1 + m2 == m - 1. */
298 for (i = 0; i < m1; i++)
303 for (i = 0; i < m2; i++)
311 for (i = 0; i < m2; i++)
316 for (i = 0; i < m1; i++)
324 for (i = 0; i < m2; i++)
345 for (i = 0; i < m1; i++)
/* NOTE(review): m2 is a size_t, so m2 - 1 wraps around to a huge value if
   m2 is 0; this assumes m >= 3 -- confirm against the values m takes.  */
350 for (i = 0; i < m2 - 1; i++)
/* Repeat the measurement `repeat' times.  The normalized form is one unit
   shorter than the input (2*m - 2 versus 2*m - 1), and the expected output
   must itself be a fixed point of NFC normalization.  */
356 for (; repeat > 0; repeat--)
358 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
359 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);