From: Bruno Haible Date: Sat, 21 Feb 2009 11:49:36 +0000 (+0100) Subject: Tests for module 'uninorm/nfc'. X-Git-Tag: v0.1~6291 X-Git-Url: http://erislabs.net/gitweb/?a=commitdiff_plain;h=bda22ddb917fc5764b51a693ef6a888462aa6301;p=gnulib.git Tests for module 'uninorm/nfc'. --- diff --git a/ChangeLog b/ChangeLog index 260711995..c59851de3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,14 @@ 2009-02-21 Bruno Haible + Tests for module 'uninorm/nfc'. + * tests/uninorm/test-nfc.c: New file. + * tests/uninorm/test-u8-nfc.c: New file. + * tests/uninorm/test-u16-nfc.c: New file. + * tests/uninorm/test-u32-nfc.c: New file. + * tests/uninorm/test-u32-nfc-big.sh: New file. + * tests/uninorm/test-u32-nfc-big.c: New file. + * modules/uninorm/nfc-tests: New file. + New module 'uninorm/nfc'. * lib/uninorm/nfc.c: New file. * modules/uninorm/nfc: New file. diff --git a/modules/uninorm/nfc-tests b/modules/uninorm/nfc-tests new file mode 100644 index 000000000..55e2d293e --- /dev/null +++ b/modules/uninorm/nfc-tests @@ -0,0 +1,33 @@ +Files: +tests/uninorm/test-nfc.c +tests/uninorm/test-u8-nfc.c +tests/uninorm/test-u16-nfc.c +tests/uninorm/test-u32-nfc.c +tests/uninorm/test-u32-nfc-big.sh +tests/uninorm/test-u32-nfc-big.c +tests/uninorm/test-u32-normalize-big.h +tests/uninorm/test-u32-normalize-big.c +tests/uninorm/NormalizationTest.txt + +Depends-on: +unistr/u8-cmp +unistr/u16-cmp +unistr/u32-cmp +unistr/u32-strlen +xalloc +progname + +configure.ac: +AC_CHECK_DECLS_ONCE([alarm]) + +Makefile.am: +TESTS += test-nfc uninorm/test-u32-nfc-big.sh +check_PROGRAMS += test-nfc test-u32-nfc-big +test_nfc_SOURCES = \ + uninorm/test-nfc.c \ + uninorm/test-u8-nfc.c \ + uninorm/test-u16-nfc.c \ + uninorm/test-u32-nfc.c +test_u32_nfc_big_SOURCES = \ + uninorm/test-u32-nfc-big.c \ + uninorm/test-u32-normalize-big.c diff --git a/tests/uninorm/test-nfc.c b/tests/uninorm/test-nfc.c new file mode 100644 index 000000000..7e7ce9c5c --- /dev/null +++ b/tests/uninorm/test-nfc.c @@ -0,0 +1,38 @@ +/* Test 
of canonical normalization of Unicode strings. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2009. */ + +#include <config.h> + +#include "uninorm.h" + +/* Check that UNINORM_NFC is defined and links. */ +uninorm_t n = UNINORM_NFC; + +extern void test_u8_nfc (void); +extern void test_u16_nfc (void); +extern void test_u32_nfc (void); + +int +main () +{ + test_u32_nfc (); + test_u16_nfc (); + test_u8_nfc (); + + return 0; +} diff --git a/tests/uninorm/test-u16-nfc.c b/tests/uninorm/test-u16-nfc.c new file mode 100644 index 000000000..397914c5f --- /dev/null +++ b/tests/uninorm/test-u16-nfc.c @@ -0,0 +1,342 @@ +/* Test of canonical normalization of UTF-16 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2009. 
*/ + +#include <config.h> + +#if GNULIB_UNINORM_U16_NORMALIZE + +#include "uninorm.h" + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "unistr.h" + +#define SIZEOF(array) (sizeof (array) / sizeof (array[0])) +#define ASSERT(expr) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \ + fflush (stderr); \ + abort (); \ + } \ + } \ + while (0) + +static int +check (const uint16_t *input, size_t input_length, + const uint16_t *expected, size_t expected_length) +{ + size_t length; + uint16_t *result; + + /* Test return conventions with resultbuf == NULL. */ + result = u16_normalize (UNINORM_NFC, input, input_length, NULL, &length); + if (!(result != NULL)) + return 1; + if (!(length == expected_length)) + return 2; + if (!(u16_cmp (result, expected, expected_length) == 0)) + return 3; + free (result); + + /* Test return conventions with resultbuf too small. */ + if (expected_length > 0) + { + uint16_t *preallocated; + + length = expected_length - 1; + preallocated = (uint16_t *) malloc (length * sizeof (uint16_t)); + result = u16_normalize (UNINORM_NFC, input, input_length, preallocated, &length); + if (!(result != NULL)) + return 4; + if (!(result != preallocated)) + return 5; + if (!(length == expected_length)) + return 6; + if (!(u16_cmp (result, expected, expected_length) == 0)) + return 7; + free (result); + free (preallocated); + } + + /* Test return conventions with resultbuf large enough. 
*/ + { + uint16_t *preallocated; + + length = expected_length; + preallocated = (uint16_t *) malloc (length * sizeof (uint16_t)); + result = u16_normalize (UNINORM_NFC, input, input_length, preallocated, &length); + if (!(result != NULL)) + return 8; + if (!(result == preallocated)) + return 9; + if (!(length == expected_length)) + return 10; + if (!(u16_cmp (result, expected, expected_length) == 0)) + return 11; + free (preallocated); + } + + return 0; +} + +void +test_u16_nfc (void) +{ + { /* SPACE */ + static const uint16_t input[] = { 0x0020 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* LATIN CAPITAL LETTER A WITH DIAERESIS */ + static const uint16_t input[] = { 0x00C4 }; + static const uint16_t decomposed[] = { 0x0041, 0x0308 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ + static const uint16_t input[] = { 0x01DE }; + static const uint16_t decomposed[] = { 0x0041, 0x0308, 0x0304 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* ANGSTROM SIGN */ + static const uint16_t input[] = { 0x212B }; + static const uint16_t decomposed[] = { 0x0041, 0x030A }; + static const uint16_t expected[] = { 0x00C5 }; + ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0); + ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0); + } + + { /* GREEK DIALYTIKA AND PERISPOMENI */ + static const uint16_t input[] = { 0x1FC1 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SCRIPT SMALL L */ + static const uint16_t input[] = { 0x2113 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 
0); + } + + { /* NO-BREAK SPACE */ + static const uint16_t input[] = { 0x00A0 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH INITIAL FORM */ + static const uint16_t input[] = { 0xFB6C }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH MEDIAL FORM */ + static const uint16_t input[] = { 0xFB6D }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH FINAL FORM */ + static const uint16_t input[] = { 0xFB6B }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH ISOLATED FORM */ + static const uint16_t input[] = { 0xFB6A }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* CIRCLED NUMBER FIFTEEN */ + static const uint16_t input[] = { 0x246E }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* TRADE MARK SIGN */ + static const uint16_t input[] = { 0x2122 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* LATIN SUBSCRIPT SMALL LETTER I */ + static const uint16_t input[] = { 0x1D62 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */ + static const uint16_t input[] = { 0xFE35 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* FULLWIDTH LATIN CAPITAL LETTER A */ + static const uint16_t input[] = { 0xFF21 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* HALFWIDTH IDEOGRAPHIC COMMA */ + static const uint16_t input[] = { 0xFF64 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SMALL IDEOGRAPHIC COMMA */ + static const uint16_t input[] = { 0xFE51 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SQUARE MHZ */ + static const uint16_t input[] = { 0x3392 
}; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* VULGAR FRACTION THREE EIGHTHS */ + static const uint16_t input[] = { 0x215C }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* MICRO SIGN */ + static const uint16_t input[] = { 0x00B5 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */ + static const uint16_t input[] = { 0xFDFA }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* HANGUL SYLLABLE GEUL */ + static const uint16_t input[] = { 0xAE00 }; + static const uint16_t decomposed[] = { 0x1100, 0x1173, 0x11AF }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* HANGUL SYLLABLE GEU */ + static const uint16_t input[] = { 0xADF8 }; + static const uint16_t decomposed[] = { 0x1100, 0x1173 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* "Grüß Gott. Здравствуйте! 
x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */ + static const uint16_t input[] = + { 'G', 'r', 0x00FC, 0x00DF, ' ', 'G', 'o', 't', 't', '.', ' ', + 0x0417, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, + 0x0439, 0x0442, 0x0435, '!', ' ', + 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, + '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', + 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', 0xD55C, 0xAE00, '\n' + }; + static const uint16_t decomposed[] = + { 'G', 'r', 0x0075, 0x0308, 0x00DF, ' ', 'G', 'o', 't', 't', '.', ' ', + 0x0417, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, + 0x0438, 0x0306, 0x0442, 0x0435, '!', ' ', + 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, + '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', + 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', + 0x1112, 0x1161, 0x11AB, 0x1100, 0x1173, 0x11AF, '\n' + }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + +#if HAVE_DECL_ALARM + /* Declare failure if test takes too long, by using default abort + caused by SIGALRM. */ + signal (SIGALRM, SIG_DFL); + alarm (50); +#endif + + /* Check that the sorting is not O(n²) but O(n log n). */ + { + int pass; + for (pass = 0; pass < 3; pass++) + { + size_t repeat = 1; + size_t m = 100000; + uint16_t *input = (uint16_t *) malloc (2 * m * sizeof (uint16_t)); + if (input != NULL) + { + uint16_t *expected = input + m; + size_t m1 = m / 2; + size_t m2 = (m - 1) / 2; + /* NB: m1 + m2 == m - 1. 
*/ + uint16_t *p; + size_t i; + + input[0] = 0x0041; + p = input + 1; + switch (pass) + { + case 0: + for (i = 0; i < m1; i++) + *p++ = 0x0319; + for (i = 0; i < m2; i++) + *p++ = 0x0300; + break; + + case 1: + for (i = 0; i < m2; i++) + *p++ = 0x0300; + for (i = 0; i < m1; i++) + *p++ = 0x0319; + break; + + case 2: + for (i = 0; i < m2; i++) + { + *p++ = 0x0319; + *p++ = 0x0300; + } + for (; i < m1; i++) + *p++ = 0x0319; + break; + + default: + abort (); + } + + expected[0] = 0x00C0; + p = expected + 1; + for (i = 0; i < m1; i++) + *p++ = 0x0319; + for (i = 0; i < m2 - 1; i++) + *p++ = 0x0300; + + for (; repeat > 0; repeat--) + { + ASSERT (check (input, m, expected, m - 1) == 0); + ASSERT (check (expected, m - 1, expected, m - 1) == 0); + } + + free (input); + } + } + } +} + +#else + +void +test_u16_nfc (void) +{ +} + +#endif diff --git a/tests/uninorm/test-u32-nfc-big.c b/tests/uninorm/test-u32-nfc-big.c new file mode 100644 index 000000000..2a1b611fa --- /dev/null +++ b/tests/uninorm/test-u32-nfc-big.c @@ -0,0 +1,125 @@ +/* Test of Unicode compliance of canonical normalization of UTF-32 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Bruno Haible , 2009. 
*/ + +#include <config.h> + +#if GNULIB_UNINORM_U32_NORMALIZE + +#include "uninorm.h" + +#include <stdlib.h> + +#include "unistr.h" +#include "progname.h" +#include "test-u32-normalize-big.h" + +static int +check (const uint32_t *c1, size_t c1_length, + const uint32_t *c2, size_t c2_length, + const uint32_t *c3, size_t c3_length, + const uint32_t *c4, size_t c4_length, + const uint32_t *c5, size_t c5_length) +{ + /* Check + c2 == NFC(c1) == NFC(c2) == NFC(c3) + c4 == NFC(c4) == NFC(c5) + */ + { + size_t length; + uint32_t *result; + + result = u32_normalize (UNINORM_NFC, c1, c1_length, NULL, &length); + if (!(result != NULL + && length == c2_length + && u32_cmp (result, c2, c2_length) == 0)) + return 1; + free (result); + } + { + size_t length; + uint32_t *result; + + result = u32_normalize (UNINORM_NFC, c2, c2_length, NULL, &length); + if (!(result != NULL + && length == c2_length + && u32_cmp (result, c2, c2_length) == 0)) + return 2; + free (result); + } + { + size_t length; + uint32_t *result; + + result = u32_normalize (UNINORM_NFC, c3, c3_length, NULL, &length); + if (!(result != NULL + && length == c2_length + && u32_cmp (result, c2, c2_length) == 0)) + return 3; + free (result); + } + { + size_t length; + uint32_t *result; + + result = u32_normalize (UNINORM_NFC, c4, c4_length, NULL, &length); + if (!(result != NULL + && length == c4_length + && u32_cmp (result, c4, c4_length) == 0)) + return 4; + free (result); + } + { + size_t length; + uint32_t *result; + + result = u32_normalize (UNINORM_NFC, c5, c5_length, NULL, &length); + if (!(result != NULL + && length == c4_length + && u32_cmp (result, c4, c4_length) == 0)) + return 5; + free (result); + } + return 0; +} + +int +main (int argc, char *argv[]) +{ + struct normalization_test_file file; + + set_program_name (argv[0]); + read_normalization_test_file (argv[1], &file); + + test_specific (&file, check); + test_other (&file, UNINORM_NFC); + + return 0; +} + +#else + +#include <stdio.h> + +int +main () +{ + fprintf (stderr, "Skipping 
test: uninorm/u32-normalize module not included.\n"); + return 77; +} + +#endif diff --git a/tests/uninorm/test-u32-nfc-big.sh b/tests/uninorm/test-u32-nfc-big.sh new file mode 100755 index 000000000..93d0003da --- /dev/null +++ b/tests/uninorm/test-u32-nfc-big.sh @@ -0,0 +1,2 @@ +#!/bin/sh +exec ./test-u32-nfc-big${EXEEXT} "$srcdir/uninorm/NormalizationTest.txt" diff --git a/tests/uninorm/test-u32-nfc.c b/tests/uninorm/test-u32-nfc.c new file mode 100644 index 000000000..0c7cd790f --- /dev/null +++ b/tests/uninorm/test-u32-nfc.c @@ -0,0 +1,342 @@ +/* Test of canonical normalization of UTF-32 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2009. */ + +#include <config.h> + +#if GNULIB_UNINORM_U32_NORMALIZE + +#include "uninorm.h" + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "unistr.h" + +#define SIZEOF(array) (sizeof (array) / sizeof (array[0])) +#define ASSERT(expr) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \ + fflush (stderr); \ + abort (); \ + } \ + } \ + while (0) + +static int +check (const uint32_t *input, size_t input_length, + const uint32_t *expected, size_t expected_length) +{ + size_t length; + uint32_t *result; + + /* Test return conventions with resultbuf == NULL. 
*/ + result = u32_normalize (UNINORM_NFC, input, input_length, NULL, &length); + if (!(result != NULL)) + return 1; + if (!(length == expected_length)) + return 2; + if (!(u32_cmp (result, expected, expected_length) == 0)) + return 3; + free (result); + + /* Test return conventions with resultbuf too small. */ + if (expected_length > 0) + { + uint32_t *preallocated; + + length = expected_length - 1; + preallocated = (uint32_t *) malloc (length * sizeof (uint32_t)); + result = u32_normalize (UNINORM_NFC, input, input_length, preallocated, &length); + if (!(result != NULL)) + return 4; + if (!(result != preallocated)) + return 5; + if (!(length == expected_length)) + return 6; + if (!(u32_cmp (result, expected, expected_length) == 0)) + return 7; + free (result); + free (preallocated); + } + + /* Test return conventions with resultbuf large enough. */ + { + uint32_t *preallocated; + + length = expected_length; + preallocated = (uint32_t *) malloc (length * sizeof (uint32_t)); + result = u32_normalize (UNINORM_NFC, input, input_length, preallocated, &length); + if (!(result != NULL)) + return 8; + if (!(result == preallocated)) + return 9; + if (!(length == expected_length)) + return 10; + if (!(u32_cmp (result, expected, expected_length) == 0)) + return 11; + free (preallocated); + } + + return 0; +} + +void +test_u32_nfc (void) +{ + { /* SPACE */ + static const uint32_t input[] = { 0x0020 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* LATIN CAPITAL LETTER A WITH DIAERESIS */ + static const uint32_t input[] = { 0x00C4 }; + static const uint32_t decomposed[] = { 0x0041, 0x0308 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ + static const uint32_t input[] = { 0x01DE }; + static const uint32_t decomposed[] = { 0x0041, 0x0308, 0x0304 }; + ASSERT (check (input, 
SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* ANGSTROM SIGN */ + static const uint32_t input[] = { 0x212B }; + static const uint32_t decomposed[] = { 0x0041, 0x030A }; + static const uint32_t expected[] = { 0x00C5 }; + ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0); + ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0); + } + + { /* GREEK DIALYTIKA AND PERISPOMENI */ + static const uint32_t input[] = { 0x1FC1 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SCRIPT SMALL L */ + static const uint32_t input[] = { 0x2113 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* NO-BREAK SPACE */ + static const uint32_t input[] = { 0x00A0 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH INITIAL FORM */ + static const uint32_t input[] = { 0xFB6C }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH MEDIAL FORM */ + static const uint32_t input[] = { 0xFB6D }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH FINAL FORM */ + static const uint32_t input[] = { 0xFB6B }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LETTER VEH ISOLATED FORM */ + static const uint32_t input[] = { 0xFB6A }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* CIRCLED NUMBER FIFTEEN */ + static const uint32_t input[] = { 0x246E }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* TRADE MARK SIGN */ + static const uint32_t input[] = { 0x2122 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* LATIN SUBSCRIPT SMALL 
LETTER I */ + static const uint32_t input[] = { 0x1D62 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */ + static const uint32_t input[] = { 0xFE35 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* FULLWIDTH LATIN CAPITAL LETTER A */ + static const uint32_t input[] = { 0xFF21 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* HALFWIDTH IDEOGRAPHIC COMMA */ + static const uint32_t input[] = { 0xFF64 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SMALL IDEOGRAPHIC COMMA */ + static const uint32_t input[] = { 0xFE51 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* SQUARE MHZ */ + static const uint32_t input[] = { 0x3392 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* VULGAR FRACTION THREE EIGHTHS */ + static const uint32_t input[] = { 0x215C }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* MICRO SIGN */ + static const uint32_t input[] = { 0x00B5 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */ + static const uint32_t input[] = { 0xFDFA }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + } + + { /* HANGUL SYLLABLE GEUL */ + static const uint32_t input[] = { 0xAE00 }; + static const uint32_t decomposed[] = { 0x1100, 0x1173, 0x11AF }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + + { /* HANGUL SYLLABLE GEU */ + static const uint32_t input[] = { 0xADF8 }; + static const uint32_t decomposed[] = { 0x1100, 0x1173 }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 
0); + } + + { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */ + static const uint32_t input[] = + { 'G', 'r', 0x00FC, 0x00DF, ' ', 'G', 'o', 't', 't', '.', ' ', + 0x0417, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, + 0x0439, 0x0442, 0x0435, '!', ' ', + 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, + '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', + 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', 0xD55C, 0xAE00, '\n' + }; + static const uint32_t decomposed[] = + { 'G', 'r', 0x0075, 0x0308, 0x00DF, ' ', 'G', 'o', 't', 't', '.', ' ', + 0x0417, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, + 0x0438, 0x0306, 0x0442, 0x0435, '!', ' ', + 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, + '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', + 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', + 0x1112, 0x1161, 0x11AB, 0x1100, 0x1173, 0x11AF, '\n' + }; + ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); + ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); + } + +#if HAVE_DECL_ALARM + /* Declare failure if test takes too long, by using default abort + caused by SIGALRM. */ + signal (SIGALRM, SIG_DFL); + alarm (50); +#endif + + /* Check that the sorting is not O(n²) but O(n log n). */ + { + int pass; + for (pass = 0; pass < 3; pass++) + { + size_t repeat = 1; + size_t m = 100000; + uint32_t *input = (uint32_t *) malloc (2 * m * sizeof (uint32_t)); + if (input != NULL) + { + uint32_t *expected = input + m; + size_t m1 = m / 2; + size_t m2 = (m - 1) / 2; + /* NB: m1 + m2 == m - 1. 
*/ + uint32_t *p; + size_t i; + + input[0] = 0x0041; + p = input + 1; + switch (pass) + { + case 0: + for (i = 0; i < m1; i++) + *p++ = 0x0319; + for (i = 0; i < m2; i++) + *p++ = 0x0300; + break; + + case 1: + for (i = 0; i < m2; i++) + *p++ = 0x0300; + for (i = 0; i < m1; i++) + *p++ = 0x0319; + break; + + case 2: + for (i = 0; i < m2; i++) + { + *p++ = 0x0319; + *p++ = 0x0300; + } + for (; i < m1; i++) + *p++ = 0x0319; + break; + + default: + abort (); + } + + expected[0] = 0x00C0; + p = expected + 1; + for (i = 0; i < m1; i++) + *p++ = 0x0319; + for (i = 0; i < m2 - 1; i++) + *p++ = 0x0300; + + for (; repeat > 0; repeat--) + { + ASSERT (check (input, m, expected, m - 1) == 0); + ASSERT (check (expected, m - 1, expected, m - 1) == 0); + } + + free (input); + } + } + } +} + +#else + +void +test_u32_nfc (void) +{ +} + +#endif diff --git a/tests/uninorm/test-u8-nfc.c b/tests/uninorm/test-u8-nfc.c new file mode 100644 index 000000000..0e1915708 --- /dev/null +++ b/tests/uninorm/test-u8-nfc.c @@ -0,0 +1,372 @@ +/* Test of canonical normalization of UTF-8 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Bruno Haible , 2009. 
*/ + +#include <config.h> + +#if GNULIB_UNINORM_U8_NORMALIZE + +#include "uninorm.h" + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "unistr.h" + +#define SIZEOF(array) (sizeof (array) / sizeof (array[0])) +#define ASSERT(expr) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \ + fflush (stderr); \ + abort (); \ + } \ + } \ + while (0) + +static int +check (const uint8_t *input, size_t input_length, + const uint8_t *expected, size_t expected_length) +{ + size_t length; + uint8_t *result; + + /* Test return conventions with resultbuf == NULL. */ + result = u8_normalize (UNINORM_NFC, input, input_length, NULL, &length); + if (!(result != NULL)) + return 1; + if (!(length == expected_length)) + return 2; + if (!(u8_cmp (result, expected, expected_length) == 0)) + return 3; + free (result); + + /* Test return conventions with resultbuf too small. */ + if (expected_length > 0) + { + uint8_t *preallocated; + + length = expected_length - 1; + preallocated = (uint8_t *) malloc (length * sizeof (uint8_t)); + result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length); + if (!(result != NULL)) + return 4; + if (!(result != preallocated)) + return 5; + if (!(length == expected_length)) + return 6; + if (!(u8_cmp (result, expected, expected_length) == 0)) + return 7; + free (result); + free (preallocated); + } + + /* Test return conventions with resultbuf large enough. 
*/
+  {
+    uint8_t *preallocated;
+
+    length = expected_length;
+    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
+    result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
+    if (!(result != NULL))
+      return 8;
+    if (!(result == preallocated))
+      return 9;
+    if (!(length == expected_length))
+      return 10;
+    if (!(u8_cmp (result, expected, expected_length) == 0))
+      return 11;
+    free (preallocated);
+  }
+
+  return 0;
+}
+/* Run the NFC test cases for UTF-8 strings.  Each case hands check (),
+   defined earlier in this file, an input byte array with its length and
+   the byte array expected as the result with its length, and ASSERTs that
+   check () returns 0.  After the fixed cases, an optional alarm () guards
+   a final case that is built in a heap buffer sized from m = 100000.
+   Takes no arguments and returns nothing.  */
+void
+test_u8_nfc (void)
+{
+  { /* SPACE */
+    static const uint8_t input[] = { 0x20 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+  /* In the cases that define a `decomposed' array, both `input' and
+     `decomposed' must yield `input'.  */
+  { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
+    static const uint8_t input[] = { 0xC3, 0x84 };
+    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
+  }
+
+  { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
+    static const uint8_t input[] = { 0xC7, 0x9E };
+    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
+  }
+  /* Unlike the neighbouring cases, the expected result here is a separate
+     array: `input', `decomposed' and `expected' itself must all yield
+     `expected'.  */
+  { /* ANGSTROM SIGN */
+    static const uint8_t input[] = { 0xE2, 0x84, 0xAB };
+    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
+    static const uint8_t expected[] = { 0xC3, 0x85 };
+    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
+    ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
+  }
+  /* From here down to the ARABIC LIGATURE case, every case passes `input'
+     as its own expected result, i.e. the bytes must come back unchanged.  */
+  { /* GREEK DIALYTIKA AND PERISPOMENI */
+    static const uint8_t input[] = { 0xE1, 0xBF, 0x81 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* SCRIPT SMALL L */
+    static const uint8_t input[] = { 0xE2, 0x84, 0x93 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* NO-BREAK SPACE */
+    static const uint8_t input[] = { 0xC2, 0xA0 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* ARABIC LETTER VEH INITIAL FORM */
+    static const uint8_t input[] = { 0xEF, 0xAD, 0xAC };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* ARABIC LETTER VEH MEDIAL FORM */
+    static const uint8_t input[] = { 0xEF, 0xAD, 0xAD };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* ARABIC LETTER VEH FINAL FORM */
+    static const uint8_t input[] = { 0xEF, 0xAD, 0xAB };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* ARABIC LETTER VEH ISOLATED FORM */
+    static const uint8_t input[] = { 0xEF, 0xAD, 0xAA };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* CIRCLED NUMBER FIFTEEN */
+    static const uint8_t input[] = { 0xE2, 0x91, 0xAE };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* TRADE MARK SIGN */
+    static const uint8_t input[] = { 0xE2, 0x84, 0xA2 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* LATIN SUBSCRIPT SMALL LETTER I */
+    static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
+    static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* FULLWIDTH LATIN CAPITAL LETTER A */
+    static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* HALFWIDTH IDEOGRAPHIC COMMA */
+    static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* SMALL IDEOGRAPHIC COMMA */
+    static const uint8_t input[] = { 0xEF, 0xB9, 0x91 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* SQUARE MHZ */
+    static const uint8_t input[] = { 0xE3, 0x8E, 0x92 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* VULGAR FRACTION THREE EIGHTHS */
+    static const uint8_t input[] = { 0xE2, 0x85, 0x9C };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* MICRO SIGN */
+    static const uint8_t input[] = { 0xC2, 0xB5 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+
+  { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
+    static const uint8_t input[] = { 0xEF, 0xB7, 0xBA };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+  }
+  /* In the two Hangul cases the `decomposed' array spells the syllable as
+     three, respectively two, 3-byte sequences; both spellings must yield
+     the single 3-byte `input'.  */
+  { /* HANGUL SYLLABLE GEUL */
+    static const uint8_t input[] = { 0xEA, 0xB8, 0x80 };
+    static const uint8_t decomposed[] =
+      { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
+  }
+
+  { /* HANGUL SYLLABLE GEU */
+    static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 };
+    static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
+  }
+  /* Mixed-text case.  `decomposed' differs from `input' in four places:
+     75 CC 88 replaces C3 BC, D0 B8 CC 86 replaces D0 B9, and each of the
+     two final 3-byte sequences (ED 95 9C and EA B8 80) is written as three
+     3-byte sequences.  Both arrays must yield `input'.  */
+  { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
+    static const uint8_t input[] =
+      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
+        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
+        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
+        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
+        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
+        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
+        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
+        0xED, 0x95, 0x9C,
+        0xEA, 0xB8, 0x80, '\n'
+      };
+    static const uint8_t decomposed[] =
+      { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
+        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
+        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
+        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
+        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
+        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
+        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
+        0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
+        0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
+      };
+    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
+    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
+  }
+
+#if HAVE_DECL_ALARM
+  /* Declare failure if test takes too long, by using default abort
+     caused by SIGALRM.  */
+  signal (SIGALRM, SIG_DFL);
+  alarm (50); /* the argument of alarm () is in seconds */
+#endif
+  /* Each of the three passes below writes 0x41 followed by m1 copies of
+     the pair CC 99 and m2 copies of the pair CC 80, in a different order
+     per pass; all three passes are compared against the same expected
+     bytes.  */
+  /* Check that the sorting is not O(n²) but O(n log n).  */
+  {
+    int pass;
+    for (pass = 0; pass < 3; pass++)
+      {
+        size_t repeat = 1; /* so the checking loop at the end runs once */
+        size_t m = 100000;
+        uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
+        if (input != NULL) /* a failed allocation skips the pass silently */
+          {
+            uint8_t *expected = input + (2 * m - 1); /* second half of the same allocation */
+            size_t m1 = m / 2;
+            size_t m2 = (m - 1) / 2;
+            /* NB: m1 + m2 == m - 1.  */
+            uint8_t *p;
+            size_t i;
+
+            input[0] = 0x41;
+            p = input + 1;
+            switch (pass)
+              {
+              case 0: /* all CC 99 pairs first, then all CC 80 pairs */
+                for (i = 0; i < m1; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x99;
+                  }
+                for (i = 0; i < m2; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x80;
+                  }
+                break;
+
+              case 1: /* all CC 80 pairs first, then all CC 99 pairs */
+                for (i = 0; i < m2; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x80;
+                  }
+                for (i = 0; i < m1; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x99;
+                  }
+                break;
+
+              case 2: /* m2 alternating CC 99, CC 80 pairs; then CC 99 until i reaches m1 */
+                for (i = 0; i < m2; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x99;
+                    *p++ = 0xCC;
+                    *p++ = 0x80;
+                  }
+                for (; i < m1; i++)
+                  {
+                    *p++ = 0xCC;
+                    *p++ = 0x99;
+                  }
+                break;
+
+              default:
+                abort ();
+              }
+            /* Expected bytes: C3 80, then m1 copies of CC 99, then m2 - 1
+               copies of CC 80, which is 2 * m - 2 bytes in total.  */
+            expected[0] = 0xC3;
+            expected[1] = 0x80;
+            p = expected + 2;
+            for (i = 0; i < m1; i++)
+              {
+                *p++ = 0xCC;
+                *p++ = 0x99;
+              }
+            for (i = 0; i < m2 - 1; i++)
+              {
+                *p++ = 0xCC;
+                *p++ = 0x80;
+              }
+            /* The second ASSERT checks that `expected' yields itself.  */
+            for (; repeat > 0; repeat--)
+              {
+                ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
+                ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);
+              }
+
+            free (input); /* also releases `expected', which points into it */
+          }
+      }
+  }
+}
+
+#else
+/* Empty definition of test_u8_nfc, used when the preprocessor condition
+   guarding the tests above is false.  */
+void
+test_u8_nfc (void)
+{
+}
+
+#endif