tests/unicase/test-u8-casecmp.c

   1 /* Test of case and normalization insensitive comparison of UTF-8 strings.
   2    Copyright (C) 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Bruno Haible <bruno@clisp.org>, 2009.  */
  18
  19 #include <config.h>
  20
  21 #include "unicase.h"
  22
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25
  26 #include "uninorm.h"
  27
  28 #define SIZEOF(array) (sizeof (array) / sizeof (array[0]))
  29 #define ASSERT(expr) \
  30   do                                                                         \
  31     {                                                                        \
  32       if (!(expr))                                                           \
  33         {                                                                    \
  34           fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
  35           fflush (stderr);                                                   \
  36           abort ();                                                          \
  37         }                                                                    \
  38     }                                                                        \
  39   while (0)
  40
  41 #define UNIT uint8_t
  42 #include "test-casecmp.h"
  43 #undef UNIT
  44
  45 static void
  46 test_nonascii (int (*my_casecmp) (const uint8_t *, size_t, const uint8_t *, size_t, const char *, uninorm_t, int *))
  47 {
  48   /* Normalization effects.  */
  49   {
  50     static const uint8_t input1[] = { 'H', 0xC3, 0xB6, 'h', 'l', 'e' };
  51     static const uint8_t input2[] = { 'H', 'O', 0xCC, 0x88, 'h', 'L', 'e' };
  52     static const uint8_t input3[] = { 'H', 0xC3, 0xB6, 'h', 'l', 'e', 'n' };
  53     static const uint8_t input4[] = { 'H', 'O', 0xCC, 0x88, 'h', 'L', 'e', 'n' };
  54     static const uint8_t input5[] = { 'H', 'u', 'r', 'z' };
  55     int cmp;
  56
  57     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
  58     ASSERT (cmp == 0);
  59
  60     ASSERT (my_casecmp (input2, SIZEOF (input2), input1, SIZEOF (input1), NULL, UNINORM_NFD, &cmp) == 0);
  61     ASSERT (cmp == 0);
  62
  63     ASSERT (my_casecmp (input3, SIZEOF (input3), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
  64     ASSERT (cmp == 0);
  65
  66     ASSERT (my_casecmp (input4, SIZEOF (input4), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
  67     ASSERT (cmp == 0);
  68
  69     ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
  70     ASSERT (cmp == -1);
  71
  72     ASSERT (my_casecmp (input1, SIZEOF (input1), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
  73     ASSERT (cmp == -1);
  74
  75     ASSERT (my_casecmp (input1, SIZEOF (input1), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
  76     ASSERT (cmp == -1);
  77
  78     ASSERT (my_casecmp (input2, SIZEOF (input2), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
  79     ASSERT (cmp == -1);
  80   }
  81   { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
  82     static const uint8_t input1[] = { 0xC3, 0x84 };
  83     static const uint8_t input2[] = { 0x41, 0xCC, 0x88 };
  84     int cmp;
  85
  86     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
  87     ASSERT (cmp == 0);
  88   }
  89   { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
  90     static const uint8_t input1[] = { 0xC7, 0x9E };
  91     static const uint8_t input2[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
  92     int cmp;
  93
  94     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
  95     ASSERT (cmp == 0);
  96   }
  97   { /* GREEK DIALYTIKA AND PERISPOMENI */
  98     static const uint8_t input1[] = { 0xE1, 0xBF, 0x81 };
  99     static const uint8_t input2[] = { 0xC2, 0xA8, 0xCD, 0x82 };
 100     int cmp;
 101
 102     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 103     ASSERT (cmp == 0);
 104   }
 105   { /* HANGUL SYLLABLE GEUL */
 106     static const uint8_t input1[] = { 0xEA, 0xB8, 0x80 };
 107     static const uint8_t input2[] = { 0xEA, 0xB7, 0xB8, 0xE1, 0x86, 0xAF };
 108     static const uint8_t input3[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
 109     int cmp;
 110
 111     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 112     ASSERT (cmp == 0);
 113
 114     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
 115     ASSERT (cmp == 0);
 116   }
 117   { /* HANGUL SYLLABLE GEU */
 118     static const uint8_t input1[] = { 0xEA, 0xB7, 0xB8 };
 119     static const uint8_t input2[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
 120     int cmp;
 121
 122     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 123     ASSERT (cmp == 0);
 124   }
 125
 126   /* Simple string.  */
 127   { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
 128     static const uint8_t input1[] =
 129       { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', ' ',
 130         0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 0x81,
 131         0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5,
 132         '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 's', 'q', 'r', 't', '(',
 133         'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')',
 134         ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
 135         0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
 136         0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
 137       };
 138     static const uint8_t input2[] =
 139       { 'g', 'r', 0xC3, 0xBC, 0x73, 0x73, ' ', 'g', 'o', 't', 't', '.', ' ',
 140         0xD0, 0xB7, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 0x81,
 141         0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5,
 142         '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 's', 'q', 'r', 't', '(',
 143         'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')',
 144         ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
 145         0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
 146         0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
 147       };
 148     static const uint8_t input3[] =
 149       { 'G', 'R', 0xC3, 0x9C, 0x53, 0x53, ' ', 'G', 'O', 'T', 'T', '.', ' ',
 150         0xD0, 0x97, 0xD0, 0x94, 0xD0, 0xA0, 0xD0, 0x90, 0xD0, 0x92, 0xD0, 0xA1,
 151         0xD0, 0xA2, 0xD0, 0x92, 0xD0, 0xA3, 0xD0, 0x99, 0xD0, 0xA2, 0xD0, 0x95,
 152         '!', ' ', 'X', '=', '(', '-', 'B', 0xC2, 0xB1, 'S', 'Q', 'R', 'T', '(',
 153         'B', 0xC2, 0xB2, '-', '4', 'A', 'C', ')', ')', '/', '(', '2', 'A', ')',
 154         ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
 155         0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
 156         0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
 157       };
 158     int cmp;
 159
 160     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
 161     ASSERT (cmp == 0);
 162
 163     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 164     ASSERT (cmp == 0);
 165
 166     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
 167     ASSERT (cmp == 0);
 168
 169     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
 170     ASSERT (cmp == 0);
 171
 172     ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
 173     ASSERT (cmp == 0);
 174
 175     ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
 176     ASSERT (cmp == 0);
 177   }
 178
 179   /* Case mapping can increase the number of Unicode characters.  */
 180   { /* LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */
 181     static const uint8_t input1[] = { 0xC5, 0x89 };
 182     static const uint8_t input2[] = { 0xCA, 0xBC, 0x6E };
 183     static const uint8_t input3[] = { 0xCA, 0xBC, 0x4E };
 184     int cmp;
 185
 186     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
 187     ASSERT (cmp == 0);
 188
 189     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 190     ASSERT (cmp == 0);
 191
 192     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
 193     ASSERT (cmp == 0);
 194
 195     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
 196     ASSERT (cmp == 0);
 197   }
 198   { /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS */
 199     static const uint8_t input1[] = { 0xCE, 0x90 };
 200     static const uint8_t input2[] = { 0xCE, 0xB9, 0xCC, 0x88, 0xCC, 0x81 };
 201     int cmp;
 202
 203     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
 204     ASSERT (cmp == 0);
 205
 206     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
 207     ASSERT (cmp == 0);
 208   }
 209
 210   /* Turkish letters i İ ı I */
 211   { /* LATIN CAPITAL LETTER I */
 212     static const uint8_t input[]         = { 0x49 };
 213     static const uint8_t casefolded[]    = { 0x69 };
 214     static const uint8_t casefolded_tr[] = { 0xC4, 0xB1 };
 215     int cmp;
 216
 217     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 218     ASSERT (cmp == 0);
 219
 220     ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
 221     ASSERT (cmp == 0);
 222   }
 223   { /* LATIN SMALL LETTER I */
 224     static const uint8_t input[]         = { 0x69 };
 225     static const uint8_t casefolded[]    = { 0x49 };
 226     static const uint8_t casefolded_tr[] = { 0xC4, 0xB0 };
 227     int cmp;
 228
 229     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 230     ASSERT (cmp == 0);
 231
 232     ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
 233     ASSERT (cmp == 0);
 234   }
 235   { /* LATIN CAPITAL LETTER I WITH DOT ABOVE */
 236     static const uint8_t input[]         = { 0xC4, 0xB0 };
 237     static const uint8_t casefolded[]    = { 0x69, 0xCC, 0x87 };
 238     static const uint8_t casefolded_tr[] = { 0x69 };
 239     int cmp;
 240
 241     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 242     ASSERT (cmp == 0);
 243
 244     ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
 245     ASSERT (cmp == 0);
 246   }
 247   { /* LATIN SMALL LETTER DOTLESS I */
 248     static const uint8_t input[]      = { 0xC4, 0xB1 };
 249     static const uint8_t casefolded[] = { 0x49 };
 250     int cmp;
 251
 252     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 253     ASSERT (cmp == 1);
 254
 255     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0);
 256     ASSERT (cmp == 0);
 257   }
 258   { /* "topkapı" */
 259     static const uint8_t input[] =
 260       { 0x54, 0x4F, 0x50, 0x4B, 0x41, 0x50, 0x49 };
 261     static const uint8_t casefolded[] =
 262       { 0x74, 0x6F, 0x70, 0x6B, 0x61, 0x70, 0xC4, 0xB1 };
 263     int cmp;
 264
 265     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 266     ASSERT (cmp == -1);
 267
 268     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0);
 269     ASSERT (cmp == 0);
 270   }
 271
 272   /* Uppercasing can increase the number of Unicode characters.  */
 273   { /* "heiß" */
 274     static const uint8_t input1[] = { 0x68, 0x65, 0x69, 0xC3, 0x9F };
 275     static const uint8_t input2[] = { 0x68, 0x65, 0x69, 0x73, 0x73 };
 276     int cmp;
 277
 278     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
 279     ASSERT (cmp == 0);
 280   }
 281
 282   /* Case mappings for some characters can depend on the surrounding characters.  */
 283   { /* "περισσότερες πληροφορίες" */
 284     static const uint8_t input1[] =
 285       {
 286         0xCF, 0x80, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB9, 0xCF, 0x83, 0xCF, 0x83,
 287         0xCF, 0x8C, 0xCF, 0x84, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB5, 0xCF, 0x82,
 288         ' ', 0xCF, 0x80, 0xCE, 0xBB, 0xCE, 0xB7, 0xCF, 0x81, 0xCE, 0xBF,
 289         0xCF, 0x86, 0xCE, 0xBF, 0xCF, 0x81, 0xCE, 0xAF, 0xCE, 0xB5, 0xCF, 0x82
 290       };
 291     static const uint8_t input2[] =
 292       {
 293         0xCF, 0x80, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB9, 0xCF, 0x83, 0xCF, 0x83,
 294         0xCF, 0x8C, 0xCF, 0x84, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB5, 0xCF, 0x83,
 295         ' ', 0xCF, 0x80, 0xCE, 0xBB, 0xCE, 0xB7, 0xCF, 0x81, 0xCE, 0xBF,
 296         0xCF, 0x86, 0xCE, 0xBF, 0xCF, 0x81, 0xCE, 0xAF, 0xCE, 0xB5, 0xCF, 0x83
 297       };
 298     static const uint8_t input3[] =
 299       {
 300         0xCE, 0xA0, 0xCE, 0x95, 0xCE, 0xA1, 0xCE, 0x99, 0xCE, 0xA3, 0xCE, 0xA3,
 301         0xCE, 0x8C, 0xCE, 0xA4, 0xCE, 0x95, 0xCE, 0xA1, 0xCE, 0x95, 0xCE, 0xA3,
 302         ' ', 0xCE, 0xA0, 0xCE, 0x9B, 0xCE, 0x97, 0xCE, 0xA1, 0xCE, 0x9F,
 303         0xCE, 0xA6, 0xCE, 0x9F, 0xCE, 0xA1, 0xCE, 0x8A, 0xCE, 0x95, 0xCE, 0xA3
 304       };
 305     int cmp;
 306
 307     ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
 308     ASSERT (cmp == 0);
 309
 310     ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
 311     ASSERT (cmp == 0);
 312
 313     ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
 314     ASSERT (cmp == 0);
 315   }
 316
 317   /* Case mapping can require subsequent normalization.  */
 318   { /* LATIN SMALL LETTER J WITH CARON, COMBINING DOT BELOW */
 319     static const uint8_t input[]                 = { 0xC7, 0xB0, 0xCC, 0xA3 };
 320     static const uint8_t casefolded[]            = { 0x6A, 0xCC, 0x8C, 0xCC, 0xA3 };
 321     static const uint8_t casefolded_decomposed[] = { 0x6A, 0xCC, 0xA3, 0xCC, 0x8C };
 322     int cmp;
 323
 324     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
 325     ASSERT (cmp == 0);
 326
 327     ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, NULL, &cmp) == 0);
 328     ASSERT (cmp != 0);
 329
 330     ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, UNINORM_NFD, &cmp) == 0);
 331     ASSERT (cmp == 0);
 332
 333     ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, UNINORM_NFD, &cmp) == 0);
 334     ASSERT (cmp == 0);
 335   }
 336 }
 337
 338 int
 339 main ()
 340 {
 341   test_ascii (u8_casecmp, UNINORM_NFD);
 342   test_nonascii (u8_casecmp);
 343
 344   return 0;
 345 }