tests/uniname/test-uninames.c

   1 /* Test the Unicode character name functions.
   2    Copyright (C) 2000-2003, 2005, 2007, 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22
  23 #include "xalloc.h"
  24 #include "uniname.h"
  25 #include "progname.h"
  26
  27 /* The names according to the UnicodeData.txt file, modified to contain the
  28    Hangul syllable names, as described in the Unicode 3.0 book.  */
  29 const char * unicode_names [0x110000];
  30
  31 /* Maximum length of a field in the UnicodeData.txt file.  */
  32 #define FIELDLEN 120
  33
  34 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  35    Reads up to (but excluding) DELIM.
  36    Returns 1 when a field was successfully read, otherwise 0.  */
  37 static int
  38 getfield (FILE *stream, char *buffer, int delim)
  39 {
  40   int count = 0;
  41   int c;
  42
  43   for (; (c = getc (stream)), (c != EOF && c != delim); )
  44     {
  45       /* Put c into the buffer.  */
  46       if (++count >= FIELDLEN - 1)
  47         {
  48           fprintf (stderr, "field too long\n");
  49           exit (EXIT_FAILURE);
  50         }
  51       *buffer++ = c;
  52     }
  53
  54   if (c == EOF)
  55     return 0;
  56
  57   *buffer = '\0';
  58   return 1;
  59 }
  60
  61 /* Stores in unicode_names[] the relevant contents of the UnicodeData.txt
  62    file.  */
  63 static void
  64 fill_names (const char *unicodedata_filename)
  65 {
  66   unsigned int i;
  67   FILE *stream;
  68   char field0[FIELDLEN];
  69   char field1[FIELDLEN];
  70   int lineno = 0;
  71
  72   for (i = 0; i < 0x110000; i++)
  73     unicode_names[i] = NULL;
  74
  75   stream = fopen (unicodedata_filename, "r");
  76   if (stream == NULL)
  77     {
  78       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
  79       exit (EXIT_FAILURE);
  80     }
  81
  82   for (;;)
  83     {
  84       int n;
  85       int c;
  86
  87       lineno++;
  88       n = getfield (stream, field0, ';');
  89       n += getfield (stream, field1, ';');
  90       if (n == 0)
  91         break;
  92       if (n != 2)
  93         {
  94           fprintf (stderr, "short line in '%s':%d\n",
  95                    unicodedata_filename, lineno);
  96           exit (EXIT_FAILURE);
  97         }
  98       for (; (c = getc (stream)), (c != EOF && c != '\n'); )
  99         ;
 100       i = strtoul (field0, NULL, 16);
 101       if (i >= 0x110000)
 102         {
 103           fprintf (stderr, "index too large\n");
 104           exit (EXIT_FAILURE);
 105         }
 106       unicode_names[i] = xstrdup (field1);
 107     }
 108   if (ferror (stream) || fclose (stream))
 109     {
 110       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 111       exit (1);
 112     }
 113 }
 114
 115 /* Perform an exhaustive test of the unicode_character_name function.  */
 116 static int
 117 test_name_lookup ()
 118 {
 119   int error = 0;
 120   unsigned int i;
 121   char buf[UNINAME_MAX];
 122
 123   for (i = 0; i < 0x11000; i++)
 124     {
 125       char *result = unicode_character_name (i, buf);
 126
 127       if (unicode_names[i] != NULL)
 128         {
 129           if (result == NULL)
 130             {
 131               fprintf (stderr, "\\u%04X name lookup failed!\n", i);
 132               error = 1;
 133             }
 134           else if (strcmp (result, unicode_names[i]) != 0)
 135             {
 136               fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
 137                                i, result);
 138               error = 1;
 139             }
 140         }
 141       else
 142         {
 143           if (result != NULL)
 144             {
 145               fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
 146                                i, result);
 147               error = 1;
 148             }
 149         }
 150     }
 151
 152   for (i = 0x110000; i < 0x1000000; i++)
 153     {
 154       char *result = unicode_character_name (i, buf);
 155
 156       if (result != NULL)
 157         {
 158           fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
 159                            i, result);
 160           error = 1;
 161         }
 162     }
 163
 164   return error;
 165 }
 166
 167 /* Perform a test of the unicode_name_character function.  */
 168 static int
 169 test_inverse_lookup ()
 170 {
 171   int error = 0;
 172   unsigned int i;
 173
 174   /* First, verify all valid character names are recognized.  */
 175   for (i = 0; i < 0x110000; i++)
 176     if (unicode_names[i] != NULL)
 177       {
 178         unsigned int result = unicode_name_character (unicode_names[i]);
 179         if (result != i)
 180           {
 181             if (result == UNINAME_INVALID)
 182               fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
 183                        unicode_names[i]);
 184             else
 185               fprintf (stderr,
 186                        "inverse name lookup of \"%s\" returned 0x%04X\n",
 187                        unicode_names[i], result);
 188             error = 1;
 189           }
 190       }
 191
 192   /* Second, generate random but likely names and verify they are not
 193      recognized unless really valid.  */
 194   for (i = 0; i < 10000; i++)
 195     {
 196       unsigned int i1, i2;
 197       const char *s1;
 198       const char *s2;
 199       unsigned int l1, l2, j1, j2;
 200       char buf[2*UNINAME_MAX];
 201       unsigned int result;
 202
 203       do i1 = ((rand () % 0x11) << 16)
 204               + ((rand () & 0xff) << 8)
 205               + (rand () & 0xff);
 206       while (unicode_names[i1] == NULL);
 207
 208       do i2 = ((rand () % 0x11) << 16)
 209               + ((rand () & 0xff) << 8)
 210               + (rand () & 0xff);
 211       while (unicode_names[i2] == NULL);
 212
 213       s1 = unicode_names[i1];
 214       l1 = strlen (s1);
 215       s2 = unicode_names[i2];
 216       l2 = strlen (s2);
 217
 218       /* Concatenate a starting piece of s1 with an ending piece of s2.  */
 219       for (j1 = 1; j1 <= l1; j1++)
 220         if (j1 == l1 || s1[j1] == ' ')
 221           for (j2 = 0; j2 < l2; j2++)
 222             if (j2 == 0 || s2[j2-1] == ' ')
 223               {
 224                 memcpy (buf, s1, j1);
 225                 buf[j1] = ' ';
 226                 memcpy (buf + j1 + 1, s2 + j2, l2 - j2 + 1);
 227
 228                 result = unicode_name_character (buf);
 229                 if (result != UNINAME_INVALID
 230                     && !(unicode_names[result] != NULL
 231                          && strcmp (unicode_names[result], buf) == 0))
 232                   {
 233                     fprintf (stderr,
 234                              "inverse name lookup of \"%s\" returned 0x%04X\n",
 235                              unicode_names[i], result);
 236                     error = 1;
 237                   }
 238               }
 239     }
 240
 241   /* Third, some extreme case that used to loop.  */
 242   if (unicode_name_character ("A A") != UNINAME_INVALID)
 243     error = 1;
 244
 245   return error;
 246 }
 247
 248 int
 249 main (int argc, char *argv[])
 250 {
 251   int error = 0;
 252
 253   set_program_name (argv[0]);
 254
 255   fill_names (argv[1]);
 256
 257   error |= test_name_lookup ();
 258   error |= test_inverse_lookup ();
 259
 260   return error;
 261 }