lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU Lesser General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 /* Specification.  */
  20 #include "uniname.h"
  21
  22 #include <assert.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  29
  30
  31 /* Table of Unicode character names, derived from UnicodeData.txt.
  32    This table is generated in a way to minimize the memory footprint:
  33      1. its compiled size is small (less than 350 KB),
  34      2. it resides entirely in the text or read-only data segment of the
  35         executable or shared library: the table contains only immediate
  36         integers, no pointers, and the functions don't do heap allocation.
  37  */
  38 #include "uninames.h"
  39 /* It contains:
  40   static const char unicode_name_words[36303] = ...;
  41   #define UNICODE_CHARNAME_NUM_WORDS 6260
  42   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  43   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  44   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  45   #define UNICODE_CHARNAME_WORD_CJK 417
  46   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  47   static const uint16_t unicode_names[68940] = ...;
  48   static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
  49   static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
  50   #define UNICODE_CHARNAME_MAX_LENGTH 83
  51   #define UNICODE_CHARNAME_MAX_WORDS 13
  52 */
  53
  54 /* Returns the word with a given index.  */
  55 static const char *
  56 unicode_name_word (unsigned int index, unsigned int *lengthp)
  57 {
  58   unsigned int i1;
  59   unsigned int i2;
  60   unsigned int i;
  61
  62   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  63
  64   /* Binary search for i with
  65        unicode_name_by_length[i].ind_offset <= index
  66      and
  67        index < unicode_name_by_length[i+1].ind_offset
  68    */
  69
  70   i1 = 0;
  71   i2 = SIZEOF (unicode_name_by_length) - 1;
  72   while (i2 - i1 > 1)
  73     {
  74       unsigned int i = (i1 + i2) >> 1;
  75       if (unicode_name_by_length[i].ind_offset <= index)
  76         i1 = i;
  77       else
  78         i2 = i;
  79     }
  80   i = i1;
  81   assert (unicode_name_by_length[i].ind_offset <= index
  82           && index < unicode_name_by_length[i+1].ind_offset);
  83   *lengthp = i;
  84   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  85                              + (index-unicode_name_by_length[i].ind_offset)*i];
  86 }
  87
  88 /* Looks up the index of a word.  */
  89 static int
  90 unicode_name_word_lookup (const char *word, unsigned int length)
  91 {
  92   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  93     {
  94       /* Binary search among the words of given length.  */
  95       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  96       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  97       unsigned int i1 = i0;
  98       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
  99       while (i2 - i1 > 0)
 100         {
 101           unsigned int i = (i1 + i2) >> 1;
 102           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 103           const char *w = word;
 104           unsigned int n = length;
 105           for (;;)
 106             {
 107               if (*p < *w)
 108                 {
 109                   if (i1 == i)
 110                     return -1;
 111                   /* Note here: i1 < i < i2.  */
 112                   i1 = i;
 113                   break;
 114                 }
 115               if (*p > *w)
 116                 {
 117                   /* Note here: i1 <= i < i2.  */
 118                   i2 = i;
 119                   break;
 120                 }
 121               p++; w++; n--;
 122               if (n == 0)
 123                 return i;
 124             }
 125         }
 126     }
 127   return -1;
 128 }
 129
 130 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 131    sections 3.11 and 4.4.  */
 132 static const char jamo_initial_short_name[19][3] =
 133 {
 134   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 135   "C", "K", "T", "P", "H"
 136 };
 137 static const char jamo_medial_short_name[21][4] =
 138 {
 139   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 140   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 141 };
 142 static const char jamo_final_short_name[28][3] =
 143 {
 144   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 145   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 146 };
 147
 148 /* Looks up the name of a Unicode character, in uppercase ASCII.
 149    Returns the filled buf, or NULL if the character does not have a name.  */
 150 char *
 151 unicode_character_name (ucs4_t c, char *buf)
 152 {
 153   if (c >= 0xAC00 && c <= 0xD7A3)
 154     {
 155       /* Special case for Hangul syllables. Keeps the tables small.  */
 156       char *ptr;
 157       unsigned int tmp;
 158       unsigned int index1;
 159       unsigned int index2;
 160       unsigned int index3;
 161       const char *q;
 162
 163       /* buf needs to have at least 16 + 7 bytes here.  */
 164       memcpy (buf, "HANGUL SYLLABLE ", 16);
 165       ptr = buf + 16;
 166
 167       tmp = c - 0xAC00;
 168       index3 = tmp % 28; tmp = tmp / 28;
 169       index2 = tmp % 21; tmp = tmp / 21;
 170       index1 = tmp;
 171
 172       q = jamo_initial_short_name[index1];
 173       while (*q != '\0')
 174         *ptr++ = *q++;
 175       q = jamo_medial_short_name[index2];
 176       while (*q != '\0')
 177         *ptr++ = *q++;
 178       q = jamo_final_short_name[index3];
 179       while (*q != '\0')
 180         *ptr++ = *q++;
 181       *ptr = '\0';
 182       return buf;
 183     }
 184   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 185            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 186     {
 187       /* Special case for CJK compatibility ideographs. Keeps the tables
 188          small.  */
 189       char *ptr;
 190       int i;
 191
 192       /* buf needs to have at least 28 + 5 bytes here.  */
 193       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 194       ptr = buf + 28;
 195
 196       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 197         {
 198           unsigned int x = (c >> i) & 0xf;
 199           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 200         }
 201       *ptr = '\0';
 202       return buf;
 203     }
 204   else
 205     {
 206       const uint16_t *words;
 207
 208       /* Transform the code so that it fits in 16 bits.  */
 209       switch (c >> 12)
 210         {
 211         case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
 212           break;
 213         case 0x0A:
 214           c -= 0x05000;
 215           break;
 216         case 0x0F:
 217           c -= 0x09000;
 218           break;
 219         case 0x10:
 220           c -= 0x09000;
 221           break;
 222         case 0x12:
 223           c -= 0x0A000;
 224           break;
 225         case 0x1D:
 226           c -= 0x14000;
 227           break;
 228         case 0x2F:
 229           c -= 0x25000;
 230           break;
 231         case 0xE0:
 232           c -= 0xD5000;
 233           break;
 234         default:
 235           return NULL;
 236         }
 237
 238       {
 239         /* Binary search in unicode_code_to_name.  */
 240         unsigned int i1 = 0;
 241         unsigned int i2 = SIZEOF (unicode_code_to_name);
 242         for (;;)
 243           {
 244             unsigned int i = (i1 + i2) >> 1;
 245             if (unicode_code_to_name[i].code == c)
 246               {
 247                 words = &unicode_names[unicode_code_to_name[i].name];
 248                 break;
 249               }
 250             else if (unicode_code_to_name[i].code < c)
 251               {
 252                 if (i1 == i)
 253                   {
 254                     words = NULL;
 255                     break;
 256                   }
 257                 /* Note here: i1 < i < i2.  */
 258                 i1 = i;
 259               }
 260             else if (unicode_code_to_name[i].code > c)
 261               {
 262                 if (i2 == i)
 263                   {
 264                     words = NULL;
 265                     break;
 266                   }
 267                 /* Note here: i1 <= i < i2.  */
 268                 i2 = i;
 269               }
 270           }
 271       }
 272       if (words != NULL)
 273         {
 274           /* Found it in unicode_code_to_name. Now concatenate the words.  */
 275           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 276           char *ptr = buf;
 277           for (;;)
 278             {
 279               unsigned int wordlen;
 280               const char *word = unicode_name_word (*words>>1, &wordlen);
 281               do
 282                 *ptr++ = *word++;
 283               while (--wordlen > 0);
 284               if ((*words & 1) == 0)
 285                 break;
 286               *ptr++ = ' ';
 287               words++;
 288             }
 289           *ptr = '\0';
 290           return buf;
 291         }
 292       return NULL;
 293     }
 294 }
 295
 296 /* Looks up the Unicode character with a given name, in upper- or lowercase
 297    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 298 ucs4_t
 299 unicode_name_character (const char *name)
 300 {
 301   unsigned int len = strlen (name);
 302   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 303     {
 304       /* Test for "word1 word2 ..." syntax.  */
 305       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 306       char *ptr = buf;
 307       for (;;)
 308         {
 309           char c = *name++;
 310           if (!(c >= ' ' && c <= '~'))
 311             break;
 312           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 313           if (--len == 0)
 314             goto filled_buf;
 315         }
 316       if (false)
 317       filled_buf:
 318         {
 319           /* Convert the constituents to uint16_t words.  */
 320           uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 321           uint16_t *wordptr = words;
 322           {
 323             const char *p1 = buf;
 324             for (;;)
 325               {
 326                 {
 327                   int word;
 328                   const char *p2 = p1;
 329                   while (p2 < ptr && *p2 != ' ')
 330                     p2++;
 331                   word = unicode_name_word_lookup (p1, p2 - p1);
 332                   if (word < 0)
 333                     break;
 334                   if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 335                     break;
 336                   *wordptr++ = word;
 337                   if (p2 == ptr)
 338                     goto filled_words;
 339                   p1 = p2 + 1;
 340                 }
 341                 /* Special case for Hangul syllables. Keeps the tables small. */
 342                 if (wordptr == &words[2]
 343                     && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 344                     && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 345                   {
 346                     /* Split the last word [p1..ptr) into three parts:
 347                          1) [BCDGHJKMNPRST]
 348                          2) [AEIOUWY]
 349                          3) [BCDGHIJKLMNPST]
 350                      */
 351                     const char *p2;
 352                     const char *p3;
 353                     const char *p4;
 354
 355                     p2 = p1;
 356                     while (p2 < ptr
 357                            && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 358                                || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 359                                || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 360                                || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 361                                || *p2 == 'T'))
 362                       p2++;
 363                     p3 = p2;
 364                     while (p3 < ptr
 365                            && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 366                                || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 367                                || *p3 == 'Y'))
 368                       p3++;
 369                     p4 = p3;
 370                     while (p4 < ptr
 371                            && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 372                                || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 373                                || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 374                                || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 375                                || *p4 == 'S' || *p4 == 'T'))
 376                       p4++;
 377                     if (p4 == ptr)
 378                       {
 379                         unsigned int n1 = p2 - p1;
 380                         unsigned int n2 = p3 - p2;
 381                         unsigned int n3 = p4 - p3;
 382
 383                         if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 384                           {
 385                             unsigned int index1;
 386
 387                             for (index1 = 0; index1 < 19; index1++)
 388                               if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
 389                                   && jamo_initial_short_name[index1][n1] == '\0')
 390                                 {
 391                                   unsigned int index2;
 392
 393                                   for (index2 = 0; index2 < 21; index2++)
 394                                     if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
 395                                         && jamo_medial_short_name[index2][n2] == '\0')
 396                                       {
 397                                         unsigned int index3;
 398
 399                                         for (index3 = 0; index3 < 28; index3++)
 400                                           if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
 401                                               && jamo_final_short_name[index3][n3] == '\0')
 402                                             {
 403                                               return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 404                                             }
 405                                         break;
 406                                       }
 407                                   break;
 408                                 }
 409                           }
 410                       }
 411                   }
 412                 /* Special case for CJK compatibility ideographs. Keeps the
 413                    tables small.  */
 414                 if (wordptr == &words[2]
 415                     && words[0] == UNICODE_CHARNAME_WORD_CJK
 416                     && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 417                     && p1 + 14 <= ptr
 418                     && p1 + 15 >= ptr
 419                     && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 420                   {
 421                     const char *p2 = p1 + 10;
 422
 423                     if (*p2 != '0')
 424                       {
 425                         unsigned int c = 0;
 426
 427                         for (;;)
 428                           {
 429                             if (*p2 >= '0' && *p2 <= '9')
 430                               c += (*p2 - '0');
 431                             else if (*p2 >= 'A' && *p2 <= 'F')
 432                               c += (*p2 - 'A' + 10);
 433                             else
 434                               break;
 435                             p2++;
 436                             if (p2 == ptr)
 437                               {
 438                                 if ((c >= 0xF900 && c <= 0xFA2D)
 439                                     || (c >= 0xFA30 && c <= 0xFA6A)
 440                                     || (c >= 0xFA70 && c <= 0xFAD9)
 441                                     || (c >= 0x2F800 && c <= 0x2FA1D))
 442                                   return c;
 443                                 else
 444                                   break;
 445                               }
 446                             c = c << 4;
 447                           }
 448                       }
 449                   }
 450               }
 451           }
 452           if (false)
 453           filled_words:
 454             {
 455               /* Multiply by 2, to simplify later comparisons.  */
 456               unsigned int words_length = wordptr - words;
 457               {
 458                 int i = words_length - 1;
 459                 words[i] = 2 * words[i];
 460                 for (; --i >= 0; )
 461                   words[i] = 2 * words[i] + 1;
 462               }
 463               /* Binary search in unicode_name_to_code.  */
 464               {
 465                 unsigned int i1 = 0;
 466                 unsigned int i2 = SIZEOF (unicode_name_to_code);
 467                 for (;;)
 468                   {
 469                     unsigned int i = (i1 + i2) >> 1;
 470                     const uint16_t *w = words;
 471                     const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
 472                     unsigned int n = words_length;
 473                     for (;;)
 474                       {
 475                         if (*p < *w)
 476                           {
 477                             if (i1 == i)
 478                               goto name_not_found;
 479                             /* Note here: i1 < i < i2.  */
 480                             i1 = i;
 481                             break;
 482                           }
 483                         else if (*p > *w)
 484                           {
 485                             if (i2 == i)
 486                               goto name_not_found;
 487                             /* Note here: i1 <= i < i2.  */
 488                             i2 = i;
 489                             break;
 490                           }
 491                         p++; w++; n--;
 492                         if (n == 0)
 493                           {
 494                             unsigned int c = unicode_name_to_code[i].code;
 495
 496                             /* Undo the transformation to 16-bit space.  */
 497                             static const unsigned int offset[12] =
 498                               {
 499                                 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
 500                                 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
 501                                 0x25000, 0xD5000
 502                               };
 503                             return c + offset[c >> 12];
 504                           }
 505                       }
 506                   }
 507               }
 508             name_not_found: ;
 509             }
 510         }
 511     }
 512   return UNINAME_INVALID;
 513 }