lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007, 2009-2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU Lesser General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 /* Specification.  */
  20 #include "uniname.h"
  21
  22 #include <assert.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  29
  30
  31 /* Table of Unicode character names, derived from UnicodeData.txt.
  32    This table is generated in a way to minimize the memory footprint:
  33      1. its compiled size is small (less than 350 KB),
  34      2. it resides entirely in the text or read-only data segment of the
  35         executable or shared library: the table contains only immediate
  36         integers, no pointers, and the functions don't do heap allocation.
  37  */
  38 #include "uninames.h"
  39 /* It contains:
  40   static const char unicode_name_words[36303] = ...;
  41   #define UNICODE_CHARNAME_NUM_WORDS 6260
  42   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  43   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  44   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  45   #define UNICODE_CHARNAME_WORD_CJK 417
  46   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  47   static const uint16_t unicode_names[68940] = ...;
  48   static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
  49   static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
  50   #define UNICODE_CHARNAME_MAX_LENGTH 83
  51   #define UNICODE_CHARNAME_MAX_WORDS 13
  52 */
  53
  54 /* Returns the word with a given index.  */
  55 static const char *
  56 unicode_name_word (unsigned int index, unsigned int *lengthp)
  57 {
  58   unsigned int i1;
  59   unsigned int i2;
  60   unsigned int i;
  61
  62   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  63
  64   /* Binary search for i with
  65        unicode_name_by_length[i].ind_offset <= index
  66      and
  67        index < unicode_name_by_length[i+1].ind_offset
  68    */
  69
  70   i1 = 0;
  71   i2 = SIZEOF (unicode_name_by_length) - 1;
  72   while (i2 - i1 > 1)
  73     {
  74       unsigned int i = (i1 + i2) >> 1;
  75       if (unicode_name_by_length[i].ind_offset <= index)
  76         i1 = i;
  77       else
  78         i2 = i;
  79     }
  80   i = i1;
  81   assert (unicode_name_by_length[i].ind_offset <= index
  82           && index < unicode_name_by_length[i+1].ind_offset);
  83   *lengthp = i;
  84   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  85                              + (index-unicode_name_by_length[i].ind_offset)*i];
  86 }
  87
  88 /* Looks up the index of a word.  */
  89 static int
  90 unicode_name_word_lookup (const char *word, unsigned int length)
  91 {
  92   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  93     {
  94       /* Binary search among the words of given length.  */
  95       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  96       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  97       unsigned int i1 = i0;
  98       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
  99       while (i2 - i1 > 0)
 100         {
 101           unsigned int i = (i1 + i2) >> 1;
 102           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 103           const char *w = word;
 104           unsigned int n = length;
 105           for (;;)
 106             {
 107               if (*p < *w)
 108                 {
 109                   if (i1 == i)
 110                     return -1;
 111                   /* Note here: i1 < i < i2.  */
 112                   i1 = i;
 113                   break;
 114                 }
 115               if (*p > *w)
 116                 {
 117                   /* Note here: i1 <= i < i2.  */
 118                   i2 = i;
 119                   break;
 120                 }
 121               p++; w++; n--;
 122               if (n == 0)
 123                 return i;
 124             }
 125         }
 126     }
 127   return -1;
 128 }
 129
 130 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 131    sections 3.11 and 4.4.  */
 132 static const char jamo_initial_short_name[19][3] =
 133 {
 134   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 135   "C", "K", "T", "P", "H"
 136 };
 137 static const char jamo_medial_short_name[21][4] =
 138 {
 139   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 140   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 141 };
 142 static const char jamo_final_short_name[28][3] =
 143 {
 144   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 145   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 146 };
 147
 148 /* Looks up the name of a Unicode character, in uppercase ASCII.
 149    Returns the filled buf, or NULL if the character does not have a name.  */
 150 char *
 151 unicode_character_name (ucs4_t c, char *buf)
 152 {
 153   if (c >= 0xAC00 && c <= 0xD7A3)
 154     {
 155       /* Special case for Hangul syllables. Keeps the tables small.  */
 156       char *ptr;
 157       unsigned int tmp;
 158       unsigned int index1;
 159       unsigned int index2;
 160       unsigned int index3;
 161       const char *q;
 162
 163       /* buf needs to have at least 16 + 7 bytes here.  */
 164       memcpy (buf, "HANGUL SYLLABLE ", 16);
 165       ptr = buf + 16;
 166
 167       tmp = c - 0xAC00;
 168       index3 = tmp % 28; tmp = tmp / 28;
 169       index2 = tmp % 21; tmp = tmp / 21;
 170       index1 = tmp;
 171
 172       q = jamo_initial_short_name[index1];
 173       while (*q != '\0')
 174         *ptr++ = *q++;
 175       q = jamo_medial_short_name[index2];
 176       while (*q != '\0')
 177         *ptr++ = *q++;
 178       q = jamo_final_short_name[index3];
 179       while (*q != '\0')
 180         *ptr++ = *q++;
 181       *ptr = '\0';
 182       return buf;
 183     }
 184   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 185            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 186     {
 187       /* Special case for CJK compatibility ideographs. Keeps the tables
 188          small.  */
 189       char *ptr;
 190       int i;
 191
 192       /* buf needs to have at least 28 + 5 bytes here.  */
 193       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 194       ptr = buf + 28;
 195
 196       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 197         {
 198           unsigned int x = (c >> i) & 0xf;
 199           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 200         }
 201       *ptr = '\0';
 202       return buf;
 203     }
 204   else
 205     {
 206       const uint16_t *words;
 207
 208       /* Transform the code so that it fits in 16 bits.  */
 209       switch (c >> 12)
 210         {
 211         case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
 212           break;
 213         case 0x0A:
 214           c -= 0x05000;
 215           break;
 216         case 0x0F:
 217           c -= 0x09000;
 218           break;
 219         case 0x10:
 220           c -= 0x09000;
 221           break;
 222         case 0x12:
 223           c -= 0x0A000;
 224           break;
 225         case 0x1D:
 226           c -= 0x14000;
 227           break;
 228         case 0x1F:
 229           c -= 0x15000;
 230           break;
 231         case 0x2F:
 232           c -= 0x24000;
 233           break;
 234         case 0xE0:
 235           c -= 0xD4000;
 236           break;
 237         default:
 238           return NULL;
 239         }
 240
 241       {
 242         /* Binary search in unicode_code_to_name.  */
 243         unsigned int i1 = 0;
 244         unsigned int i2 = SIZEOF (unicode_code_to_name);
 245         for (;;)
 246           {
 247             unsigned int i = (i1 + i2) >> 1;
 248             if (unicode_code_to_name[i].code == c)
 249               {
 250                 words = &unicode_names[unicode_code_to_name[i].name];
 251                 break;
 252               }
 253             else if (unicode_code_to_name[i].code < c)
 254               {
 255                 if (i1 == i)
 256                   {
 257                     words = NULL;
 258                     break;
 259                   }
 260                 /* Note here: i1 < i < i2.  */
 261                 i1 = i;
 262               }
 263             else if (unicode_code_to_name[i].code > c)
 264               {
 265                 if (i2 == i)
 266                   {
 267                     words = NULL;
 268                     break;
 269                   }
 270                 /* Note here: i1 <= i < i2.  */
 271                 i2 = i;
 272               }
 273           }
 274       }
 275       if (words != NULL)
 276         {
 277           /* Found it in unicode_code_to_name. Now concatenate the words.  */
 278           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 279           char *ptr = buf;
 280           for (;;)
 281             {
 282               unsigned int wordlen;
 283               const char *word = unicode_name_word (*words>>1, &wordlen);
 284               do
 285                 *ptr++ = *word++;
 286               while (--wordlen > 0);
 287               if ((*words & 1) == 0)
 288                 break;
 289               *ptr++ = ' ';
 290               words++;
 291             }
 292           *ptr = '\0';
 293           return buf;
 294         }
 295       return NULL;
 296     }
 297 }
 298
 299 /* Looks up the Unicode character with a given name, in upper- or lowercase
 300    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 301 ucs4_t
 302 unicode_name_character (const char *name)
 303 {
 304   unsigned int len = strlen (name);
 305   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 306     {
 307       /* Test for "word1 word2 ..." syntax.  */
 308       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 309       char *ptr = buf;
 310       for (;;)
 311         {
 312           char c = *name++;
 313           if (!(c >= ' ' && c <= '~'))
 314             break;
 315           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 316           if (--len == 0)
 317             goto filled_buf;
 318         }
 319       if (false)
 320       filled_buf:
 321         {
 322           /* Convert the constituents to uint16_t words.  */
 323           uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 324           uint16_t *wordptr = words;
 325           {
 326             const char *p1 = buf;
 327             for (;;)
 328               {
 329                 {
 330                   int word;
 331                   const char *p2 = p1;
 332                   while (p2 < ptr && *p2 != ' ')
 333                     p2++;
 334                   word = unicode_name_word_lookup (p1, p2 - p1);
 335                   if (word < 0)
 336                     break;
 337                   if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 338                     break;
 339                   *wordptr++ = word;
 340                   if (p2 == ptr)
 341                     goto filled_words;
 342                   p1 = p2 + 1;
 343                 }
 344                 /* Special case for Hangul syllables. Keeps the tables small. */
 345                 if (wordptr == &words[2]
 346                     && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 347                     && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 348                   {
 349                     /* Split the last word [p1..ptr) into three parts:
 350                          1) [BCDGHJKMNPRST]
 351                          2) [AEIOUWY]
 352                          3) [BCDGHIJKLMNPST]
 353                      */
 354                     const char *p2;
 355                     const char *p3;
 356                     const char *p4;
 357
 358                     p2 = p1;
 359                     while (p2 < ptr
 360                            && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 361                                || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 362                                || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 363                                || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 364                                || *p2 == 'T'))
 365                       p2++;
 366                     p3 = p2;
 367                     while (p3 < ptr
 368                            && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 369                                || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 370                                || *p3 == 'Y'))
 371                       p3++;
 372                     p4 = p3;
 373                     while (p4 < ptr
 374                            && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 375                                || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 376                                || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 377                                || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 378                                || *p4 == 'S' || *p4 == 'T'))
 379                       p4++;
 380                     if (p4 == ptr)
 381                       {
 382                         unsigned int n1 = p2 - p1;
 383                         unsigned int n2 = p3 - p2;
 384                         unsigned int n3 = p4 - p3;
 385
 386                         if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 387                           {
 388                             unsigned int index1;
 389
 390                             for (index1 = 0; index1 < 19; index1++)
 391                               if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
 392                                   && jamo_initial_short_name[index1][n1] == '\0')
 393                                 {
 394                                   unsigned int index2;
 395
 396                                   for (index2 = 0; index2 < 21; index2++)
 397                                     if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
 398                                         && jamo_medial_short_name[index2][n2] == '\0')
 399                                       {
 400                                         unsigned int index3;
 401
 402                                         for (index3 = 0; index3 < 28; index3++)
 403                                           if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
 404                                               && jamo_final_short_name[index3][n3] == '\0')
 405                                             {
 406                                               return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 407                                             }
 408                                         break;
 409                                       }
 410                                   break;
 411                                 }
 412                           }
 413                       }
 414                   }
 415                 /* Special case for CJK compatibility ideographs. Keeps the
 416                    tables small.  */
 417                 if (wordptr == &words[2]
 418                     && words[0] == UNICODE_CHARNAME_WORD_CJK
 419                     && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 420                     && p1 + 14 <= ptr
 421                     && p1 + 15 >= ptr
 422                     && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 423                   {
 424                     const char *p2 = p1 + 10;
 425
 426                     if (*p2 != '0')
 427                       {
 428                         unsigned int c = 0;
 429
 430                         for (;;)
 431                           {
 432                             if (*p2 >= '0' && *p2 <= '9')
 433                               c += (*p2 - '0');
 434                             else if (*p2 >= 'A' && *p2 <= 'F')
 435                               c += (*p2 - 'A' + 10);
 436                             else
 437                               break;
 438                             p2++;
 439                             if (p2 == ptr)
 440                               {
 441                                 if ((c >= 0xF900 && c <= 0xFA2D)
 442                                     || (c >= 0xFA30 && c <= 0xFA6A)
 443                                     || (c >= 0xFA70 && c <= 0xFAD9)
 444                                     || (c >= 0x2F800 && c <= 0x2FA1D))
 445                                   return c;
 446                                 else
 447                                   break;
 448                               }
 449                             c = c << 4;
 450                           }
 451                       }
 452                   }
 453               }
 454           }
 455           if (false)
 456           filled_words:
 457             {
 458               /* Multiply by 2, to simplify later comparisons.  */
 459               unsigned int words_length = wordptr - words;
 460               {
 461                 int i = words_length - 1;
 462                 words[i] = 2 * words[i];
 463                 for (; --i >= 0; )
 464                   words[i] = 2 * words[i] + 1;
 465               }
 466               /* Binary search in unicode_name_to_code.  */
 467               {
 468                 unsigned int i1 = 0;
 469                 unsigned int i2 = SIZEOF (unicode_name_to_code);
 470                 for (;;)
 471                   {
 472                     unsigned int i = (i1 + i2) >> 1;
 473                     const uint16_t *w = words;
 474                     const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
 475                     unsigned int n = words_length;
 476                     for (;;)
 477                       {
 478                         if (*p < *w)
 479                           {
 480                             if (i1 == i)
 481                               goto name_not_found;
 482                             /* Note here: i1 < i < i2.  */
 483                             i1 = i;
 484                             break;
 485                           }
 486                         else if (*p > *w)
 487                           {
 488                             if (i2 == i)
 489                               goto name_not_found;
 490                             /* Note here: i1 <= i < i2.  */
 491                             i2 = i;
 492                             break;
 493                           }
 494                         p++; w++; n--;
 495                         if (n == 0)
 496                           {
 497                             unsigned int c = unicode_name_to_code[i].code;
 498
 499                             /* Undo the transformation to 16-bit space.  */
 500                             static const unsigned int offset[13] =
 501                               {
 502                                 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
 503                                 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
 504                                 0x15000, 0x24000, 0xD4000
 505                               };
 506                             return c + offset[c >> 12];
 507                           }
 508                       }
 509                   }
 510               }
 511             name_not_found: ;
 512             }
 513         }
 514     }
 515   return UNINAME_INVALID;
 516 }