lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify it
   5    under the terms of the GNU Library General Public License as published
   6    by the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Library General Public License for more details.
  13
  14    You should have received a copy of the GNU Library General Public
  15    License along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  17    USA.  */
  18
  19 #include <config.h>
  20
  21 /* Specification.  */
  22 #include "uniname.h"
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26 #include <stdint.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29
  30 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  31
  32
  33 /* Table of Unicode character names, derived from UnicodeData.txt.  */
  34 #include "uninames.h"
  35 /* It contains:
  36   static const char unicode_name_words[34594] = ...;
  37   #define UNICODE_CHARNAME_NUM_WORDS 5906
  38   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  39   #define UNICODE_CHARNAME_WORD_HANGUL 3624
  40   #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
  41   #define UNICODE_CHARNAME_WORD_CJK 401
  42   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
  43   static const uint16_t unicode_names[62620] = ...;
  44   static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
  45   static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
  46   #define UNICODE_CHARNAME_MAX_LENGTH 83
  47   #define UNICODE_CHARNAME_MAX_WORDS 13
  48 */
  49
  50 /* Returns the word with a given index.  */
  51 static const char *
  52 unicode_name_word (unsigned int index, unsigned int *lengthp)
  53 {
  54   unsigned int i1;
  55   unsigned int i2;
  56   unsigned int i;
  57
  58   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  59
  60   /* Binary search for i with
  61        unicode_name_by_length[i].ind_offset <= index
  62      and
  63        index < unicode_name_by_length[i+1].ind_offset
  64    */
  65
  66   i1 = 0;
  67   i2 = SIZEOF (unicode_name_by_length) - 1;
  68   while (i2 - i1 > 1)
  69     {
  70       unsigned int i = (i1 + i2) >> 1;
  71       if (unicode_name_by_length[i].ind_offset <= index)
  72         i1 = i;
  73       else
  74         i2 = i;
  75     }
  76   i = i1;
  77   assert (unicode_name_by_length[i].ind_offset <= index
  78           && index < unicode_name_by_length[i+1].ind_offset);
  79   *lengthp = i;
  80   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  81                              + (index-unicode_name_by_length[i].ind_offset)*i];
  82 }
  83
  84 /* Looks up the index of a word.  */
  85 static int
  86 unicode_name_word_lookup (const char *word, unsigned int length)
  87 {
  88   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  89     {
  90       /* Binary search among the words of given length.  */
  91       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  92       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  93       unsigned int i1 = i0;
  94       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
  95       while (i2 - i1 > 0)
  96         {
  97           unsigned int i = (i1 + i2) >> 1;
  98           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
  99           const char *w = word;
 100           unsigned int n = length;
 101           for (;;)
 102             {
 103               if (*p < *w)
 104                 {
 105                   if (i1 == i)
 106                     return -1;
 107                   /* Note here: i1 < i < i2.  */
 108                   i1 = i;
 109                   break;
 110                 }
 111               if (*p > *w)
 112                 {
 113                   /* Note here: i1 <= i < i2.  */
 114                   i2 = i;
 115                   break;
 116                 }
 117               p++; w++; n--;
 118               if (n == 0)
 119                 return i;
 120             }
 121         }
 122     }
 123   return -1;
 124 }
 125
 126 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 127    sections 3.11 and 4.4.  */
 128 static const char jamo_initial_short_name[19][3] =
 129 {
 130   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 131   "C", "K", "T", "P", "H"
 132 };
 133 static const char jamo_medial_short_name[21][4] =
 134 {
 135   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 136   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 137 };
 138 static const char jamo_final_short_name[28][3] =
 139 {
 140   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 141   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 142 };
 143
 144 /* Looks up the name of a Unicode character, in uppercase ASCII.
 145    Returns the filled buf, or NULL if the character does not have a name.  */
 146 char *
 147 unicode_character_name (ucs4_t c, char *buf)
 148 {
 149   if (c >= 0xAC00 && c <= 0xD7A3)
 150     {
 151       /* Special case for Hangul syllables. Keeps the tables small.  */
 152       char *ptr;
 153       unsigned int tmp;
 154       unsigned int index1;
 155       unsigned int index2;
 156       unsigned int index3;
 157       const char *q;
 158
 159       /* buf needs to have at least 16 + 7 bytes here.  */
 160       memcpy (buf, "HANGUL SYLLABLE ", 16);
 161       ptr = buf + 16;
 162
 163       tmp = c - 0xAC00;
 164       index3 = tmp % 28; tmp = tmp / 28;
 165       index2 = tmp % 21; tmp = tmp / 21;
 166       index1 = tmp;
 167
 168       q = jamo_initial_short_name[index1];
 169       while (*q != '\0')
 170         *ptr++ = *q++;
 171       q = jamo_medial_short_name[index2];
 172       while (*q != '\0')
 173         *ptr++ = *q++;
 174       q = jamo_final_short_name[index3];
 175       while (*q != '\0')
 176         *ptr++ = *q++;
 177       *ptr = '\0';
 178       return buf;
 179     }
 180   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 181            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 182     {
 183       /* Special case for CJK compatibility ideographs. Keeps the tables
 184          small.  */
 185       char *ptr;
 186       int i;
 187
 188       /* buf needs to have at least 28 + 5 bytes here.  */
 189       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 190       ptr = buf + 28;
 191
 192       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 193         {
 194           unsigned int x = (c >> i) & 0xf;
 195           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 196         }
 197       *ptr = '\0';
 198       return buf;
 199     }
 200   else
 201     {
 202       const uint16_t *words;
 203
 204       /* Transform the code so that it fits in 16 bits.  */
 205       switch (c >> 12)
 206         {
 207         case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
 208           break;
 209         case 0x0A:
 210           c -= 0x05000;
 211           break;
 212         case 0x0F:
 213           c -= 0x09000;
 214           break;
 215         case 0x10:
 216           c -= 0x09000;
 217           break;
 218         case 0x1D:
 219           c -= 0x15000;
 220           break;
 221         case 0x2F:
 222           c -= 0x26000;
 223           break;
 224         case 0xE0:
 225           c -= 0xD6000;
 226           break;
 227         default:
 228           return NULL;
 229         }
 230
 231       {
 232         /* Binary search in unicode_code_to_name.  */
 233         unsigned int i1 = 0;
 234         unsigned int i2 = SIZEOF (unicode_code_to_name);
 235         for (;;)
 236           {
 237             unsigned int i = (i1 + i2) >> 1;
 238             if (unicode_code_to_name[i].code == c)
 239               {
 240                 words = &unicode_names[unicode_code_to_name[i].name];
 241                 break;
 242               }
 243             else if (unicode_code_to_name[i].code < c)
 244               {
 245                 if (i1 == i)
 246                   {
 247                     words = NULL;
 248                     break;
 249                   }
 250                 /* Note here: i1 < i < i2.  */
 251                 i1 = i;
 252               }
 253             else if (unicode_code_to_name[i].code > c)
 254               {
 255                 if (i2 == i)
 256                   {
 257                     words = NULL;
 258                     break;
 259                   }
 260                 /* Note here: i1 <= i < i2.  */
 261                 i2 = i;
 262               }
 263           }
 264       }
 265       if (words != NULL)
 266         {
 267           /* Found it in unicode_code_to_name. Now concatenate the words.  */
 268           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 269           char *ptr = buf;
 270           for (;;)
 271             {
 272               unsigned int wordlen;
 273               const char *word = unicode_name_word (*words>>1, &wordlen);
 274               do
 275                 *ptr++ = *word++;
 276               while (--wordlen > 0);
 277               if ((*words & 1) == 0)
 278                 break;
 279               *ptr++ = ' ';
 280               words++;
 281             }
 282           *ptr = '\0';
 283           return buf;
 284         }
 285       return NULL;
 286     }
 287 }
 288
 289 /* Looks up the Unicode character with a given name, in upper- or lowercase
 290    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 291 ucs4_t
 292 unicode_name_character (const char *name)
 293 {
 294   unsigned int len = strlen (name);
 295   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 296     {
 297       /* Test for "word1 word2 ..." syntax.  */
 298       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 299       char *ptr = buf;
 300       for (;;)
 301         {
 302           char c = *name++;
 303           if (!(c >= ' ' && c <= '~'))
 304             break;
 305           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 306           if (--len == 0)
 307             goto filled_buf;
 308         }
 309       if (false)
 310       filled_buf:
 311         {
 312           /* Convert the constituents to uint16_t words.  */
 313           uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 314           uint16_t *wordptr = words;
 315           {
 316             const char *p1 = buf;
 317             for (;;)
 318               {
 319                 {
 320                   int word;
 321                   const char *p2 = p1;
 322                   while (p2 < ptr && *p2 != ' ')
 323                     p2++;
 324                   word = unicode_name_word_lookup (p1, p2 - p1);
 325                   if (word < 0)
 326                     break;
 327                   if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 328                     break;
 329                   *wordptr++ = word;
 330                   if (p2 == ptr)
 331                     goto filled_words;
 332                   p1 = p2 + 1;
 333                 }
 334                 /* Special case for Hangul syllables. Keeps the tables small. */
 335                 if (wordptr == &words[2]
 336                     && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 337                     && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 338                   {
 339                     /* Split the last word [p1..ptr) into three parts:
 340                          1) [BCDGHJKMNPRST]
 341                          2) [AEIOUWY]
 342                          3) [BCDGHIJKLMNPST]
 343                      */
 344                     const char *p2;
 345                     const char *p3;
 346                     const char *p4;
 347
 348                     p2 = p1;
 349                     while (p2 < ptr
 350                            && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 351                                || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 352                                || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 353                                || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 354                                || *p2 == 'T'))
 355                       p2++;
 356                     p3 = p2;
 357                     while (p3 < ptr
 358                            && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 359                                || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 360                                || *p3 == 'Y'))
 361                       p3++;
 362                     p4 = p3;
 363                     while (p4 < ptr
 364                            && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 365                                || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 366                                || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 367                                || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 368                                || *p4 == 'S' || *p4 == 'T'))
 369                       p4++;
 370                     if (p4 == ptr)
 371                       {
 372                         unsigned int n1 = p2 - p1;
 373                         unsigned int n2 = p3 - p2;
 374                         unsigned int n3 = p4 - p3;
 375
 376                         if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 377                           {
 378                             unsigned int index1;
 379
 380                             for (index1 = 0; index1 < 19; index1++)
 381                               if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
 382                                   && jamo_initial_short_name[index1][n1] == '\0')
 383                                 {
 384                                   unsigned int index2;
 385
 386                                   for (index2 = 0; index2 < 21; index2++)
 387                                     if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
 388                                         && jamo_medial_short_name[index2][n2] == '\0')
 389                                       {
 390                                         unsigned int index3;
 391
 392                                         for (index3 = 0; index3 < 28; index3++)
 393                                           if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
 394                                               && jamo_final_short_name[index3][n3] == '\0')
 395                                             {
 396                                               return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 397                                             }
 398                                         break;
 399                                       }
 400                                   break;
 401                                 }
 402                           }
 403                       }
 404                   }
 405                 /* Special case for CJK compatibility ideographs. Keeps the
 406                    tables small.  */
 407                 if (wordptr == &words[2]
 408                     && words[0] == UNICODE_CHARNAME_WORD_CJK
 409                     && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 410                     && p1 + 14 <= ptr
 411                     && p1 + 15 >= ptr
 412                     && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 413                   {
 414                     const char *p2 = p1 + 10;
 415
 416                     if (*p2 != '0')
 417                       {
 418                         unsigned int c = 0;
 419
 420                         for (;;)
 421                           {
 422                             if (*p2 >= '0' && *p2 <= '9')
 423                               c += (*p2 - '0');
 424                             else if (*p2 >= 'A' && *p2 <= 'F')
 425                               c += (*p2 - 'A' + 10);
 426                             else
 427                               break;
 428                             p2++;
 429                             if (p2 == ptr)
 430                               {
 431                                 if ((c >= 0xF900 && c <= 0xFA2D)
 432                                     || (c >= 0xFA30 && c <= 0xFA6A)
 433                                     || (c >= 0xFA70 && c <= 0xFAD9)
 434                                     || (c >= 0x2F800 && c <= 0x2FA1D))
 435                                   return c;
 436                                 else
 437                                   break;
 438                               }
 439                             c = c << 4;
 440                           }
 441                       }
 442                   }
 443               }
 444           }
 445           if (false)
 446           filled_words:
 447             {
 448               /* Multiply by 2, to simplify later comparisons.  */
 449               unsigned int words_length = wordptr - words;
 450               {
 451                 int i = words_length - 1;
 452                 words[i] = 2 * words[i];
 453                 for (; --i >= 0; )
 454                   words[i] = 2 * words[i] + 1;
 455               }
 456               /* Binary search in unicode_name_to_code.  */
 457               {
 458                 unsigned int i1 = 0;
 459                 unsigned int i2 = SIZEOF (unicode_name_to_code);
 460                 for (;;)
 461                   {
 462                     unsigned int i = (i1 + i2) >> 1;
 463                     const uint16_t *w = words;
 464                     const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
 465                     unsigned int n = words_length;
 466                     for (;;)
 467                       {
 468                         if (*p < *w)
 469                           {
 470                             if (i1 == i)
 471                               goto name_not_found;
 472                             /* Note here: i1 < i < i2.  */
 473                             i1 = i;
 474                             break;
 475                           }
 476                         else if (*p > *w)
 477                           {
 478                             if (i2 == i)
 479                               goto name_not_found;
 480                             /* Note here: i1 <= i < i2.  */
 481                             i2 = i;
 482                             break;
 483                           }
 484                         p++; w++; n--;
 485                         if (n == 0)
 486                           {
 487                             unsigned int c = unicode_name_to_code[i].code;
 488
 489                             /* Undo the transformation to 16-bit space.  */
 490                             static const unsigned int offset[11] =
 491                               {
 492                                 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
 493                                 0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
 494                                 0xD6000
 495                               };
 496                             return c + offset[c >> 12];
 497                           }
 498                       }
 499                   }
 500               }
 501             name_not_found: ;
 502             }
 503         }
 504     }
 505   return UNINAME_INVALID;
 506 }