1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Number of elements of the array A.
   A must be a true array, not a pointer: for a pointer the quotient does
   not describe an element count.  The argument is parenthesized before
   subscripting so that any array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
31 /* Table of Unicode character names, derived from UnicodeData.txt.
32 This table is generated in a way to minimize the memory footprint:
33 1. its compiled size is small (less than 350 KB),
34 2. it resides entirely in the text or read-only data segment of the
35 executable or shared library: the table contains only immediate
36 integers, no pointers, and the functions don't do heap allocation.
The generated data has the following shape (initializers elided):
40 static const char unicode_name_words[36303] = ...;
41 #define UNICODE_CHARNAME_NUM_WORDS 6260
42 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
43 #define UNICODE_CHARNAME_WORD_HANGUL 3902
44 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
45 #define UNICODE_CHARNAME_WORD_CJK 417
46 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
47 static const uint16_t unicode_names[68940] = ...;
48 static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
49 static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
50 #define UNICODE_CHARNAME_MAX_LENGTH 83
51 #define UNICODE_CHARNAME_MAX_WORDS 13
*/
54 /* Returns the word with a given index.
   INDEX must be below UNICODE_CHARNAME_NUM_WORDS; this is asserted.
   The result points into unicode_name_words.  LENGTHP is the caller's
   out-parameter for the word length; the store through it is not among
   the lines visible here -- TODO confirm against the full file.  */
56 unicode_name_word (unsigned int index, unsigned int *lengthp)
62 assert (index < UNICODE_CHARNAME_NUM_WORDS);
64 /* Binary search for i with
65 unicode_name_by_length[i].ind_offset <= index
   and
67 index < unicode_name_by_length[i+1].ind_offset
   */
/* Stop one short of the table size so that entry i+1, which is read
   below, always exists.  */
71 i2 = SIZEOF (unicode_name_by_length) - 1;
74 unsigned int i = (i1 + i2) >> 1;
75 if (unicode_name_by_length[i].ind_offset <= index)
/* Sanity check: the search must have produced the bracket stated above.  */
81 assert (unicode_name_by_length[i].ind_offset <= index
82 && index < unicode_name_by_length[i+1].ind_offset);
/* Entry i describes the words stored with a stride of i bytes: go to the
   start of that group (extra_offset), then skip the words of the group
   that precede INDEX.  */
84 return &unicode_name_words[unicode_name_by_length[i].extra_offset
85 + (index-unicode_name_by_length[i].ind_offset)*i];
88 /* Looks up the index of a word.
   WORD is given by pointer and LENGTH.  The caller visible in this file
   passes a slice of a larger buffer, so WORD is not NUL-terminated at
   LENGTH and only LENGTH bytes of it may be examined.  */
90 unicode_name_word_lookup (const char *word, unsigned int length)
/* Only lengths that have both an entry [length] and a successor entry
   [length+1] in unicode_name_by_length can be searched.  */
92 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
94 /* Binary search among the words of given length. */
95 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
96 unsigned int i0 = unicode_name_by_length[length].ind_offset;
/* i2 is the first word index that belongs to the next length.  */
98 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
101 unsigned int i = (i1 + i2) >> 1;
/* p is candidate number i: words of this length are stored back to back,
   LENGTH bytes each, starting at extra_offset.  */
102 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
103 const char *w = word;
104 unsigned int n = length;
111 /* Note here: i1 < i < i2. */
117 /* Note here: i1 <= i < i2. */
130 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
131 sections 3.11 and 4.4. */
/* Short names of the 19 initial jamo, indexed by index1.  Each name has
   at most 2 letters plus the terminating NUL.  Entry 11 is the empty
   string: that initial adds no letters to the syllable name.  */
132 static const char jamo_initial_short_name[19][3] =
134 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
135 "C", "K", "T", "P", "H"
/* Short names of the 21 medial jamo, indexed by index2.  Each name has
   between 1 and 3 letters plus the terminating NUL.  */
137 static const char jamo_medial_short_name[21][4] =
139 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
140 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
/* Short names of the 28 final jamo, indexed by (c - 0xAC00) % 28.
   Each name has at most 2 letters plus the terminating NUL.
   Entry 0 is empty: the syllable has no final consonant.
   Entry 5 is U+11AC HANGUL JONGSEONG NIEUN-CIEUC, whose short name in
   Unicode's Jamo.txt is "NJ" (for example U+C549 is HANGUL SYLLABLE ANJ).
   It used to be spelled "NI" here, which made the generated names and the
   accepted names disagree with the standard for every syllable that has
   this final.  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
148 /* Looks up the name of a Unicode character, in uppercase ASCII.
149 Returns the filled buf, or NULL if the character does not have a name.
   The number of bytes BUF must provide is noted on each branch below.  */
151 unicode_character_name (ucs4_t c, char *buf)
/* 0xAC00..0xD7A3 spans 19 * 21 * 28 = 11172 code points, one for each
   combination of entries of the three jamo tables.  */
153 if (c >= 0xAC00 && c <= 0xD7A3)
155 /* Special case for Hangul syllables. Keeps the tables small. */
163 /* buf needs to have at least 16 + 7 bytes here. */
164 memcpy (buf, "HANGUL SYLLABLE ", 16);
/* Peel the jamo indices off from the least significant end: the final
   (28 values) first, then the medial (21 values).  */
168 index3 = tmp % 28; tmp = tmp / 28;
169 index2 = tmp % 21; tmp = tmp / 21;
172 q = jamo_initial_short_name[index1];
175 q = jamo_medial_short_name[index2];
178 q = jamo_final_short_name[index3];
184 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
185 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
187 /* Special case for CJK compatibility ideographs. Keeps the tables
   small. */
192 /* buf needs to have at least 28 + 5 bytes here. */
193 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
/* Append the code point in uppercase hexadecimal, most significant
   nibble first: 4 digits below 0x10000, 5 digits otherwise.  */
196 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
198 unsigned int x = (c >> i) & 0xf;
199 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
206 const uint16_t *words;
208 /* Transform the code so that it fits in 16 bits. */
211 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
242 /* Binary search in unicode_code_to_name. */
244 unsigned int i2 = SIZEOF (unicode_code_to_name);
247 unsigned int i = (i1 + i2) >> 1;
248 if (unicode_code_to_name[i].code == c)
250 words = &unicode_names[unicode_code_to_name[i].name];
253 else if (unicode_code_to_name[i].code < c)
260 /* Note here: i1 < i < i2. */
263 else if (unicode_code_to_name[i].code > c)
270 /* Note here: i1 <= i < i2. */
277 /* Found it in unicode_code_to_name. Now concatenate the words. */
278 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
282 unsigned int wordlen;
/* The upper bits of each name element are the index of a word.  */
283 const char *word = unicode_name_word (*words>>1, &wordlen);
286 while (--wordlen > 0);
/* unicode_name_character encodes the last word of a name as an even
   value, so a clear bit 0 presumably marks the end of the name here.  */
287 if ((*words & 1) == 0)
299 /* Looks up the Unicode character with a given name, in upper- or lowercase
300 ASCII. Returns the character if found, or UNINAME_INVALID if not found.
   NAME is a NUL-terminated string; its length is taken with strlen.  */
302 unicode_name_character (const char *name)
/* NOTE(review): strlen returns size_t; the value is narrowed to
   unsigned int here.  */
304 unsigned int len = strlen (name);
/* Names longer than the longest known name cannot match, which also
   guarantees that the uppercased copy fits into buf.  */
305 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
307 /* Test for "word1 word2 ..." syntax. */
308 char buf[UNICODE_CHARNAME_MAX_LENGTH];
/* Only printable ASCII can occur in a name.  */
313 if (!(c >= ' ' && c <= '~'))
/* Copy into buf, folding lowercase letters to uppercase.  */
315 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
322 /* Convert the constituents to uint16_t words. */
323 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
324 uint16_t *wordptr = words;
326 const char *p1 = buf;
/* Advance p2 to the end of the current word: the next space or the end
   of the copied name.  */
332 while (p2 < ptr && *p2 != ' ')
334 word = unicode_name_word_lookup (p1, p2 - p1);
/* The words array is full.  */
337 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
344 /* Special case for Hangul syllables. Keeps the tables small. */
/* Exactly two words have been converted so far and they are
   "HANGUL SYLLABLE".  */
345 if (wordptr == &words[2]
346 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
347 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
349 /* Split the last word [p1..ptr) into three parts:
   [p1..p2) initial consonants, [p2..p3) vowels,
   [p3..p4) final consonants.  */
360 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
361 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
362 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
363 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
368 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
369 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
374 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
375 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
376 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
377 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
378 || *p4 == 'S' || *p4 == 'T'))
382 unsigned int n1 = p2 - p1;
383 unsigned int n2 = p3 - p2;
384 unsigned int n3 = p4 - p3;
/* These bounds match the widths of the three jamo tables, so the
   [n1], [n2] and [n3] subscripts below stay inside each table row.  */
386 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
/* A table name matches a part exactly when its first n bytes are equal
   and the name ends right there.  */
390 for (index1 = 0; index1 < 19; index1++)
391 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
392 && jamo_initial_short_name[index1][n1] == '\0')
396 for (index2 = 0; index2 < 21; index2++)
397 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
398 && jamo_medial_short_name[index2][n2] == '\0')
402 for (index3 = 0; index3 < 28; index3++)
403 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
404 && jamo_final_short_name[index3][n3] == '\0')
/* Recombine the three indices; this is the inverse of the % 28 and % 21
   decomposition done in unicode_character_name.  */
406 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
415 /* Special case for CJK compatibility ideographs. Keeps the
   tables small. */
417 if (wordptr == &words[2]
418 && words[0] == UNICODE_CHARNAME_WORD_CJK
419 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
422 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
/* p2 starts at the hexadecimal digits that follow "IDEOGRAPH-".  */
424 const char *p2 = p1 + 10;
432 if (*p2 >= '0' && *p2 <= '9')
434 else if (*p2 >= 'A' && *p2 <= 'F')
435 c += (*p2 - 'A' + 10);
/* Accept only the ranges for which unicode_character_name produces a
   name of this form.  */
441 if ((c >= 0xF900 && c <= 0xFA2D)
442 || (c >= 0xFA30 && c <= 0xFA6A)
443 || (c >= 0xFA70 && c <= 0xFAD9)
444 || (c >= 0x2F800 && c <= 0x2FA1D))
458 /* Multiply by 2, to simplify later comparisons. */
459 unsigned int words_length = wordptr - words;
/* The last word becomes an even value; the "+ 1" form below is
   presumably applied to every earlier word -- TODO confirm, the loop
   header is not among the lines visible here.  */
461 int i = words_length - 1;
462 words[i] = 2 * words[i];
464 words[i] = 2 * words[i] + 1;
466 /* Binary search in unicode_name_to_code. */
469 unsigned int i2 = SIZEOF (unicode_name_to_code);
472 unsigned int i = (i1 + i2) >> 1;
473 const uint16_t *w = words;
474 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
475 unsigned int n = words_length;
482 /* Note here: i1 < i < i2. */
490 /* Note here: i1 <= i < i2. */
497 unsigned int c = unicode_name_to_code[i].code;
499 /* Undo the transformation to 16-bit space. */
/* The table is indexed by the top 4 bits of the 16-bit code, i.e. by
   blocks of 0x1000 codes; it has entries for blocks 0 to 12 only.  */
500 static const unsigned int offset[13] =
502 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
503 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
504 0x15000, 0x24000, 0xD4000
506 return c + offset[c >> 12];
515 return UNINAME_INVALID;