1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Number of elements of the array A.
   A must be an expression of true array type, not a pointer.  The argument
   is parenthesized before being subscripted, so that expressions with
   lower precedence than [] (casts, dereferences) are measured correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
31 /* Table of Unicode character names, derived from UnicodeData.txt.
32 This table is generated in a way to minimize the memory footprint:
33 1. its compiled size is small (less than 350 KB),
34 2. it resides entirely in the text or read-only data segment of the
35 executable or shared library: the table contains only immediate
36 integers, no pointers, and the functions don't do heap allocation.
36 The generated table consists of the following declarations:
40 static const char unicode_name_words[36303] = ...;
41 #define UNICODE_CHARNAME_NUM_WORDS 6260
42 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
43 #define UNICODE_CHARNAME_WORD_HANGUL 3902
44 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
45 #define UNICODE_CHARNAME_WORD_CJK 417
46 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
47 static const uint16_t unicode_names[68940] = ...;
48 static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
49 static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
50 #define UNICODE_CHARNAME_MAX_LENGTH 83
51 #define UNICODE_CHARNAME_MAX_WORDS 13 */
54 /* Returns the word with a given index. */
56 unicode_name_word (unsigned int index, unsigned int *lengthp)
62 assert (index < UNICODE_CHARNAME_NUM_WORDS);
64 /* Binary search for i with
65 unicode_name_by_length[i].ind_offset <= index
67 && index < unicode_name_by_length[i+1].ind_offset. */
71 i2 = SIZEOF (unicode_name_by_length) - 1;
74 unsigned int i = (i1 + i2) >> 1;
75 if (unicode_name_by_length[i].ind_offset <= index)
81 assert (unicode_name_by_length[i].ind_offset <= index
82 && index < unicode_name_by_length[i+1].ind_offset);
84 return &unicode_name_words[unicode_name_by_length[i].extra_offset
85 + (index-unicode_name_by_length[i].ind_offset)*i];
88 /* Looks up the index of a word. */
90 unicode_name_word_lookup (const char *word, unsigned int length)
92 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
94 /* Binary search among the words of given length. */
95 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
96 unsigned int i0 = unicode_name_by_length[length].ind_offset;
98 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
101 unsigned int i = (i1 + i2) >> 1;
102 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
103 const char *w = word;
104 unsigned int n = length;
111 /* Note here: i1 < i < i2. */
117 /* Note here: i1 <= i < i2. */
130 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
131 sections 3.11 and 4.4. */
132 static const char jamo_initial_short_name[19][3] =
134 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
135 "C", "K", "T", "P", "H"
137 static const char jamo_medial_short_name[21][4] =
139 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
140 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
142 static const char jamo_final_short_name[28][3] =
144 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
145 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
148 /* Looks up the name of a Unicode character, in uppercase ASCII.
149 Returns the filled buf, or NULL if the character does not have a name. */
151 unicode_character_name (ucs4_t c, char *buf)
153 if (c >= 0xAC00 && c <= 0xD7A3)
155 /* Special case for Hangul syllables. Keeps the tables small. */
163 /* buf needs to have at least 16 + 7 bytes here: 16 for the prefix and at most 2 + 3 + 2 for the initial, medial and final jamo short names. */
164 memcpy (buf, "HANGUL SYLLABLE ", 16);
168 index3 = tmp % 28; tmp = tmp / 28;
169 index2 = tmp % 21; tmp = tmp / 21;
172 q = jamo_initial_short_name[index1];
175 q = jamo_medial_short_name[index2];
178 q = jamo_final_short_name[index3];
184 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
185 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
187 /* Special case for CJK compatibility ideographs. Keeps the tables small. */
192 /* buf needs to have at least 28 + 5 bytes here: 28 for the prefix and at most 5 for the hexadecimal digits of the code point. */
193 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
196 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
198 unsigned int x = (c >> i) & 0xf;
199 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
206 const uint16_t *words;
208 /* Transform the code so that it fits in 16 bits. */
211 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
239 /* Binary search in unicode_code_to_name. */
241 unsigned int i2 = SIZEOF (unicode_code_to_name);
244 unsigned int i = (i1 + i2) >> 1;
245 if (unicode_code_to_name[i].code == c)
247 words = &unicode_names[unicode_code_to_name[i].name];
250 else if (unicode_code_to_name[i].code < c)
257 /* Note here: i1 < i < i2. */
260 else if (unicode_code_to_name[i].code > c)
267 /* Note here: i1 <= i < i2. */
274 /* Found it in unicode_code_to_name. Now concatenate the words. */
275 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
279 unsigned int wordlen;
280 const char *word = unicode_name_word (*words>>1, &wordlen);
283 while (--wordlen > 0);
284 if ((*words & 1) == 0)
296 /* Looks up the Unicode character with a given name, in upper- or lowercase
297 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
299 unicode_name_character (const char *name)
301 unsigned int len = strlen (name);
302 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
304 /* Test for "word1 word2 ..." syntax. */
305 char buf[UNICODE_CHARNAME_MAX_LENGTH];
310 if (!(c >= ' ' && c <= '~'))
312 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
319 /* Convert the constituents to uint16_t words. */
320 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
321 uint16_t *wordptr = words;
323 const char *p1 = buf;
329 while (p2 < ptr && *p2 != ' ')
331 word = unicode_name_word_lookup (p1, p2 - p1);
334 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
341 /* Special case for Hangul syllables. Keeps the tables small. */
342 if (wordptr == &words[2]
343 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
344 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
346 /* Split the last word [p1..ptr) into three parts: [p1..p2) the initial jamo short name, [p2..p3) the medial one, [p3..p4) the final one. */
357 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
358 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
359 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
360 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
365 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
366 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
371 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
372 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
373 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
374 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
375 || *p4 == 'S' || *p4 == 'T'))
379 unsigned int n1 = p2 - p1;
380 unsigned int n2 = p3 - p2;
381 unsigned int n3 = p4 - p3;
383 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
387 for (index1 = 0; index1 < 19; index1++)
388 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
389 && jamo_initial_short_name[index1][n1] == '\0')
393 for (index2 = 0; index2 < 21; index2++)
394 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
395 && jamo_medial_short_name[index2][n2] == '\0')
399 for (index3 = 0; index3 < 28; index3++)
400 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
401 && jamo_final_short_name[index3][n3] == '\0')
403 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
412 /* Special case for CJK compatibility ideographs. Keeps the tables small. */
414 if (wordptr == &words[2]
415 && words[0] == UNICODE_CHARNAME_WORD_CJK
416 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
419 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
421 const char *p2 = p1 + 10;
429 if (*p2 >= '0' && *p2 <= '9')
431 else if (*p2 >= 'A' && *p2 <= 'F')
432 c += (*p2 - 'A' + 10);
438 if ((c >= 0xF900 && c <= 0xFA2D)
439 || (c >= 0xFA30 && c <= 0xFA6A)
440 || (c >= 0xFA70 && c <= 0xFAD9)
441 || (c >= 0x2F800 && c <= 0x2FA1D))
455 /* Multiply by 2, to simplify later comparisons. */
456 unsigned int words_length = wordptr - words;
458 int i = words_length - 1;
459 words[i] = 2 * words[i];
461 words[i] = 2 * words[i] + 1;
463 /* Binary search in unicode_name_to_code. */
466 unsigned int i2 = SIZEOF (unicode_name_to_code);
469 unsigned int i = (i1 + i2) >> 1;
470 const uint16_t *w = words;
471 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
472 unsigned int n = words_length;
479 /* Note here: i1 < i < i2. */
487 /* Note here: i1 <= i < i2. */
494 unsigned int c = unicode_name_to_code[i].code;
496 /* Undo the transformation to 16-bit space. */
497 static const unsigned int offset[12] =
499 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
500 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
503 return c + offset[c >> 12];
512 return UNINAME_INVALID;