1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU Library General Public License as published
6 by the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU Library General Public
15 License along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
/* Number of elements of the array A.  A must be a real array object (not a
   pointer, and not an array-typed function parameter, which decays to a
   pointer).  The argument is parenthesized on every use so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
33 /* Table of Unicode character names, derived from UnicodeData.txt.
34 This table is generated in a way to minimize the memory footprint:
35 1. its compiled size is small (less than 350 KB),
36 2. it resides entirely in the text or read-only data segment of the
37 executable or shared library: the table contains only immediate
38 integers, no pointers, and the functions don't do heap allocation.
42 static const char unicode_name_words[36303] = ...;
43 #define UNICODE_CHARNAME_NUM_WORDS 6260
44 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
45 #define UNICODE_CHARNAME_WORD_HANGUL 3902
46 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
47 #define UNICODE_CHARNAME_WORD_CJK 417
48 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
49 static const uint16_t unicode_names[68940] = ...;
50 static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
51 static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
52 #define UNICODE_CHARNAME_MAX_LENGTH 83
53 #define UNICODE_CHARNAME_MAX_WORDS 13
56 /* Returns the word with a given index. */
58 unicode_name_word (unsigned int index, unsigned int *lengthp)
64 assert (index < UNICODE_CHARNAME_NUM_WORDS);
66 /* Binary search for i with
67 unicode_name_by_length[i].ind_offset <= index
69 index < unicode_name_by_length[i+1].ind_offset
73 i2 = SIZEOF (unicode_name_by_length) - 1;
76 unsigned int i = (i1 + i2) >> 1;
77 if (unicode_name_by_length[i].ind_offset <= index)
83 assert (unicode_name_by_length[i].ind_offset <= index
84 && index < unicode_name_by_length[i+1].ind_offset);
86 return &unicode_name_words[unicode_name_by_length[i].extra_offset
87 + (index-unicode_name_by_length[i].ind_offset)*i];
90 /* Looks up the index of a word. */
92 unicode_name_word_lookup (const char *word, unsigned int length)
94 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
96 /* Binary search among the words of given length. */
97 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
98 unsigned int i0 = unicode_name_by_length[length].ind_offset;
100 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
103 unsigned int i = (i1 + i2) >> 1;
104 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
105 const char *w = word;
106 unsigned int n = length;
113 /* Note here: i1 < i < i2. */
119 /* Note here: i1 <= i < i2. */
132 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
133 sections 3.11 and 4.4. */
134 static const char jamo_initial_short_name[19][3] =
136 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
137 "C", "K", "T", "P", "H"
139 static const char jamo_medial_short_name[21][4] =
141 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
142 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
144 static const char jamo_final_short_name[28][3] =
146 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
147 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
150 /* Looks up the name of a Unicode character, in uppercase ASCII.
151 Returns the filled buf, or NULL if the character does not have a name. */
153 unicode_character_name (ucs4_t c, char *buf)
155 if (c >= 0xAC00 && c <= 0xD7A3)
157 /* Special case for Hangul syllables. Keeps the tables small. */
165 /* buf needs to have at least 16 + 7 bytes here. */
166 memcpy (buf, "HANGUL SYLLABLE ", 16);
170 index3 = tmp % 28; tmp = tmp / 28;
171 index2 = tmp % 21; tmp = tmp / 21;
174 q = jamo_initial_short_name[index1];
177 q = jamo_medial_short_name[index2];
180 q = jamo_final_short_name[index3];
186 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
187 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
189 /* Special case for CJK compatibility ideographs. Keeps the tables
194 /* buf needs to have at least 28 + 5 bytes here. */
195 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
198 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
200 unsigned int x = (c >> i) & 0xf;
201 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
208 const uint16_t *words;
210 /* Transform the code so that it fits in 16 bits. */
213 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
241 /* Binary search in unicode_code_to_name. */
243 unsigned int i2 = SIZEOF (unicode_code_to_name);
246 unsigned int i = (i1 + i2) >> 1;
247 if (unicode_code_to_name[i].code == c)
249 words = &unicode_names[unicode_code_to_name[i].name];
252 else if (unicode_code_to_name[i].code < c)
259 /* Note here: i1 < i < i2. */
262 else if (unicode_code_to_name[i].code > c)
269 /* Note here: i1 <= i < i2. */
276 /* Found it in unicode_code_to_name. Now concatenate the words. */
277 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
281 unsigned int wordlen;
282 const char *word = unicode_name_word (*words>>1, &wordlen);
285 while (--wordlen > 0);
286 if ((*words & 1) == 0)
298 /* Looks up the Unicode character with a given name, in upper- or lowercase
299 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
301 unicode_name_character (const char *name)
303 unsigned int len = strlen (name);
304 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
306 /* Test for "word1 word2 ..." syntax. */
307 char buf[UNICODE_CHARNAME_MAX_LENGTH];
312 if (!(c >= ' ' && c <= '~'))
314 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
321 /* Convert the constituents to uint16_t words. */
322 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
323 uint16_t *wordptr = words;
325 const char *p1 = buf;
331 while (p2 < ptr && *p2 != ' ')
333 word = unicode_name_word_lookup (p1, p2 - p1);
336 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
343 /* Special case for Hangul syllables. Keeps the tables small. */
344 if (wordptr == &words[2]
345 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
346 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
348 /* Split the last word [p1..ptr) into three parts:
359 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
360 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
361 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
362 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
367 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
368 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
373 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
374 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
375 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
376 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
377 || *p4 == 'S' || *p4 == 'T'))
381 unsigned int n1 = p2 - p1;
382 unsigned int n2 = p3 - p2;
383 unsigned int n3 = p4 - p3;
385 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
389 for (index1 = 0; index1 < 19; index1++)
390 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
391 && jamo_initial_short_name[index1][n1] == '\0')
395 for (index2 = 0; index2 < 21; index2++)
396 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
397 && jamo_medial_short_name[index2][n2] == '\0')
401 for (index3 = 0; index3 < 28; index3++)
402 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
403 && jamo_final_short_name[index3][n3] == '\0')
405 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
414 /* Special case for CJK compatibility ideographs. Keeps the
416 if (wordptr == &words[2]
417 && words[0] == UNICODE_CHARNAME_WORD_CJK
418 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
421 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
423 const char *p2 = p1 + 10;
431 if (*p2 >= '0' && *p2 <= '9')
433 else if (*p2 >= 'A' && *p2 <= 'F')
434 c += (*p2 - 'A' + 10);
440 if ((c >= 0xF900 && c <= 0xFA2D)
441 || (c >= 0xFA30 && c <= 0xFA6A)
442 || (c >= 0xFA70 && c <= 0xFAD9)
443 || (c >= 0x2F800 && c <= 0x2FA1D))
457 /* Multiply by 2, to simplify later comparisons. */
458 unsigned int words_length = wordptr - words;
460 int i = words_length - 1;
461 words[i] = 2 * words[i];
463 words[i] = 2 * words[i] + 1;
465 /* Binary search in unicode_name_to_code. */
468 unsigned int i2 = SIZEOF (unicode_name_to_code);
471 unsigned int i = (i1 + i2) >> 1;
472 const uint16_t *w = words;
473 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
474 unsigned int n = words_length;
481 /* Note here: i1 < i < i2. */
489 /* Note here: i1 <= i < i2. */
496 unsigned int c = unicode_name_to_code[i].code;
498 /* Undo the transformation to 16-bit space. */
499 static const unsigned int offset[12] =
501 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
502 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
505 return c + offset[c >> 12];
514 return UNINAME_INVALID;