1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU Library General Public License as published
6 by the Free Software Foundation; either version 2, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU Library General Public
15 License along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
/* Number of elements of the array A: its total size divided by the size of
   its first element.  A must be an actual array; for a pointer, sizeof
   yields the size of the pointer and the result is meaningless.
   The argument is parenthesized wherever it is expanded, so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
33 /* Table of Unicode character names, derived from UnicodeData.txt. */
/* The initializers are elided ("...") in this listing.  The notes below
   describe only how the code in this file reads each object.  */
/* Characters of all name words.  Words of the same length n are stored
   consecutively; the k-th of them starts at
   unicode_name_by_length[n].extra_offset + k * n.  */
36 static const char unicode_name_words[34594] = ...;
/* Exclusive upper bound for word indices; unicode_name_word asserts it.  */
37 #define UNICODE_CHARNAME_NUM_WORDS 5906
/* Indexed by word length n.  ind_offset is the index of the first word of
   length n, so these words have the indices from [n].ind_offset up to, but
   not including, [n+1].ind_offset.  extra_offset is the position in
   unicode_name_words at which they start.  */
38 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
/* Word indices that unicode_name_character compares with the first two
   words of a name, to recognize the Hangul syllable and the CJK
   compatibility ideograph special cases.  */
39 #define UNICODE_CHARNAME_WORD_HANGUL 3624
40 #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
41 #define UNICODE_CHARNAME_WORD_CJK 401
42 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
/* Word sequences of the names.  An element shifted right by one bit is a
   word index; the lowest bit is tested separately.
   NOTE(review): the lowest bit presumably tells whether another word
   follows - confirm against the table generator.  */
43 static const uint16_t unicode_names[62620] = ...;
/* Each entry pairs a character code, transformed to fit in 16 bits, with
   the position of its word sequence in unicode_names.
   unicode_name_to_code is binary-searched in unicode_name_character,
   unicode_code_to_name is binary-searched by code in
   unicode_character_name.  */
44 static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
45 static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
/* Longest name length accepted by unicode_name_character, and the size of
   its uppercased copy of the name.  */
46 #define UNICODE_CHARNAME_MAX_LENGTH 83
/* Capacity of the word array in unicode_name_character.  */
47 #define UNICODE_CHARNAME_MAX_WORDS 13
50 /* Returns the word with a given index.
   INDEX is the number of a word; it must be smaller than
   UNICODE_CHARNAME_NUM_WORDS, which is asserted.  The entry i of
   unicode_name_by_length whose index range contains INDEX is located by
   binary search.  The result points into unicode_name_words at
   extra_offset + (index - ind_offset) times i, that is, i serves as the
   size of each word of that entry.  Words are packed at that stride, so
   the result is presumably not NUL-terminated.
   NOTE(review): the statement that stores through LENGTHP is not visible
   in this listing; presumably the word length i is stored - confirm.  */
52 unicode_name_word (unsigned int index, unsigned int *lengthp)
58 assert (index < UNICODE_CHARNAME_NUM_WORDS);
60 /* Binary search for i with
61 unicode_name_by_length[i].ind_offset <= index
63 index < unicode_name_by_length[i+1].ind_offset
67 i2 = SIZEOF (unicode_name_by_length) - 1;
70 unsigned int i = (i1 + i2) >> 1;
71 if (unicode_name_by_length[i].ind_offset <= index)
77 assert (unicode_name_by_length[i].ind_offset <= index
78 && index < unicode_name_by_length[i+1].ind_offset);
80 return &unicode_name_words[unicode_name_by_length[i].extra_offset
81 + (index-unicode_name_by_length[i].ind_offset)*i];
84 /* Looks up the index of a word.
   WORD points to LENGTH characters; no terminating NUL is needed, since
   the length is passed explicitly.  The search is restricted to the words
   whose length equals LENGTH.
   NOTE(review): the return type and the return statements are not visible
   in this listing, so the value reported for an unknown word cannot be
   stated here - confirm against the full source.  */
86 unicode_name_word_lookup (const char *word, unsigned int length)
/* Only lengths from 1 up to SIZEOF (unicode_name_by_length) - 2 are
   searched; this keeps the [length+1] access below inside the table.  */
88 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
90 /* Binary search among the words of given length. */
91 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
92 unsigned int i0 = unicode_name_by_length[length].ind_offset;
/* One past the index of the last word of this length.  */
94 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
97 unsigned int i = (i1 + i2) >> 1;
/* Start of candidate word i: the words of this length are stored at a
   stride of length characters, beginning at extra_offset.  */
98 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
100 unsigned int n = length;
107 /* Note here: i1 < i < i2. */
113 /* Note here: i1 <= i < i2. */
126 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
127 sections 3.11 and 4.4. */
/* Short names of the 19 initial parts of a syllable, indexed by the index1
   of 0xAC00 + (index1 * 21 + index2) * 28 + index3.  Each entry holds at
   most 2 letters plus the terminating NUL.  Entry 11 is the empty
   string.  */
128 static const char jamo_initial_short_name[19][3] =
130 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
131 "C", "K", "T", "P", "H"
/* Short names of the 21 medial parts of a Hangul syllable, indexed by the
   index2 of 0xAC00 + (index1 * 21 + index2) * 28 + index3.  Each entry
   holds 1 to 3 letters plus the terminating NUL.  */
133 static const char jamo_medial_short_name[21][4] =
135 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
136 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
/* Short names of the 28 final parts of a Hangul syllable, indexed by the
   index3 of 0xAC00 + (index1 * 21 + index2) * 28 + index3.  Each entry
   holds at most 2 letters plus the terminating NUL.  Entry 0 is the empty
   string.  Entry 5 is "NJ", the short name that Jamo.txt of the Unicode
   Character Database assigns to U+11AC; with it, U+C549 for example is
   named HANGUL SYLLABLE ANJ.  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
144 /* Looks up the name of a Unicode character, in uppercase ASCII.
145 Returns the filled buf, or NULL if the character does not have a name.
   C is the code point and BUF a buffer supplied by the caller.  Three
   cases are visible below.  Hangul syllables (0xAC00..0xD7A3) get
   "HANGUL SYLLABLE " followed by text built from the three jamo short
   name tables.  The CJK compatibility ideograph ranges get
   "CJK COMPATIBILITY IDEOGRAPH-" followed by the code point in
   hexadecimal.  Every other code is transformed to 16 bits and searched
   in unicode_code_to_name.  According to the notes at each case, BUF
   needs 16 + 7 bytes, 28 + 5 bytes, or UNICODE_CHARNAME_MAX_LENGTH bytes
   respectively.  */
147 unicode_character_name (ucs4_t c, char *buf)
149 if (c >= 0xAC00 && c <= 0xD7A3)
151 /* Special case for Hangul syllables. Keeps the tables small. */
159 /* buf needs to have at least 16 + 7 bytes here. */
160 memcpy (buf, "HANGUL SYLLABLE ", 16);
/* tmp is split into index3 (0..27) and index2 (0..20).  The remaining
   quotient is presumably index1; its assignment is not visible here.  */
164 index3 = tmp % 28; tmp = tmp / 28;
165 index2 = tmp % 21; tmp = tmp / 21;
168 q = jamo_initial_short_name[index1];
171 q = jamo_medial_short_name[index2];
174 q = jamo_final_short_name[index3];
180 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
181 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
183 /* Special case for CJK compatibility ideographs. Keeps the tables
188 /* buf needs to have at least 28 + 5 bytes here. */
189 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
/* One hexadecimal digit per 4 bits, most significant first: 4 digits for
   c < 0x10000, otherwise 5.  */
192 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
194 unsigned int x = (c >> i) & 0xf;
/* Values 0..9 become '0'..'9', values 10..15 become 'A'..'F'.  */
195 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
202 const uint16_t *words;
204 /* Transform the code so that it fits in 16 bits. */
207 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
232 /* Binary search in unicode_code_to_name. */
/* i2 starts one past the last table entry.  */
234 unsigned int i2 = SIZEOF (unicode_code_to_name);
237 unsigned int i = (i1 + i2) >> 1;
238 if (unicode_code_to_name[i].code == c)
/* The name field is the position of the word sequence in unicode_names.  */
240 words = &unicode_names[unicode_code_to_name[i].name];
243 else if (unicode_code_to_name[i].code < c)
250 /* Note here: i1 < i < i2. */
253 else if (unicode_code_to_name[i].code > c)
260 /* Note here: i1 <= i < i2. */
267 /* Found it in unicode_code_to_name. Now concatenate the words. */
268 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
272 unsigned int wordlen;
/* The element without its lowest bit is the word index; the length of the
   word is received in wordlen.  */
273 const char *word = unicode_name_word (*words>>1, &wordlen);
276 while (--wordlen > 0);
/* NOTE(review): a clear lowest bit presumably marks the last word of the
   name - confirm against the full source.  */
277 if ((*words & 1) == 0)
289 /* Looks up the Unicode character with a given name, in upper- or lowercase
290 ASCII. Returns the character if found, or UNINAME_INVALID if not found.
   NAME is a NUL-terminated string.  The visible steps are: copy NAME in
   uppercase, split it at spaces and convert each word to its word index,
   then either handle one of two special cases or binary-search the word
   sequence in unicode_name_to_code.
   Special case 1: two converted words equal to the HANGUL and SYLLABLE
   word indices.  The remaining word is split into three parts that are
   looked up in the jamo short name tables, and the result is
   0xAC00 + (index1 * 21 + index2) * 28 + index3.
   Special case 2: two converted words equal to the CJK and COMPATIBILITY
   word indices, followed by "IDEOGRAPH-" and hexadecimal digits.  The
   parsed value is tested against the four compatibility ideograph
   ranges.
   In the general case the 16-bit code found in the table is mapped back
   by adding offset[c >> 12].  */
292 unicode_name_character (const char *name)
/* NOTE(review): strlen returns size_t; the value is narrowed to
   unsigned int here.  */
294 unsigned int len = strlen (name);
/* Names of length 0 or 1, and names longer than
   UNICODE_CHARNAME_MAX_LENGTH, are not processed by this branch.  */
295 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
297 /* Test for "word1 word2 ..." syntax. */
/* Receives the uppercased copy of NAME.  Its size equals the longest
   accepted length, so there is no room for a terminating NUL in that
   case; the end of the copy is tracked with ptr.  */
298 char buf[UNICODE_CHARNAME_MAX_LENGTH];
/* Detects a character outside the printable ASCII range ' ' .. '~'.  */
303 if (!(c >= ' ' && c <= '~'))
/* Lowercase letters are stored as uppercase, everything else unchanged.  */
305 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
312 /* Convert the constituents to uint16_t words. */
313 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
314 uint16_t *wordptr = words;
316 const char *p1 = buf;
/* Advance p2 to the end of the current word: the next space, or the end
   of the copied name.  */
322 while (p2 < ptr && *p2 != ' ')
/* p2 - p1 is the length of that word.  */
324 word = unicode_name_word_lookup (p1, p2 - p1);
/* True when all UNICODE_CHARNAME_MAX_WORDS slots are already in use.  */
327 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
334 /* Special case for Hangul syllables. Keeps the tables small. */
/* Applies when exactly two words have been converted so far and they are
   the HANGUL and SYLLABLE word indices.  */
335 if (wordptr == &words[2]
336 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
337 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
339 /* Split the last word [p1..ptr) into three parts:
350 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
351 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
352 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
353 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
358 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
359 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
364 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
365 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
366 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
367 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
368 || *p4 == 'S' || *p4 == 'T'))
372 unsigned int n1 = p2 - p1;
373 unsigned int n2 = p3 - p2;
374 unsigned int n3 = p4 - p3;
376 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
380 for (index1 = 0; index1 < 19; index1++)
381 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
382 && jamo_initial_short_name[index1][n1] == '\0')
386 for (index2 = 0; index2 < 21; index2++)
387 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
388 && jamo_medial_short_name[index2][n2] == '\0')
392 for (index3 = 0; index3 < 28; index3++)
393 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
394 && jamo_final_short_name[index3][n3] == '\0')
396 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
405 /* Special case for CJK compatibility ideographs. Keeps the
407 if (wordptr == &words[2]
408 && words[0] == UNICODE_CHARNAME_WORD_CJK
409 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
412 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
414 const char *p2 = p1 + 10;
422 if (*p2 >= '0' && *p2 <= '9')
424 else if (*p2 >= 'A' && *p2 <= 'F')
425 c += (*p2 - 'A' + 10);
431 if ((c >= 0xF900 && c <= 0xFA2D)
432 || (c >= 0xFA30 && c <= 0xFA6A)
433 || (c >= 0xFA70 && c <= 0xFAD9)
434 || (c >= 0x2F800 && c <= 0x2FA1D))
448 /* Multiply by 2, to simplify later comparisons. */
/* Number of words collected from the name.  */
449 unsigned int words_length = wordptr - words;
/* The last word becomes 2 * index.  The statement after it yields
   2 * index + 1, presumably for each earlier word; the loop around it is
   not visible in this listing.  */
451 int i = words_length - 1;
452 words[i] = 2 * words[i];
454 words[i] = 2 * words[i] + 1;
456 /* Binary search in unicode_name_to_code. */
/* i2 starts one past the last table entry.  */
459 unsigned int i2 = SIZEOF (unicode_name_to_code);
462 unsigned int i = (i1 + i2) >> 1;
/* w, p and n are set up to compare the words_length query words with the
   word sequence of entry i; the comparison itself is not visible here.  */
463 const uint16_t *w = words;
464 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
465 unsigned int n = words_length;
472 /* Note here: i1 < i < i2. */
480 /* Note here: i1 <= i < i2. */
/* 16-bit code of the matching entry.  */
487 unsigned int c = unicode_name_to_code[i].code;
489 /* Undo the transformation to 16-bit space. */
/* offset is indexed by the upper 4 bits of the 16-bit code.
   NOTE(review): it has 11 entries, so this relies on every table code
   being below 0xB000 - confirm against the table generator.  */
490 static const unsigned int offset[11] =
492 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
493 0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
496 return c + offset[c >> 12];
505 return UNINAME_INVALID;