/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/ArabicShaping.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      <unicode-version>
   (The last argument is not visible in this copy of the file; the output
   functions take a Unicode version string, so that is presumably what
   follows -- TODO confirm.)  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ========================================================================= */

/* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   String fields point to either "" or a heap copy of the field text;
   the three case mappings hold a code point or NONE.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   the numeric (case mapping) fields.  NONE is the all-ones unsigned int.  */
#define NONE (~(unsigned int)0)
73 /* The entire contents of the UnicodeData.txt file. */
74 struct unicode_attribute unicode_attributes [0x110000];
76 /* Stores in unicode_attributes[i] the values from the given fields. */
78 fill_attribute (unsigned int i,
79 const char *field1, const char *field2,
80 const char *field3, const char *field4,
81 const char *field5, const char *field6,
82 const char *field7, const char *field8,
83 const char *field9, const char *field10,
84 const char *field11, const char *field12,
85 const char *field13, const char *field14)
87 struct unicode_attribute * uni;
91 fprintf (stderr, "index too large\n");
94 if (strcmp (field2, "Cs") == 0)
95 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
97 uni = &unicode_attributes[i];
98 /* Copy the strings. */
99 uni->name = strdup (field1);
100 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
101 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
102 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
103 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
104 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
105 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
106 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
107 uni->mirrored = (field9[0] == 'Y');
108 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
109 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
110 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
111 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
112 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the #define line was missing from this copy of the file;
   120 is the value used by the upstream generator -- confirm.  */
#define FIELDLEN 120

/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   was reached before DELIM; BUFFER is then not NUL-terminated).
   Exits with an error message if the field does not fit in BUFFER.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators. Silently convert to LF. */
      if (c == '\r')
        continue;

      /* Put c into the buffer. */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
150 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
153 fill_attributes (const char *unicodedata_filename)
157 char field0[FIELDLEN];
158 char field1[FIELDLEN];
159 char field2[FIELDLEN];
160 char field3[FIELDLEN];
161 char field4[FIELDLEN];
162 char field5[FIELDLEN];
163 char field6[FIELDLEN];
164 char field7[FIELDLEN];
165 char field8[FIELDLEN];
166 char field9[FIELDLEN];
167 char field10[FIELDLEN];
168 char field11[FIELDLEN];
169 char field12[FIELDLEN];
170 char field13[FIELDLEN];
171 char field14[FIELDLEN];
174 for (i = 0; i < 0x110000; i++)
175 unicode_attributes[i].name = NULL;
177 stream = fopen (unicodedata_filename, "r");
180 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
189 n = getfield (stream, field0, ';');
190 n += getfield (stream, field1, ';');
191 n += getfield (stream, field2, ';');
192 n += getfield (stream, field3, ';');
193 n += getfield (stream, field4, ';');
194 n += getfield (stream, field5, ';');
195 n += getfield (stream, field6, ';');
196 n += getfield (stream, field7, ';');
197 n += getfield (stream, field8, ';');
198 n += getfield (stream, field9, ';');
199 n += getfield (stream, field10, ';');
200 n += getfield (stream, field11, ';');
201 n += getfield (stream, field12, ';');
202 n += getfield (stream, field13, ';');
203 n += getfield (stream, field14, '\n');
208 fprintf (stderr, "short line in '%s':%d\n",
209 unicodedata_filename, lineno);
212 i = strtoul (field0, NULL, 16);
214 && strlen (field1) >= 9
215 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
217 /* Deal with a range. */
219 n = getfield (stream, field0, ';');
220 n += getfield (stream, field1, ';');
221 n += getfield (stream, field2, ';');
222 n += getfield (stream, field3, ';');
223 n += getfield (stream, field4, ';');
224 n += getfield (stream, field5, ';');
225 n += getfield (stream, field6, ';');
226 n += getfield (stream, field7, ';');
227 n += getfield (stream, field8, ';');
228 n += getfield (stream, field9, ';');
229 n += getfield (stream, field10, ';');
230 n += getfield (stream, field11, ';');
231 n += getfield (stream, field12, ';');
232 n += getfield (stream, field13, ';');
233 n += getfield (stream, field14, '\n');
236 fprintf (stderr, "missing end range in '%s':%d\n",
237 unicodedata_filename, lineno);
240 if (!(field1[0] == '<'
241 && strlen (field1) >= 8
242 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
244 fprintf (stderr, "missing end range in '%s':%d\n",
245 unicodedata_filename, lineno);
248 field1[strlen (field1) - 7] = '\0';
249 j = strtoul (field0, NULL, 16);
251 fill_attribute (i, field1+1, field2, field3, field4, field5,
252 field6, field7, field8, field9, field10,
253 field11, field12, field13, field14);
257 /* Single character line */
258 fill_attribute (i, field1, field2, field3, field4, field5,
259 field6, field7, field8, field9, field10,
260 field11, field12, field13, field14);
264 if (ferror (stream) || fclose (stream))
266 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* ========================================================================= */

/* General category. */
/* See Unicode 3.0 book, section 4.5.  */

278 is_category_L (unsigned int ch)
280 return (unicode_attributes[ch].name != NULL
281 && unicode_attributes[ch].category[0] == 'L');
285 is_category_Lu (unsigned int ch)
287 return (unicode_attributes[ch].name != NULL
288 && unicode_attributes[ch].category[0] == 'L'
289 && unicode_attributes[ch].category[1] == 'u');
293 is_category_Ll (unsigned int ch)
295 return (unicode_attributes[ch].name != NULL
296 && unicode_attributes[ch].category[0] == 'L'
297 && unicode_attributes[ch].category[1] == 'l');
301 is_category_Lt (unsigned int ch)
303 return (unicode_attributes[ch].name != NULL
304 && unicode_attributes[ch].category[0] == 'L'
305 && unicode_attributes[ch].category[1] == 't');
309 is_category_Lm (unsigned int ch)
311 return (unicode_attributes[ch].name != NULL
312 && unicode_attributes[ch].category[0] == 'L'
313 && unicode_attributes[ch].category[1] == 'm');
317 is_category_Lo (unsigned int ch)
319 return (unicode_attributes[ch].name != NULL
320 && unicode_attributes[ch].category[0] == 'L'
321 && unicode_attributes[ch].category[1] == 'o');
325 is_category_M (unsigned int ch)
327 return (unicode_attributes[ch].name != NULL
328 && unicode_attributes[ch].category[0] == 'M');
332 is_category_Mn (unsigned int ch)
334 return (unicode_attributes[ch].name != NULL
335 && unicode_attributes[ch].category[0] == 'M'
336 && unicode_attributes[ch].category[1] == 'n');
340 is_category_Mc (unsigned int ch)
342 return (unicode_attributes[ch].name != NULL
343 && unicode_attributes[ch].category[0] == 'M'
344 && unicode_attributes[ch].category[1] == 'c');
348 is_category_Me (unsigned int ch)
350 return (unicode_attributes[ch].name != NULL
351 && unicode_attributes[ch].category[0] == 'M'
352 && unicode_attributes[ch].category[1] == 'e');
356 is_category_N (unsigned int ch)
358 return (unicode_attributes[ch].name != NULL
359 && unicode_attributes[ch].category[0] == 'N');
363 is_category_Nd (unsigned int ch)
365 return (unicode_attributes[ch].name != NULL
366 && unicode_attributes[ch].category[0] == 'N'
367 && unicode_attributes[ch].category[1] == 'd');
371 is_category_Nl (unsigned int ch)
373 return (unicode_attributes[ch].name != NULL
374 && unicode_attributes[ch].category[0] == 'N'
375 && unicode_attributes[ch].category[1] == 'l');
379 is_category_No (unsigned int ch)
381 return (unicode_attributes[ch].name != NULL
382 && unicode_attributes[ch].category[0] == 'N'
383 && unicode_attributes[ch].category[1] == 'o');
387 is_category_P (unsigned int ch)
389 return (unicode_attributes[ch].name != NULL
390 && unicode_attributes[ch].category[0] == 'P');
394 is_category_Pc (unsigned int ch)
396 return (unicode_attributes[ch].name != NULL
397 && unicode_attributes[ch].category[0] == 'P'
398 && unicode_attributes[ch].category[1] == 'c');
402 is_category_Pd (unsigned int ch)
404 return (unicode_attributes[ch].name != NULL
405 && unicode_attributes[ch].category[0] == 'P'
406 && unicode_attributes[ch].category[1] == 'd');
410 is_category_Ps (unsigned int ch)
412 return (unicode_attributes[ch].name != NULL
413 && unicode_attributes[ch].category[0] == 'P'
414 && unicode_attributes[ch].category[1] == 's');
418 is_category_Pe (unsigned int ch)
420 return (unicode_attributes[ch].name != NULL
421 && unicode_attributes[ch].category[0] == 'P'
422 && unicode_attributes[ch].category[1] == 'e');
426 is_category_Pi (unsigned int ch)
428 return (unicode_attributes[ch].name != NULL
429 && unicode_attributes[ch].category[0] == 'P'
430 && unicode_attributes[ch].category[1] == 'i');
434 is_category_Pf (unsigned int ch)
436 return (unicode_attributes[ch].name != NULL
437 && unicode_attributes[ch].category[0] == 'P'
438 && unicode_attributes[ch].category[1] == 'f');
442 is_category_Po (unsigned int ch)
444 return (unicode_attributes[ch].name != NULL
445 && unicode_attributes[ch].category[0] == 'P'
446 && unicode_attributes[ch].category[1] == 'o');
450 is_category_S (unsigned int ch)
452 return (unicode_attributes[ch].name != NULL
453 && unicode_attributes[ch].category[0] == 'S');
457 is_category_Sm (unsigned int ch)
459 return (unicode_attributes[ch].name != NULL
460 && unicode_attributes[ch].category[0] == 'S'
461 && unicode_attributes[ch].category[1] == 'm');
465 is_category_Sc (unsigned int ch)
467 return (unicode_attributes[ch].name != NULL
468 && unicode_attributes[ch].category[0] == 'S'
469 && unicode_attributes[ch].category[1] == 'c');
473 is_category_Sk (unsigned int ch)
475 return (unicode_attributes[ch].name != NULL
476 && unicode_attributes[ch].category[0] == 'S'
477 && unicode_attributes[ch].category[1] == 'k');
481 is_category_So (unsigned int ch)
483 return (unicode_attributes[ch].name != NULL
484 && unicode_attributes[ch].category[0] == 'S'
485 && unicode_attributes[ch].category[1] == 'o');
489 is_category_Z (unsigned int ch)
491 return (unicode_attributes[ch].name != NULL
492 && unicode_attributes[ch].category[0] == 'Z');
496 is_category_Zs (unsigned int ch)
498 return (unicode_attributes[ch].name != NULL
499 && unicode_attributes[ch].category[0] == 'Z'
500 && unicode_attributes[ch].category[1] == 's');
504 is_category_Zl (unsigned int ch)
506 return (unicode_attributes[ch].name != NULL
507 && unicode_attributes[ch].category[0] == 'Z'
508 && unicode_attributes[ch].category[1] == 'l');
512 is_category_Zp (unsigned int ch)
514 return (unicode_attributes[ch].name != NULL
515 && unicode_attributes[ch].category[0] == 'Z'
516 && unicode_attributes[ch].category[1] == 'p');
520 is_category_C (unsigned int ch)
522 return (unicode_attributes[ch].name == NULL
523 || unicode_attributes[ch].category[0] == 'C');
527 is_category_Cc (unsigned int ch)
529 return (unicode_attributes[ch].name != NULL
530 && unicode_attributes[ch].category[0] == 'C'
531 && unicode_attributes[ch].category[1] == 'c');
535 is_category_Cf (unsigned int ch)
537 return (unicode_attributes[ch].name != NULL
538 && unicode_attributes[ch].category[0] == 'C'
539 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the surrogate range 0xD800..0xDFFF.
   This is decided by range alone, without consulting unicode_attributes[],
   because surrogate lines are not stored there.  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}

549 is_category_Co (unsigned int ch)
551 return (unicode_attributes[ch].name != NULL
552 && unicode_attributes[ch].category[0] == 'C'
553 && unicode_attributes[ch].category[1] == 'o');
557 is_category_Cn (unsigned int ch)
559 return (unicode_attributes[ch].name == NULL
560 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   (0 .. 0x10FFFF) for which PREDICATE returns true: "0xAAAA..0xBBBB" for a
   run of two or more, "0xAAAA" for a single code point.
   Exits with an error message if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output. */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds; the outer loop
           then resumes after the run.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment: a license header, an include of
   "test-predicate-part1.h", one "{ first, last }" initializer per maximal
   run of code points for which PREDICATE is true, a definition of
   PREDICATE(c) as EXPRESSION, and an include of "test-predicate-part2.h".
   Exits with an error message if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit the runs separated by ",\n", with no separator after the last.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
671 /* Construction of sparse 3-level tables. */
672 #define TABLE predicate_table
673 #define xmalloc malloc
674 #define xrealloc realloc
675 #include "3levelbit.h"
677 /* Output a boolean property in a three-level bitmap. */
679 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
683 struct predicate_table t;
684 unsigned int level1_offset, level2_offset, level3_offset;
686 stream = fopen (filename, "w");
689 fprintf (stderr, "cannot open '%s' for writing\n", filename);
693 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
694 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
695 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
700 predicate_table_init (&t);
702 for (ch = 0; ch < 0x110000; ch++)
704 predicate_table_add (&t, ch);
706 predicate_table_finalize (&t);
708 /* Offsets in t.result, in memory of this process. */
710 5 * sizeof (uint32_t);
712 5 * sizeof (uint32_t)
713 + t.level1_size * sizeof (uint32_t);
715 5 * sizeof (uint32_t)
716 + t.level1_size * sizeof (uint32_t)
717 + (t.level2_size << t.q) * sizeof (uint32_t);
719 for (i = 0; i < 5; i++)
721 fprintf (stream, "#define header_%d %d\n", i,
722 ((uint32_t *) t.result)[i]);
724 fprintf (stream, "static const\n");
725 fprintf (stream, "struct\n");
726 fprintf (stream, " {\n");
727 fprintf (stream, " int header[1];\n");
728 fprintf (stream, " int level1[%zu];\n", t.level1_size);
729 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
730 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
731 fprintf (stream, " }\n");
732 fprintf (stream, "%s =\n", name);
733 fprintf (stream, "{\n");
734 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
735 fprintf (stream, " {");
736 if (t.level1_size > 1)
737 fprintf (stream, "\n ");
738 for (i = 0; i < t.level1_size; i++)
741 if (i > 0 && (i % 1) == 0)
742 fprintf (stream, "\n ");
743 offset = ((uint32_t *) (t.result + level1_offset))[i];
745 fprintf (stream, " %5d", -1);
747 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
748 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
749 if (i+1 < t.level1_size)
750 fprintf (stream, ",");
752 if (t.level1_size > 1)
753 fprintf (stream, "\n ");
754 fprintf (stream, " },\n");
755 fprintf (stream, " {");
756 if (t.level2_size << t.q > 1)
757 fprintf (stream, "\n ");
758 for (i = 0; i < t.level2_size << t.q; i++)
761 if (i > 0 && (i % 1) == 0)
762 fprintf (stream, "\n ");
763 offset = ((uint32_t *) (t.result + level2_offset))[i];
765 fprintf (stream, " %5d", -1);
767 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
768 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
769 if (i+1 < t.level2_size << t.q)
770 fprintf (stream, ",");
772 if (t.level2_size << t.q > 1)
773 fprintf (stream, "\n ");
774 fprintf (stream, " },\n");
775 fprintf (stream, " {");
776 if (t.level3_size << t.p > 4)
777 fprintf (stream, "\n ");
778 for (i = 0; i < t.level3_size << t.p; i++)
780 if (i > 0 && (i % 4) == 0)
781 fprintf (stream, "\n ");
782 fprintf (stream, " 0x%08X",
783 ((uint32_t *) (t.result + level3_offset))[i]);
784 if (i+1 < t.level3_size << t.p)
785 fprintf (stream, ",");
787 if (t.level3_size << t.p > 4)
788 fprintf (stream, "\n ");
789 fprintf (stream, " }\n");
790 fprintf (stream, "};\n");
792 if (ferror (stream) || fclose (stream))
794 fprintf (stderr, "error writing to '%s'\n", filename);
799 /* Output all categories. */
801 output_categories (const char *version)
803 #define CATEGORY(C) \
804 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
805 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
806 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Each two-letter category has one
   bit; each one-letter category is the union of the bits of its
   subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};

889 general_category_byname (const char *category_name)
891 if (category_name[0] != '\0'
892 && (category_name[1] == '\0' || category_name[2] == '\0'))
893 switch (category_name[0])
896 switch (category_name[1])
898 case '\0': return UC_CATEGORY_MASK_L;
899 case 'u': return UC_CATEGORY_MASK_Lu;
900 case 'l': return UC_CATEGORY_MASK_Ll;
901 case 't': return UC_CATEGORY_MASK_Lt;
902 case 'm': return UC_CATEGORY_MASK_Lm;
903 case 'o': return UC_CATEGORY_MASK_Lo;
907 switch (category_name[1])
909 case '\0': return UC_CATEGORY_MASK_M;
910 case 'n': return UC_CATEGORY_MASK_Mn;
911 case 'c': return UC_CATEGORY_MASK_Mc;
912 case 'e': return UC_CATEGORY_MASK_Me;
916 switch (category_name[1])
918 case '\0': return UC_CATEGORY_MASK_N;
919 case 'd': return UC_CATEGORY_MASK_Nd;
920 case 'l': return UC_CATEGORY_MASK_Nl;
921 case 'o': return UC_CATEGORY_MASK_No;
925 switch (category_name[1])
927 case '\0': return UC_CATEGORY_MASK_P;
928 case 'c': return UC_CATEGORY_MASK_Pc;
929 case 'd': return UC_CATEGORY_MASK_Pd;
930 case 's': return UC_CATEGORY_MASK_Ps;
931 case 'e': return UC_CATEGORY_MASK_Pe;
932 case 'i': return UC_CATEGORY_MASK_Pi;
933 case 'f': return UC_CATEGORY_MASK_Pf;
934 case 'o': return UC_CATEGORY_MASK_Po;
938 switch (category_name[1])
940 case '\0': return UC_CATEGORY_MASK_S;
941 case 'm': return UC_CATEGORY_MASK_Sm;
942 case 'c': return UC_CATEGORY_MASK_Sc;
943 case 'k': return UC_CATEGORY_MASK_Sk;
944 case 'o': return UC_CATEGORY_MASK_So;
948 switch (category_name[1])
950 case '\0': return UC_CATEGORY_MASK_Z;
951 case 's': return UC_CATEGORY_MASK_Zs;
952 case 'l': return UC_CATEGORY_MASK_Zl;
953 case 'p': return UC_CATEGORY_MASK_Zp;
957 switch (category_name[1])
959 case '\0': return UC_CATEGORY_MASK_C;
960 case 'c': return UC_CATEGORY_MASK_Cc;
961 case 'f': return UC_CATEGORY_MASK_Cf;
962 case 's': return UC_CATEGORY_MASK_Cs;
963 case 'o': return UC_CATEGORY_MASK_Co;
964 case 'n': return UC_CATEGORY_MASK_Cn;
968 /* Invalid category name. */
972 /* Construction of sparse 3-level tables. */
973 #define TABLE category_table
974 #define ELEMENT uint8_t
975 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
976 #define xmalloc malloc
977 #define xrealloc realloc
980 /* Output the per-character category table. */
982 output_category (const char *filename, const char *version)
986 struct category_table t;
987 unsigned int level1_offset, level2_offset, level3_offset;
988 uint16_t *level3_packed;
990 stream = fopen (filename, "w");
993 fprintf (stderr, "cannot open '%s' for writing\n", filename);
997 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
998 fprintf (stream, "/* Categories of Unicode characters. */\n");
999 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1004 category_table_init (&t);
1006 for (ch = 0; ch < 0x110000; ch++)
1009 unsigned int log2_value;
1011 if (is_category_Cs (ch))
1012 value = UC_CATEGORY_MASK_Cs;
1013 else if (unicode_attributes[ch].name != NULL)
1014 value = general_category_byname (unicode_attributes[ch].category);
1018 /* Now value should contain exactly one bit. */
1019 if (value == 0 || ((value & (value - 1)) != 0))
1022 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1024 category_table_add (&t, ch, log2_value);
1027 category_table_finalize (&t);
1029 /* Offsets in t.result, in memory of this process. */
1031 5 * sizeof (uint32_t);
1033 5 * sizeof (uint32_t)
1034 + t.level1_size * sizeof (uint32_t);
1036 5 * sizeof (uint32_t)
1037 + t.level1_size * sizeof (uint32_t)
1038 + (t.level2_size << t.q) * sizeof (uint32_t);
1040 for (i = 0; i < 5; i++)
1041 fprintf (stream, "#define category_header_%d %d\n", i,
1042 ((uint32_t *) t.result)[i]);
1043 fprintf (stream, "static const\n");
1044 fprintf (stream, "struct\n");
1045 fprintf (stream, " {\n");
1046 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1047 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1048 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1049 (1 << t.p) * 5 / 16);
1050 fprintf (stream, " }\n");
1051 fprintf (stream, "u_category =\n");
1052 fprintf (stream, "{\n");
1053 fprintf (stream, " {");
1054 if (t.level1_size > 8)
1055 fprintf (stream, "\n ");
1056 for (i = 0; i < t.level1_size; i++)
1059 if (i > 0 && (i % 8) == 0)
1060 fprintf (stream, "\n ");
1061 offset = ((uint32_t *) (t.result + level1_offset))[i];
1063 fprintf (stream, " %5d", -1);
1065 fprintf (stream, " %5zu",
1066 (offset - level2_offset) / sizeof (uint32_t));
1067 if (i+1 < t.level1_size)
1068 fprintf (stream, ",");
1070 if (t.level1_size > 8)
1071 fprintf (stream, "\n ");
1072 fprintf (stream, " },\n");
1073 fprintf (stream, " {");
1074 if (t.level2_size << t.q > 8)
1075 fprintf (stream, "\n ");
1076 for (i = 0; i < t.level2_size << t.q; i++)
1079 if (i > 0 && (i % 8) == 0)
1080 fprintf (stream, "\n ");
1081 offset = ((uint32_t *) (t.result + level2_offset))[i];
1083 fprintf (stream, " %5d", -1);
1085 fprintf (stream, " %5zu",
1086 (offset - level3_offset) / sizeof (uint8_t));
1087 if (i+1 < t.level2_size << t.q)
1088 fprintf (stream, ",");
1090 if (t.level2_size << t.q > 8)
1091 fprintf (stream, "\n ");
1092 fprintf (stream, " },\n");
1093 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1094 not 32-bit units, in order to make the lookup function easier. */
1097 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1098 for (i = 0; i < t.level3_size << t.p; i++)
1100 unsigned int j = (i * 5) / 16;
1101 unsigned int k = (i * 5) % 16;
1102 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1103 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1104 level3_packed[j] = value & 0xffff;
1105 level3_packed[j+1] = value >> 16;
1107 fprintf (stream, " {");
1108 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1109 fprintf (stream, "\n ");
1110 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1112 if (i > 0 && (i % 8) == 0)
1113 fprintf (stream, "\n ");
1114 fprintf (stream, " 0x%04x", level3_packed[i]);
1115 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1116 fprintf (stream, ",");
1118 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1119 fprintf (stream, "\n ");
1120 fprintf (stream, " }\n");
1121 free (level3_packed);
1122 fprintf (stream, "};\n");
1124 if (ferror (stream) || fclose (stream))
1126 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2.  */

1137 /* Construction of sparse 3-level tables. */
1138 #define TABLE combclass_table
1139 #define ELEMENT uint8_t
1141 #define xmalloc malloc
1142 #define xrealloc realloc
1145 /* Output the per-character combining class table. */
1147 output_combclass (const char *filename, const char *version)
1151 struct combclass_table t;
1152 unsigned int level1_offset, level2_offset, level3_offset;
1154 stream = fopen (filename, "w");
1157 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1161 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1162 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1163 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1168 combclass_table_init (&t);
1170 for (ch = 0; ch < 0x110000; ch++)
1171 if (unicode_attributes[ch].name != NULL)
1173 int value = atoi (unicode_attributes[ch].combining);
1174 if (!(value >= 0 && value <= 255))
1176 combclass_table_add (&t, ch, value);
1179 combclass_table_finalize (&t);
1181 /* Offsets in t.result, in memory of this process. */
1183 5 * sizeof (uint32_t);
1185 5 * sizeof (uint32_t)
1186 + t.level1_size * sizeof (uint32_t);
1188 5 * sizeof (uint32_t)
1189 + t.level1_size * sizeof (uint32_t)
1190 + (t.level2_size << t.q) * sizeof (uint32_t);
1192 for (i = 0; i < 5; i++)
1193 fprintf (stream, "#define combclass_header_%d %d\n", i,
1194 ((uint32_t *) t.result)[i]);
1195 fprintf (stream, "static const\n");
1196 fprintf (stream, "struct\n");
1197 fprintf (stream, " {\n");
1198 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1199 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1200 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1201 fprintf (stream, " }\n");
1202 fprintf (stream, "u_combclass =\n");
1203 fprintf (stream, "{\n");
1204 fprintf (stream, " {");
1205 if (t.level1_size > 8)
1206 fprintf (stream, "\n ");
1207 for (i = 0; i < t.level1_size; i++)
1210 if (i > 0 && (i % 8) == 0)
1211 fprintf (stream, "\n ");
1212 offset = ((uint32_t *) (t.result + level1_offset))[i];
1214 fprintf (stream, " %5d", -1);
1216 fprintf (stream, " %5zu",
1217 (offset - level2_offset) / sizeof (uint32_t));
1218 if (i+1 < t.level1_size)
1219 fprintf (stream, ",");
1221 if (t.level1_size > 8)
1222 fprintf (stream, "\n ");
1223 fprintf (stream, " },\n");
1224 fprintf (stream, " {");
1225 if (t.level2_size << t.q > 8)
1226 fprintf (stream, "\n ");
1227 for (i = 0; i < t.level2_size << t.q; i++)
1230 if (i > 0 && (i % 8) == 0)
1231 fprintf (stream, "\n ");
1232 offset = ((uint32_t *) (t.result + level2_offset))[i];
1234 fprintf (stream, " %5d", -1);
1236 fprintf (stream, " %5zu",
1237 (offset - level3_offset) / sizeof (uint8_t));
1238 if (i+1 < t.level2_size << t.q)
1239 fprintf (stream, ",");
1241 if (t.level2_size << t.q > 8)
1242 fprintf (stream, "\n ");
1243 fprintf (stream, " },\n");
1244 fprintf (stream, " {");
1245 if (t.level3_size << t.p > 8)
1246 fprintf (stream, "\n ");
1247 for (i = 0; i < t.level3_size << t.p; i++)
1249 if (i > 0 && (i % 8) == 0)
1250 fprintf (stream, "\n ");
1251 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1252 if (i+1 < t.level3_size << t.p)
1253 fprintf (stream, ",");
1255 if (t.level3_size << t.p > 8)
1256 fprintf (stream, "\n ");
1257 fprintf (stream, " }\n");
1258 fprintf (stream, "};\n");
1260 if (ferror (stream) || fclose (stream))
1262 fprintf (stderr, "error writing to '%s'\n", filename);
1267 /* ========================================================================= */
1269 /* Bidirectional category. */
1270 /* See Unicode 3.0 book, section 4.3. */
1275 UC_BIDI_L, /* Left-to-Right */
1276 UC_BIDI_LRE, /* Left-to-Right Embedding */
1277 UC_BIDI_LRO, /* Left-to-Right Override */
1278 UC_BIDI_R, /* Right-to-Left */
1279 UC_BIDI_AL, /* Right-to-Left Arabic */
1280 UC_BIDI_RLE, /* Right-to-Left Embedding */
1281 UC_BIDI_RLO, /* Right-to-Left Override */
1282 UC_BIDI_PDF, /* Pop Directional Format */
1283 UC_BIDI_EN, /* European Number */
1284 UC_BIDI_ES, /* European Number Separator */
1285 UC_BIDI_ET, /* European Number Terminator */
1286 UC_BIDI_AN, /* Arabic Number */
1287 UC_BIDI_CS, /* Common Number Separator */
1288 UC_BIDI_NSM, /* Non-Spacing Mark */
1289 UC_BIDI_BN, /* Boundary Neutral */
1290 UC_BIDI_B, /* Paragraph Separator */
1291 UC_BIDI_S, /* Segment Separator */
1292 UC_BIDI_WS, /* Whitespace */
1293 UC_BIDI_ON /* Other Neutral */
/* NOTE(review): 19 enumerators are listed above.  Assuming UC_BIDI_L is the
   first enumerator with value 0 (the start of the enum is not visible here),
   every value is below 32, which agrees with the 5 bits per entry that
   output_bidi_category uses when it packs its level3 array.  UC_BIDI_L is
   also the DEFAULT of bidi_category_table.  */
/* Look up a bidi category by its textual name CATEGORY_NAME.
   The name is matched one character at a time: the outer switch inspects
   category_name[0], the nested switches inspect category_name[1] and, for
   three-letter names, category_name[2].  A candidate is only accepted when
   the character right after the matched prefix is the terminating NUL, so a
   longer string that merely starts with a valid name does not pass that test.
   NOTE(review): the case labels and the return statements are not visible
   here; presumably each successful match returns the corresponding UC_BIDI_*
   value -- confirm against the complete function.  */
1297 bidi_category_byname (const char *category_name)
1299 switch (category_name[0])
1302 switch (category_name[1])
1305 if (category_name[2] == '\0')
1309 if (category_name[2] == '\0')
1315 switch (category_name[1])
1320 if (category_name[2] == '\0')
1326 switch (category_name[1])
1329 if (category_name[2] == '\0')
1335 switch (category_name[1])
1338 if (category_name[2] == '\0')
1342 if (category_name[2] == '\0')
1346 if (category_name[2] == '\0')
1352 switch (category_name[1])
1357 switch (category_name[2])
1360 if (category_name[3] == '\0')
1364 if (category_name[3] == '\0')
1372 switch (category_name[1])
1375 switch (category_name[2])
1378 if (category_name[3] == '\0')
1386 switch (category_name[1])
1389 if (category_name[2] == '\0')
1395 switch (category_name[1])
1398 switch (category_name[2])
1401 if (category_name[3] == '\0')
1409 switch (category_name[1])
1414 switch (category_name[2])
1417 if (category_name[3] == '\0')
1421 if (category_name[3] == '\0')
/* Single-letter names: here the NUL test is on category_name[1].  */
1429 if (category_name[1] == '\0')
1433 switch (category_name[1])
1436 if (category_name[2] == '\0')
/* Reached when none of the tests above accepted the name.  */
1442 /* Invalid bidi category name. */
/* Return the bidi category of the code point CH.
   If CH has an entry in unicode_attributes[] (its name is non-NULL), the
   category is taken from the bidi field of that entry, converted with
   bidi_category_byname.  Otherwise CH is tested against three groups of code
   point ranges, in the order written below.
   NOTE(review): the value chosen for each group, and for code points in none
   of the groups, is not visible here.  */
1447 get_bidi_category (unsigned int ch)
1449 if (unicode_attributes[ch].name != NULL)
1450 return bidi_category_byname (unicode_attributes[ch].bidi);
1453 /* The bidi category of unassigned characters depends on the range.
1454 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges.  */
1455 if ((ch >= 0x0590 && ch <= 0x05FF)
1456 || (ch >= 0x07FB && ch <= 0x08FF)
1457 || (ch >= 0xFB37 && ch <= 0xFB45)
1458 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges, only tested when the first group did not match.  */
1460 else if ((ch >= 0x0600 && ch <= 0x07BF)
1461 || (ch >= 0x2064 && ch <= 0x2069)
1462 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1463 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group.  The two mask tests match every code point whose low 16 bits
   are 0xFFFE or 0xFFFF, whatever the upper bits are.  */
1465 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1466 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1467 || (ch & 0xFFFF) == 0xFFFE
1468 || (ch & 0xFFFF) == 0xFFFF
1469 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1476 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_bidi_category, which
   declares a struct bidi_category_table and calls bidi_category_table_init,
   bidi_category_table_add and bidi_category_table_finalize.
   TABLE is the name prefix, ELEMENT the type of one stored value and DEFAULT
   the default element value; xmalloc and xrealloc are mapped to the plain
   C library allocators.
   NOTE(review): the code that consumes these macros is not visible here.  */
1477 #define TABLE bidi_category_table
1478 #define ELEMENT uint8_t
1479 #define DEFAULT UC_BIDI_L
1480 #define xmalloc malloc
1481 #define xrealloc realloc
1484 /* Output the per-character bidi category table. */
/* Writes a generated C fragment to FILENAME: three banner comments, five
   bidi_category_header_N defines taken from the first five uint32_t words of
   t.result, and a static const struct named u_bidi_category that contains
   the arrays level1, level2 and level3.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.
   NOTE(review): VERSION is presumably the argument for the %s in the
   Generated-automatically banner; that argument line is not visible here.  */
1486 output_bidi_category (const char *filename, const char *version)
1490 struct bidi_category_table t;
1491 unsigned int level1_offset, level2_offset, level3_offset;
1492 uint16_t *level3_packed;
1494 stream = fopen (filename, "w");
1497 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1501 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1502 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1503 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1508 bidi_category_table_init (&t);
/* Enter the category of every code point from 0 to 0x10FFFF in the table.  */
1510 for (ch = 0; ch < 0x110000; ch++)
1512 int value = get_bidi_category (ch);
1514 bidi_category_table_add (&t, ch, value);
1517 bidi_category_table_finalize (&t);
1519 /* Offsets in t.result, in memory of this process. */
/* The three sums below describe the layout assumed for t.result: 5 uint32_t
   header words, then t.level1_size uint32_t entries, then
   (t.level2_size << t.q) uint32_t entries, then the level3 data.
   NOTE(review): the assignment targets are not visible here; presumably
   level1_offset, level2_offset and level3_offset in this order.  */
1521 5 * sizeof (uint32_t);
1523 5 * sizeof (uint32_t)
1524 + t.level1_size * sizeof (uint32_t);
1526 5 * sizeof (uint32_t)
1527 + t.level1_size * sizeof (uint32_t)
1528 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor defines.  */
1530 for (i = 0; i < 5; i++)
1531 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1532 ((uint32_t *) t.result)[i]);
1533 fprintf (stream, "static const\n");
1534 fprintf (stream, "struct\n");
1535 fprintf (stream, " {\n");
1536 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1537 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: 5 bits per entry, plus one extra
   unit, matching the packed array built further below.  */
1538 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1539 (1 << t.p) * 5 / 16);
1540 fprintf (stream, " }\n");
1541 fprintf (stream, "u_bidi_category =\n");
1542 fprintf (stream, "{\n");
1543 fprintf (stream, " {");
1544 if (t.level1_size > 8)
1545 fprintf (stream, "\n ");
/* level1: each entry read from t.result is printed either as -1 or as
   (offset - level2_offset) / sizeof (uint32_t), i.e. converted from a byte
   offset in t.result to an index in uint32_t units counted from
   level2_offset.  A line break is inserted after every eighth value.
   NOTE(review): the test that selects -1 is not visible here.  */
1546 for (i = 0; i < t.level1_size; i++)
1549 if (i > 0 && (i % 8) == 0)
1550 fprintf (stream, "\n ");
1551 offset = ((uint32_t *) (t.result + level1_offset))[i];
1553 fprintf (stream, " %5d", -1);
1555 fprintf (stream, " %5zu",
1556 (offset - level2_offset) / sizeof (uint32_t));
1557 if (i+1 < t.level1_size)
1558 fprintf (stream, ",");
1560 if (t.level1_size > 8)
1561 fprintf (stream, "\n ");
1562 fprintf (stream, " },\n");
1563 fprintf (stream, " {");
1564 if (t.level2_size << t.q > 8)
1565 fprintf (stream, "\n ");
/* level2: same scheme as level1, but the printed index is counted from
   level3_offset in units of sizeof (uint8_t), the ELEMENT size of this
   table.  */
1566 for (i = 0; i < t.level2_size << t.q; i++)
1569 if (i > 0 && (i % 8) == 0)
1570 fprintf (stream, "\n ");
1571 offset = ((uint32_t *) (t.result + level2_offset))[i];
1573 fprintf (stream, " %5d", -1);
1575 fprintf (stream, " %5zu",
1576 (offset - level3_offset) / sizeof (uint8_t));
1577 if (i+1 < t.level2_size << t.q)
1578 fprintf (stream, ",");
1580 if (t.level2_size << t.q > 8)
1581 fprintf (stream, "\n ");
1582 fprintf (stream, " },\n");
1583 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1584 not 32-bit units, in order to make the lookup function easier. */
/* The allocation is zero-filled and has one unit more than
   (number of entries) * 5 / 16, so that level3_packed[j+1] can be accessed
   for the last entry as well.
   NOTE(review): no check of the calloc result is visible here.  */
1587 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i starts at bit 5*i of the packed bit stream: j is the 16-bit unit
   containing that bit and k the bit position inside the unit.  Units j and
   j+1 are combined into one 32-bit value, the entry is OR-ed in at bit k,
   and both units are written back; this also covers an entry that straddles
   two units.  */
1588 for (i = 0; i < t.level3_size << t.p; i++)
1590 unsigned int j = (i * 5) / 16;
1591 unsigned int k = (i * 5) % 16;
1592 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1593 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1594 level3_packed[j] = value & 0xffff;
1595 level3_packed[j+1] = value >> 16;
1597 fprintf (stream, " {");
1598 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1599 fprintf (stream, "\n ");
/* Print all packed units, including the extra one, as 4-digit hex.  */
1600 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1602 if (i > 0 && (i % 8) == 0)
1603 fprintf (stream, "\n ");
1604 fprintf (stream, " 0x%04x", level3_packed[i]);
1605 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1606 fprintf (stream, ",");
1608 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1609 fprintf (stream, "\n ");
1610 fprintf (stream, " }\n");
1611 free (level3_packed);
1612 fprintf (stream, "};\n");
/* A stream error and a failing fclose are reported the same way.  */
1614 if (ferror (stream) || fclose (stream))
1616 fprintf (stderr, "error writing to '%s'\n", filename);
1621 /* ========================================================================= */
1623 /* Decimal digit value. */
1624 /* See Unicode 3.0 book, section 4.6. */
/* Return the decimal digit value of the code point CH.
   When CH has an entry in unicode_attributes[] (name non-NULL) and its
   decdigit field is a non-empty string, that string is converted with atoi.
   NOTE(review): the result for all other code points is not visible here;
   the callers accept values from -1 to 9, so presumably it is -1.  */
1627 get_decdigit_value (unsigned int ch)
1629 if (unicode_attributes[ch].name != NULL
1630 && unicode_attributes[ch].decdigit[0] != '\0')
1631 return atoi (unicode_attributes[ch].decdigit);
1635 /* Construction of sparse 3-level tables. */
/* Parameters for the table code behind struct decdigit_table and the
   functions decdigit_table_init, decdigit_table_add and
   decdigit_table_finalize, which are used by both output_decimal_digit and
   output_digit.  TABLE is the name prefix and ELEMENT the type of one stored
   value; xmalloc and xrealloc are mapped to the plain C library allocators.
   NOTE(review): the DEFAULT value and the code that consumes these macros
   are not visible here.  */
1636 #define TABLE decdigit_table
1637 #define ELEMENT uint8_t
1639 #define xmalloc malloc
1640 #define xrealloc realloc
1643 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME three banner comments followed by a comma-separated
   list of entries of the form { 0xXXXX, value }, one per output line, built
   from get_decdigit_value over all code points from 0 to 0x10FFFF.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.
   NOTE(review): the condition that decides which code points get an entry,
   and the action taken for an out-of-range value, are not visible here.  */
1645 output_decimal_digit_test (const char *filename, const char *version)
1651 stream = fopen (filename, "w");
1654 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1658 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1659 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1660 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1664 for (ch = 0; ch < 0x110000; ch++)
1666 int value = get_decdigit_value (ch);
/* Sanity check: only values from -1 to 9 are expected.  */
1668 if (!(value >= -1 && value < 10))
/* The separator is printed before an entry rather than after it, so the
   list does not end with a trailing comma.  */
1674 fprintf (stream, ",\n");
1675 fprintf (stream, " { 0x%04X, %d }", ch, value);
1680 fprintf (stream, "\n");
1682 if (ferror (stream) || fclose (stream))
1684 fprintf (stderr, "error writing to '%s'\n", filename);
1689 /* Output the per-character decimal digit value table. */
/* Writes a generated C fragment to FILENAME: three banner comments, five
   decdigit_header_N defines taken from the first five uint32_t words of
   t.result, and a static const struct named u_decdigit that contains the
   arrays level1, level2 and level3.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.  */
1691 output_decimal_digit (const char *filename, const char *version)
1695 struct decdigit_table t;
1696 unsigned int level1_offset, level2_offset, level3_offset;
1698 stream = fopen (filename, "w");
1701 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1705 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1706 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1707 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1712 decdigit_table_init (&t);
1714 for (ch = 0; ch < 0x110000; ch++)
/* The stored value is the digit value plus one, so the checked range
   becomes 0 to 10.  */
1716 int value = 1 + get_decdigit_value (ch);
1718 if (!(value >= 0 && value <= 10))
1721 decdigit_table_add (&t, ch, value);
1724 decdigit_table_finalize (&t);
1726 /* Offsets in t.result, in memory of this process. */
/* The three sums below describe the layout assumed for t.result: 5 uint32_t
   header words, then t.level1_size uint32_t entries, then
   (t.level2_size << t.q) uint32_t entries, then the level3 data.
   NOTE(review): the assignment targets are not visible here; presumably
   level1_offset, level2_offset and level3_offset in this order.  */
1728 5 * sizeof (uint32_t);
1730 5 * sizeof (uint32_t)
1731 + t.level1_size * sizeof (uint32_t);
1733 5 * sizeof (uint32_t)
1734 + t.level1_size * sizeof (uint32_t)
1735 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor defines.  */
1737 for (i = 0; i < 5; i++)
1738 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1739 ((uint32_t *) t.result)[i]);
1740 fprintf (stream, "static const\n");
1741 fprintf (stream, "struct\n");
1742 fprintf (stream, " {\n");
1743 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1744 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1745 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1747 fprintf (stream, " }\n");
1748 fprintf (stream, "u_decdigit =\n");
1749 fprintf (stream, "{\n");
1750 fprintf (stream, " {");
1751 if (t.level1_size > 8)
1752 fprintf (stream, "\n ");
/* level1: each entry read from t.result is printed either as -1 or as
   (offset - level2_offset) / sizeof (uint32_t), an index in uint32_t units
   counted from level2_offset.  A line break follows every eighth value.
   NOTE(review): the test that selects -1 is not visible here.  */
1753 for (i = 0; i < t.level1_size; i++)
1756 if (i > 0 && (i % 8) == 0)
1757 fprintf (stream, "\n ");
1758 offset = ((uint32_t *) (t.result + level1_offset))[i];
1760 fprintf (stream, " %5d", -1);
1762 fprintf (stream, " %5zu",
1763 (offset - level2_offset) / sizeof (uint32_t));
1764 if (i+1 < t.level1_size)
1765 fprintf (stream, ",");
1767 if (t.level1_size > 8)
1768 fprintf (stream, "\n ");
1769 fprintf (stream, " },\n");
1770 fprintf (stream, " {");
1771 if (t.level2_size << t.q > 8)
1772 fprintf (stream, "\n ");
/* level2: same scheme, with the index counted from level3_offset in units
   of sizeof (uint8_t).  */
1773 for (i = 0; i < t.level2_size << t.q; i++)
1776 if (i > 0 && (i % 8) == 0)
1777 fprintf (stream, "\n ");
1778 offset = ((uint32_t *) (t.result + level2_offset))[i];
1780 fprintf (stream, " %5d", -1);
1782 fprintf (stream, " %5zu",
1783 (offset - level3_offset) / sizeof (uint8_t));
1784 if (i+1 < t.level2_size << t.q)
1785 fprintf (stream, ",");
1787 if (t.level2_size << t.q > 8)
1788 fprintf (stream, "\n ");
1789 fprintf (stream, " },\n");
1790 /* Pack the level3 array. Each entry needs 4 bits only. */
1791 fprintf (stream, " {");
1792 if (t.level3_size << (t.p - 1) > 8)
1793 fprintf (stream, "\n ");
/* Two consecutive entries are printed as one byte: entry 2*i in the low
   four bits and entry 2*i+1 in the high four bits.  Hence the loop runs
   over t.level3_size << (t.p - 1) bytes, half the number of entries.  */
1794 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1796 if (i > 0 && (i % 8) == 0)
1797 fprintf (stream, "\n ");
1798 fprintf (stream, " 0x%02x",
1799 ((uint8_t *) (t.result + level3_offset))[2*i]
1800 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1801 if (i+1 < t.level3_size << (t.p - 1))
1802 fprintf (stream, ",");
1804 if (t.level3_size << (t.p - 1) > 8)
1805 fprintf (stream, "\n ");
1806 fprintf (stream, " }\n");
1807 fprintf (stream, "};\n");
/* A stream error and a failing fclose are reported the same way.  */
1809 if (ferror (stream) || fclose (stream))
1811 fprintf (stderr, "error writing to '%s'\n", filename);
1816 /* ========================================================================= */
1819 /* See Unicode 3.0 book, section 4.6. */
/* Return the digit value of the code point CH.
   When CH has an entry in unicode_attributes[] (name non-NULL) and its digit
   field is a non-empty string, that string is converted with atoi.
   NOTE(review): the result for all other code points is not visible here;
   the callers accept values from -1 to 9, so presumably it is -1.  */
1822 get_digit_value (unsigned int ch)
1824 if (unicode_attributes[ch].name != NULL
1825 && unicode_attributes[ch].digit[0] != '\0')
1826 return atoi (unicode_attributes[ch].digit);
1830 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME three banner comments followed by a comma-separated
   list of entries of the form { 0xXXXX, value }, one per output line, built
   from get_digit_value over all code points from 0 to 0x10FFFF.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.
   NOTE(review): the condition that decides which code points get an entry,
   and the action taken for an out-of-range value, are not visible here.  */
1832 output_digit_test (const char *filename, const char *version)
1838 stream = fopen (filename, "w");
1841 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1845 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1846 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1847 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1851 for (ch = 0; ch < 0x110000; ch++)
1853 int value = get_digit_value (ch);
/* Sanity check: only values from -1 to 9 are expected.  */
1855 if (!(value >= -1 && value < 10))
/* The separator is printed before an entry rather than after it, so the
   list does not end with a trailing comma.  */
1861 fprintf (stream, ",\n");
1862 fprintf (stream, " { 0x%04X, %d }", ch, value);
1867 fprintf (stream, "\n");
1869 if (ferror (stream) || fclose (stream))
1871 fprintf (stderr, "error writing to '%s'\n", filename);
1876 /* Output the per-character digit value table. */
/* Writes a generated C fragment to FILENAME: three banner comments, five
   digit_header_N defines taken from the first five uint32_t words of
   t.result, and a static const struct named u_digit that contains the
   arrays level1, level2 and level3.
   This function reuses struct decdigit_table and the decdigit_table_*
   functions; only the value source (get_digit_value) and the emitted names
   differ from output_decimal_digit.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.  */
1878 output_digit (const char *filename, const char *version)
1882 struct decdigit_table t;
1883 unsigned int level1_offset, level2_offset, level3_offset;
1885 stream = fopen (filename, "w");
1888 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1892 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1893 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1894 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1899 decdigit_table_init (&t);
1901 for (ch = 0; ch < 0x110000; ch++)
/* The stored value is the digit value plus one, so the checked range
   becomes 0 to 10.  */
1903 int value = 1 + get_digit_value (ch);
1905 if (!(value >= 0 && value <= 10))
1908 decdigit_table_add (&t, ch, value);
1911 decdigit_table_finalize (&t);
1913 /* Offsets in t.result, in memory of this process. */
/* The three sums below describe the layout assumed for t.result: 5 uint32_t
   header words, then t.level1_size uint32_t entries, then
   (t.level2_size << t.q) uint32_t entries, then the level3 data.
   NOTE(review): the assignment targets are not visible here; presumably
   level1_offset, level2_offset and level3_offset in this order.  */
1915 5 * sizeof (uint32_t);
1917 5 * sizeof (uint32_t)
1918 + t.level1_size * sizeof (uint32_t);
1920 5 * sizeof (uint32_t)
1921 + t.level1_size * sizeof (uint32_t)
1922 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor defines.  */
1924 for (i = 0; i < 5; i++)
1925 fprintf (stream, "#define digit_header_%d %d\n", i,
1926 ((uint32_t *) t.result)[i]);
1927 fprintf (stream, "static const\n");
1928 fprintf (stream, "struct\n");
1929 fprintf (stream, " {\n");
1930 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1931 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1932 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1934 fprintf (stream, " }\n");
1935 fprintf (stream, "u_digit =\n");
1936 fprintf (stream, "{\n");
1937 fprintf (stream, " {");
1938 if (t.level1_size > 8)
1939 fprintf (stream, "\n ");
/* level1: each entry read from t.result is printed either as -1 or as
   (offset - level2_offset) / sizeof (uint32_t), an index in uint32_t units
   counted from level2_offset.  A line break follows every eighth value.
   NOTE(review): the test that selects -1 is not visible here.  */
1940 for (i = 0; i < t.level1_size; i++)
1943 if (i > 0 && (i % 8) == 0)
1944 fprintf (stream, "\n ");
1945 offset = ((uint32_t *) (t.result + level1_offset))[i];
1947 fprintf (stream, " %5d", -1);
1949 fprintf (stream, " %5zu",
1950 (offset - level2_offset) / sizeof (uint32_t));
1951 if (i+1 < t.level1_size)
1952 fprintf (stream, ",");
1954 if (t.level1_size > 8)
1955 fprintf (stream, "\n ");
1956 fprintf (stream, " },\n");
1957 fprintf (stream, " {");
1958 if (t.level2_size << t.q > 8)
1959 fprintf (stream, "\n ");
/* level2: same scheme, with the index counted from level3_offset in units
   of sizeof (uint8_t).  */
1960 for (i = 0; i < t.level2_size << t.q; i++)
1963 if (i > 0 && (i % 8) == 0)
1964 fprintf (stream, "\n ");
1965 offset = ((uint32_t *) (t.result + level2_offset))[i];
1967 fprintf (stream, " %5d", -1);
1969 fprintf (stream, " %5zu",
1970 (offset - level3_offset) / sizeof (uint8_t));
1971 if (i+1 < t.level2_size << t.q)
1972 fprintf (stream, ",");
1974 if (t.level2_size << t.q > 8)
1975 fprintf (stream, "\n ");
1976 fprintf (stream, " },\n");
1977 /* Pack the level3 array. Each entry needs 4 bits only. */
1978 fprintf (stream, " {");
1979 if (t.level3_size << (t.p - 1) > 8)
1980 fprintf (stream, "\n ");
/* Two consecutive entries are printed as one byte: entry 2*i in the low
   four bits and entry 2*i+1 in the high four bits.  Hence the loop runs
   over t.level3_size << (t.p - 1) bytes, half the number of entries.  */
1981 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1983 if (i > 0 && (i % 8) == 0)
1984 fprintf (stream, "\n ");
1985 fprintf (stream, " 0x%02x",
1986 ((uint8_t *) (t.result + level3_offset))[2*i]
1987 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1988 if (i+1 < t.level3_size << (t.p - 1))
1989 fprintf (stream, ",");
1991 if (t.level3_size << (t.p - 1) > 8)
1992 fprintf (stream, "\n ");
1993 fprintf (stream, " }\n");
1994 fprintf (stream, "};\n");
/* A stream error and a failing fclose are reported the same way.  */
1996 if (ferror (stream) || fclose (stream))
1998 fprintf (stderr, "error writing to '%s'\n", filename);
2003 /* ========================================================================= */
2005 /* Numeric value. */
2006 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   uses denominator 1 for plain integers and sets both members to 0 for a
   code point without a numeric value.  */
2008 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Return the numeric value of the code point CH as a fraction.
   When CH has an entry in unicode_attributes[] (name non-NULL) and its
   numeric field is a non-empty string, the numerator is parsed with atoi
   from the start of the string and the denominator from the text after the
   slash, or set to 1 when the string contains no slash.
   Otherwise both members are set to 0.  */
2010 static uc_fraction_t
2011 get_numeric_value (unsigned int ch)
2013 uc_fraction_t value;
2015 if (unicode_attributes[ch].name != NULL
2016 && unicode_attributes[ch].numeric[0] != '\0')
2018 const char *str = unicode_attributes[ch].numeric;
2019 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first character that is not part of the integer, so
   the numerator is read up to the slash, if there is one.  */
2020 value.numerator = atoi (str);
2021 if (strchr (str, '/') != NULL)
2022 value.denominator = atoi (strchr (str, '/') + 1);
2024 value.denominator = 1;
/* No numeric value: both members 0.  */
2028 value.numerator = 0;
2029 value.denominator = 0;
2034 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME three banner comments followed by a comma-separated
   list of entries of the form { 0xXXXX, numerator, denominator }, one per
   output line, for the code points from 0 to 0x10FFFF whose numeric value
   has a nonzero numerator or a nonzero denominator.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.  */
2036 output_numeric_test (const char *filename, const char *version)
2042 stream = fopen (filename, "w");
2045 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2049 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2050 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2051 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2055 for (ch = 0; ch < 0x110000; ch++)
2057 uc_fraction_t value = get_numeric_value (ch);
/* get_numeric_value sets both members to 0 for a code point without a
   numeric value; such code points are skipped.  */
2059 if (value.numerator != 0 || value.denominator != 0)
/* The separator is printed before an entry rather than after it, so the
   list does not end with a trailing comma.  */
2062 fprintf (stream, ",\n");
2063 fprintf (stream, " { 0x%04X, %d, %d }",
2064 ch, value.numerator, value.denominator);
2069 fprintf (stream, "\n");
2071 if (ferror (stream) || fclose (stream))
2073 fprintf (stderr, "error writing to '%s'\n", filename);
2078 /* Construction of sparse 3-level tables. */
/* Parameters for the table code behind struct numeric_table and the
   functions numeric_table_init, numeric_table_add and
   numeric_table_finalize used by output_numeric.  TABLE is the name prefix
   and ELEMENT the type of one stored value; xmalloc and xrealloc are mapped
   to the plain C library allocators.
   NOTE(review): the DEFAULT value and the code that consumes these macros
   are not visible here.  */
2079 #define TABLE numeric_table
2080 #define ELEMENT uint8_t
2082 #define xmalloc malloc
2083 #define xrealloc realloc
2086 /* Output the per-character numeric value table. */
/* Writes a generated C fragment to FILENAME in two parts:
   1. an array u_numeric_values of all distinct fractions returned by
      get_numeric_value, and
   2. five numeric_header_N defines plus a static const struct named
      u_numeric whose level3 entries are indices into that array.
   At most 128 distinct fractions are supported, which is why an index fits
   in the 7 bits used when level3 is packed.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.  */
2088 output_numeric (const char *filename, const char *version)
2091 uc_fraction_t fractions[128];
2092 unsigned int nfractions;
2093 unsigned int ch, i, j;
2094 struct numeric_table t;
2095 unsigned int level1_offset, level2_offset, level3_offset;
2096 uint16_t *level3_packed;
2098 stream = fopen (filename, "w");
2101 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2105 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2106 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2107 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2110 /* Create table of occurring fractions. */
2112 for (ch = 0; ch < 0x110000; ch++)
2114 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for an existing identical fraction.  The test on
   i == nfractions below handles the case where the search reached the end
   of the array.  */
2116 for (i = 0; i < nfractions; i++)
2117 if (value.numerator == fractions[i].numerator
2118 && value.denominator == fractions[i].denominator)
2120 if (i == nfractions)
/* fractions[] has room for 128 entries only.  */
2122 if (nfractions == 128)
/* Find the insertion position that keeps fractions[] ordered by
   denominator first and numerator second, then shift the tail up by one
   element and store the new fraction there.  */
2124 for (i = 0; i < nfractions; i++)
2125 if (value.denominator < fractions[i].denominator
2126 || (value.denominator == fractions[i].denominator
2127 && value.numerator < fractions[i].numerator))
2129 for (j = nfractions; j > i; j--)
2130 fractions[j] = fractions[j - 1];
2131 fractions[i] = value;
/* First output part: the array of distinct fractions.  */
2136 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2138 fprintf (stream, "{\n");
2139 for (i = 0; i < nfractions; i++)
2141 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2142 fractions[i].denominator);
2143 if (i+1 < nfractions)
2144 fprintf (stream, ",");
2145 fprintf (stream, "\n");
2147 fprintf (stream, "};\n");
2151 numeric_table_init (&t);
/* Second pass over all code points: look the fraction up again and store
   its index i in the table.  */
2153 for (ch = 0; ch < 0x110000; ch++)
2155 uc_fraction_t value = get_numeric_value (ch);
2157 for (i = 0; i < nfractions; i++)
2158 if (value.numerator == fractions[i].numerator
2159 && value.denominator == fractions[i].denominator)
2161 if (i == nfractions)
2164 numeric_table_add (&t, ch, i);
2167 numeric_table_finalize (&t);
2169 /* Offsets in t.result, in memory of this process. */
/* The three sums below describe the layout assumed for t.result: 5 uint32_t
   header words, then t.level1_size uint32_t entries, then
   (t.level2_size << t.q) uint32_t entries, then the level3 data.
   NOTE(review): the assignment targets are not visible here; presumably
   level1_offset, level2_offset and level3_offset in this order.  */
2171 5 * sizeof (uint32_t);
2173 5 * sizeof (uint32_t)
2174 + t.level1_size * sizeof (uint32_t);
2176 5 * sizeof (uint32_t)
2177 + t.level1_size * sizeof (uint32_t)
2178 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor defines.  */
2180 for (i = 0; i < 5; i++)
2181 fprintf (stream, "#define numeric_header_%d %d\n", i,
2182 ((uint32_t *) t.result)[i]);
2183 fprintf (stream, "static const\n");
2184 fprintf (stream, "struct\n");
2185 fprintf (stream, " {\n");
2186 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2187 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: 7 bits per entry, plus one extra
   unit, matching the packed array built further below.  */
2188 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2189 (1 << t.p) * 7 / 16);
2190 fprintf (stream, " }\n");
2191 fprintf (stream, "u_numeric =\n");
2192 fprintf (stream, "{\n");
2193 fprintf (stream, " {");
2194 if (t.level1_size > 8)
2195 fprintf (stream, "\n ");
/* level1: each entry read from t.result is printed either as -1 or as
   (offset - level2_offset) / sizeof (uint32_t), an index in uint32_t units
   counted from level2_offset.  A line break follows every eighth value.
   NOTE(review): the test that selects -1 is not visible here.  */
2196 for (i = 0; i < t.level1_size; i++)
2199 if (i > 0 && (i % 8) == 0)
2200 fprintf (stream, "\n ");
2201 offset = ((uint32_t *) (t.result + level1_offset))[i];
2203 fprintf (stream, " %5d", -1);
2205 fprintf (stream, " %5zu",
2206 (offset - level2_offset) / sizeof (uint32_t));
2207 if (i+1 < t.level1_size)
2208 fprintf (stream, ",");
2210 if (t.level1_size > 8)
2211 fprintf (stream, "\n ");
2212 fprintf (stream, " },\n");
2213 fprintf (stream, " {");
2214 if (t.level2_size << t.q > 8)
2215 fprintf (stream, "\n ");
/* level2: same scheme, with the index counted from level3_offset in units
   of sizeof (uint8_t).  */
2216 for (i = 0; i < t.level2_size << t.q; i++)
2219 if (i > 0 && (i % 8) == 0)
2220 fprintf (stream, "\n ");
2221 offset = ((uint32_t *) (t.result + level2_offset))[i];
2223 fprintf (stream, " %5d", -1);
2225 fprintf (stream, " %5zu",
2226 (offset - level3_offset) / sizeof (uint8_t));
2227 if (i+1 < t.level2_size << t.q)
2228 fprintf (stream, ",");
2230 if (t.level2_size << t.q > 8)
2231 fprintf (stream, "\n ");
2232 fprintf (stream, " },\n");
2233 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2234 not 32-bit units, in order to make the lookup function easier. */
/* The allocation is zero-filled and has one unit more than
   (number of entries) * 7 / 16, so that level3_packed[j+1] can be accessed
   for the last entry as well.
   NOTE(review): no check of the calloc result is visible here.  */
2237 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Entry i starts at bit 7*i of the packed bit stream: j is the 16-bit unit
   containing that bit and k the bit position inside the unit.  Units j and
   j+1 are combined into one 32-bit value, the entry is OR-ed in at bit k,
   and both units are written back; this also covers an entry that straddles
   two units.  */
2238 for (i = 0; i < t.level3_size << t.p; i++)
2240 unsigned int j = (i * 7) / 16;
2241 unsigned int k = (i * 7) % 16;
2242 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2243 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2244 level3_packed[j] = value & 0xffff;
2245 level3_packed[j+1] = value >> 16;
2247 fprintf (stream, " {");
2248 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2249 fprintf (stream, "\n ");
/* Print all packed units, including the extra one, as 4-digit hex.  */
2250 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2252 if (i > 0 && (i % 8) == 0)
2253 fprintf (stream, "\n ");
2254 fprintf (stream, " 0x%04x", level3_packed[i]);
2255 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2256 fprintf (stream, ",");
2258 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2259 fprintf (stream, "\n ");
2260 fprintf (stream, " }\n");
2261 free (level3_packed);
2262 fprintf (stream, "};\n");
/* A stream error and a failing fclose are reported the same way.  */
2264 if (ferror (stream) || fclose (stream))
2266 fprintf (stderr, "error writing to '%s'\n", filename);
2271 /* ========================================================================= */
2274 /* See Unicode 3.0 book, section 4.7. */
2277 /* List of mirrored character pairs. This is a subset of the characters
2278 having the BidiMirrored property.
Each row holds two code points; get_mirror_value compares its argument
against both columns of every row and takes the other column as the mirror
character.  */
2279 static unsigned int mirror_pairs[][2] =
/* Compute the mirror table value for the code point CH.
   mirrored is true when CH has an entry in unicode_attributes[] (name
   non-NULL) whose mirrored flag is set.  mirror_pairs is searched for CH in
   either column; mirror_char starts out as 0xfffd, which serves as the
   marker for no partner found, and is replaced by the partner on a match.
   One visible return statement yields the signed difference
   mirror_char - ch.
   NOTE(review): the conditions guarding the return statements, and the
   other returned values, are not visible here.  */
2336 get_mirror_value (unsigned int ch)
2339 unsigned int mirror_char;
2342 mirrored = (unicode_attributes[ch].name != NULL
2343 && unicode_attributes[ch].mirrored);
2344 mirror_char = 0xfffd;
/* Search both columns of every pair.  */
2345 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2346 if (ch == mirror_pairs[i][0])
2348 mirror_char = mirror_pairs[i][1];
2351 else if (ch == mirror_pairs[i][1])
2353 mirror_char = mirror_pairs[i][0];
/* The value is relative to CH: adding it to CH gives the mirror
   character.  */
2357 return (int) mirror_char - (int) ch;
/* True when the search above found a partner for CH.  */
2360 if (mirror_char != 0xfffd)
2366 /* Construction of sparse 3-level tables. */
/* Parameters for the table code behind struct mirror_table and the
   functions mirror_table_init, mirror_table_add and mirror_table_finalize
   used by output_mirror.  TABLE is the name prefix; ELEMENT is int32_t here
   because the stored values are signed differences between code points.
   xmalloc and xrealloc are mapped to the plain C library allocators.
   NOTE(review): the DEFAULT value and the code that consumes these macros
   are not visible here.  */
2367 #define TABLE mirror_table
2368 #define ELEMENT int32_t
2370 #define xmalloc malloc
2371 #define xrealloc realloc
2374 /* Output the per-character mirror table. */
/* Writes a generated C fragment to FILENAME: three banner comments, five
   mirror_header_N defines taken from the first five uint32_t words of
   t.result, and a static const struct named u_mirror that contains the
   arrays level1, level2 and level3.  Unlike the packed tables in this file,
   level3 is emitted unpacked, one int per entry.
   A message is printed to stderr when the file cannot be opened or when
   writing or closing it fails.  */
2376 output_mirror (const char *filename, const char *version)
2380 struct mirror_table t;
2381 unsigned int level1_offset, level2_offset, level3_offset;
2383 stream = fopen (filename, "w");
2386 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2390 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2391 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2392 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2397 mirror_table_init (&t);
/* Enter the mirror value of every code point from 0 to 0x10FFFF.  */
2399 for (ch = 0; ch < 0x110000; ch++)
2401 int value = get_mirror_value (ch);
2403 mirror_table_add (&t, ch, value);
2406 mirror_table_finalize (&t);
2408 /* Offsets in t.result, in memory of this process. */
/* The three sums below describe the layout assumed for t.result: 5 uint32_t
   header words, then t.level1_size uint32_t entries, then
   (t.level2_size << t.q) uint32_t entries, then the level3 data.
   NOTE(review): the assignment targets are not visible here; presumably
   level1_offset, level2_offset and level3_offset in this order.  */
2410 5 * sizeof (uint32_t);
2412 5 * sizeof (uint32_t)
2413 + t.level1_size * sizeof (uint32_t);
2415 5 * sizeof (uint32_t)
2416 + t.level1_size * sizeof (uint32_t)
2417 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor defines.  */
2419 for (i = 0; i < 5; i++)
2420 fprintf (stream, "#define mirror_header_%d %d\n", i,
2421 ((uint32_t *) t.result)[i]);
2422 fprintf (stream, "static const\n");
2423 fprintf (stream, "struct\n");
2424 fprintf (stream, " {\n");
2425 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2426 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2427 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2428 fprintf (stream, " }\n");
2429 fprintf (stream, "u_mirror =\n");
2430 fprintf (stream, "{\n");
2431 fprintf (stream, " {");
2432 if (t.level1_size > 8)
2433 fprintf (stream, "\n ");
/* level1: each entry read from t.result is printed either as -1 or as
   (offset - level2_offset) / sizeof (uint32_t), an index in uint32_t units
   counted from level2_offset.  A line break follows every eighth value.
   NOTE(review): the test that selects -1 is not visible here.  */
2434 for (i = 0; i < t.level1_size; i++)
2437 if (i > 0 && (i % 8) == 0)
2438 fprintf (stream, "\n ");
2439 offset = ((uint32_t *) (t.result + level1_offset))[i];
2441 fprintf (stream, " %5d", -1);
2443 fprintf (stream, " %5zu",
2444 (offset - level2_offset) / sizeof (uint32_t));
2445 if (i+1 < t.level1_size)
2446 fprintf (stream, ",");
2448 if (t.level1_size > 8)
2449 fprintf (stream, "\n ");
2450 fprintf (stream, " },\n");
2451 fprintf (stream, " {");
2452 if (t.level2_size << t.q > 8)
2453 fprintf (stream, "\n ");
/* level2: same scheme, with the index counted from level3_offset in units
   of sizeof (int32_t), the ELEMENT size of this table.  */
2454 for (i = 0; i < t.level2_size << t.q; i++)
2457 if (i > 0 && (i % 8) == 0)
2458 fprintf (stream, "\n ");
2459 offset = ((uint32_t *) (t.result + level2_offset))[i];
2461 fprintf (stream, " %5d", -1);
2463 fprintf (stream, " %5zu",
2464 (offset - level3_offset) / sizeof (int32_t));
2465 if (i+1 < t.level2_size << t.q)
2466 fprintf (stream, ",");
2468 if (t.level2_size << t.q > 8)
2469 fprintf (stream, "\n ");
2470 fprintf (stream, " },\n");
2471 fprintf (stream, " {");
2472 if (t.level3_size << t.p > 8)
2473 fprintf (stream, "\n ");
/* level3: the signed values are printed as they are, eight per line.  */
2474 for (i = 0; i < t.level3_size << t.p; i++)
2476 if (i > 0 && (i % 8) == 0)
2477 fprintf (stream, "\n ");
2478 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2479 if (i+1 < t.level3_size << t.p)
2480 fprintf (stream, ",");
2482 if (t.level3_size << t.p > 8)
2483 fprintf (stream, "\n ");
2484 fprintf (stream, " }\n");
2485 fprintf (stream, "};\n");
/* A stream error and a failing fclose are reported the same way.  */
2487 if (ferror (stream) || fclose (stream))
2489 fprintf (stderr, "error writing to '%s'\n", filename);
2494 /* ========================================================================= */
2496 /* Particular values of the word break property. */
/* Return true if CH is one of the eight code points listed below:
   U+0027, U+002E, U+2018, U+2019, U+2024, U+FE52, U+FF07, U+FF0E.
   NOTE(review): going by the function name, these are the characters with
   word break property value MidNumLet -- confirm against
   WordBreakProperty.txt.  */
2499 is_WBP_MIDNUMLET (unsigned int ch)
2501 return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
2502 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
/* Return true if CH is one of the eight code points listed below:
   U+00B7, U+05F4, U+2027, U+003A, U+0387, U+FE13, U+FE55, U+FF1A.
   NOTE(review): going by the function name, these are the characters with
   word break property value MidLetter -- confirm against
   WordBreakProperty.txt.  */
2506 is_WBP_MIDLETTER (unsigned int ch)
2508 return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
2509 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
2512 /* ========================================================================= */
2516 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties maps a property name read from the
   input file to one of these values and uses it as a bit position
   (1ULL << value) in the elements of unicode_properties[].
   NOTE(review): only part of the enumerator list is visible here.  */
2525 PROP_QUOTATION_MARK,
2526 PROP_TERMINAL_PUNCTUATION,
2529 PROP_ASCII_HEX_DIGIT,
2530 PROP_OTHER_ALPHABETIC,
2534 PROP_OTHER_LOWERCASE,
2535 PROP_OTHER_UPPERCASE,
2536 PROP_NONCHARACTER_CODE_POINT,
2537 PROP_OTHER_GRAPHEME_EXTEND,
2538 PROP_IDS_BINARY_OPERATOR,
2539 PROP_IDS_TRINARY_OPERATOR,
2541 PROP_UNIFIED_IDEOGRAPH,
2542 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2545 PROP_LOGICAL_ORDER_EXCEPTION,
2546 PROP_OTHER_ID_START,
2547 PROP_OTHER_ID_CONTINUE,
2549 PROP_VARIATION_SELECTOR,
2550 PROP_PATTERN_WHITE_SPACE,
2551 PROP_PATTERN_SYNTAX,
2552 /* DerivedCoreProperties.txt */
2558 PROP_CASE_IGNORABLE,
2559 PROP_CHANGES_WHEN_LOWERCASED,
2560 PROP_CHANGES_WHEN_UPPERCASED,
2561 PROP_CHANGES_WHEN_TITLECASED,
2562 PROP_CHANGES_WHEN_CASEFOLDED,
2563 PROP_CHANGES_WHEN_CASEMAPPED,
2568 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2569 PROP_GRAPHEME_EXTEND,
/* One bit set per code point, indexed by code point (0 to 0x10FFFF).
   fill_properties sets bit number PROP_xxx of element i when the input file
   assigns that property to code point i; clear_properties resets all
   elements to 0.  */
2573 unsigned long long unicode_properties[0x110000];
/* Reset every element of unicode_properties[] to 0, i.e. no property set
   for any code point.  */
2576 clear_properties (void)
2580 for (i = 0; i < 0x110000; i++)
2581 unicode_properties[i] = 0;
2584 /* Stores in unicode_properties[] the properties from the
2585 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is opened for reading and processed line by line.
   Empty lines and lines starting with a hash sign are recognized by the
   test on buf[0].  Every other line must have the form
     XXXX..YYYY ; Property_Name     or     XXXX ; Property_Name
   with hexadecimal code points.  The property name is mapped to a PROP_xxx
   value, and the corresponding bit is set in unicode_properties[] for every
   code point of the range.  The bits are OR-ed in, so existing bits are
   kept.
   Messages are printed to stderr for a file that cannot be opened, a line
   that cannot be parsed, an unknown property name and a read error.  */
2587 fill_properties (const char *proplist_filename)
2592 stream = fopen (proplist_filename, "r");
2595 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2602 unsigned int i1, i2;
2603 char padding[200+1];
2604 char propname[200+1];
2605 unsigned int propvalue;
/* Read one line of at most 200 characters; the trailing newline directive
   consumes the line end and any following white space.  */
2607 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2610 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the scanset conversions carry no field width.  padding and
   propname hold 200 characters plus the terminator, and the fscanf above
   reads at most 200 characters into buf, so they should not overflow as
   long as buf itself is large enough -- its declaration is not visible
   here.  */
2613 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2615 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2617 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line below expands to an if statement followed by else, so the
   lines together form one chain of string comparisons against propname.  */
2622 #define PROP(name,value) \
2623 if (strcmp (propname, name) == 0) propvalue = value; else
2625 PROP ("White_Space", PROP_WHITE_SPACE)
2626 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2627 PROP ("Join_Control", PROP_JOIN_CONTROL)
2628 PROP ("Dash", PROP_DASH)
2629 PROP ("Hyphen", PROP_HYPHEN)
2630 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2631 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2632 PROP ("Other_Math", PROP_OTHER_MATH)
2633 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2634 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2635 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2636 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2637 PROP ("Diacritic", PROP_DIACRITIC)
2638 PROP ("Extender", PROP_EXTENDER)
2639 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2640 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2641 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2642 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2643 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2644 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2645 PROP ("Radical", PROP_RADICAL)
2646 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2647 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2648 PROP ("Deprecated", PROP_DEPRECATED)
2649 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2650 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2651 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2652 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2653 PROP ("STerm", PROP_STERM)
2654 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2655 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2656 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2657 /* DerivedCoreProperties.txt */
2658 PROP ("Math", PROP_MATH)
2659 PROP ("Alphabetic", PROP_ALPHABETIC)
2660 PROP ("Lowercase", PROP_LOWERCASE)
2661 PROP ("Uppercase", PROP_UPPERCASE)
2662 PROP ("Cased", PROP_CASED)
2663 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2664 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2665 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2666 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2667 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2668 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2669 PROP ("ID_Start", PROP_ID_START)
2670 PROP ("ID_Continue", PROP_ID_CONTINUE)
2671 PROP ("XID_Start", PROP_XID_START)
2672 PROP ("XID_Continue", PROP_XID_CONTINUE)
2673 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2674 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2675 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2676 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the chain: no known property name matched.  */
2679 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must lie inside the array.
   NOTE(review): for the single code point form, i2 is presumably set equal
   to i1 on a line that is not visible here.  */
2683 if (!(i1 <= i2 && i2 < 0x110000))
/* Set the property bit for every code point in the inclusive range.  */
2686 for (i = i1; i <= i2; i++)
2687 unicode_properties[i] |= 1ULL << propvalue;
2690 if (ferror (stream) || fclose (stream))
2692 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.  */
/* ARRAY has one slot per code point (0x110000 entries), PROPLIST_FILENAME is
   the file opened for reading, and PROPERTY_NAME is the text used to locate
   the wanted section.
   Visible steps: every index below 0x110000 is visited first; the file is
   read in lines of at most 100 characters until one contains PROPERTY_NAME;
   each following line is parsed either as a "XXXX..XXXX" range or as a
   single 4-digit hex code point; the resulting range is then iterated.
   NOTE(review): the statements that write ARRAY, the loop exits and the
   actions after each error message are not visible in this excerpt --
   confirm against the complete file.  */
2700 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2706 for (i = 0; i < 0x110000; i++)
2709 stream = fopen (proplist_filename, "r");
2712 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2716 /* Search for the "Property dump for: ..." line. */
2719 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2721 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2725 while (strstr (buf, property_name) == NULL);
2729 unsigned int i1, i2;
2731 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A range line has ".." at offsets 4 and 5, right after a 4-digit start.  */
2735 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2737 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2739 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least 4 characters is read as one code point.  */
2744 else if (strlen (buf) >= 4)
2746 if (sscanf (buf, "%4X", &i1) < 1)
2748 fprintf (stderr, "parse error in property in '%s'\n",
2756 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and stay below 0x110000; the action taken when
   this test fails is not visible here.  */
2760 if (!(i1 <= i2 && i2 < 0x110000))
2762 for (i = i1; i <= i2; i++)
2766 if (ferror (stream) || fclose (stream))
2768 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2773 /* Properties from Unicode 3.0 PropList.txt file. */
/* Both arrays have 0x110000 entries and are indexed by code point.  They are
   passed to fill_property30 by fill_properties30 with the section names
   "(Paired Punctuation)" and "(Left of Pair)", and are read back by
   is_property_paired_punctuation and is_property_left_of_pair.  */
2775 /* The paired punctuation property from the PropList.txt file. */
2776 char unicode_pairedpunctuation[0x110000];
2778 /* The left of pair property from the PropList.txt file. */
2779 char unicode_leftofpair[0x110000];
2782 fill_properties30 (const char *proplist30_filename)
2784 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2785 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2788 /* ------------------------------------------------------------------------- */
2790 /* See PropList.txt, UCD.html. */
2792 is_property_white_space (unsigned int ch)
2794 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2797 /* See Unicode 3.0 book, section 4.10,
2798 PropList.txt, UCD.html,
2799 DerivedCoreProperties.txt, UCD.html. */
/* Predicate for the Alphabetic property.  Two values are involved: one is
   assembled from the Other_Alphabetic bit plus the hard-coded ranges below
   (the leading terms of that expression are not visible in this excerpt),
   the other is the Alphabetic bit (PROP_ALPHABETIC) of unicode_properties.
   The two are compared at the end.
   NOTE(review): what happens on a mismatch, and which value is returned, is
   not visible here -- confirm against the complete file.  */
2801 is_property_alphabetic (unsigned int ch)
2805 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2806 /* For some reason, the following are listed as having property
2807 Alphabetic but not as having property Other_Alphabetic. */
2808 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2809 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2810 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2811 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2812 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2813 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2814 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2815 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2816 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2817 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2818 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2819 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2820 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2822 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2824 if (result1 != result2)
2829 /* See PropList.txt, UCD.html. */
2831 is_property_other_alphabetic (unsigned int ch)
2833 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2836 /* See PropList.txt, UCD.html. */
2838 is_property_not_a_character (unsigned int ch)
2840 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2843 /* See PropList.txt, UCD.html,
2844 DerivedCoreProperties.txt, UCD.html. */
/* Predicate for Default_Ignorable_Code_Point.  One value is built from
   category Cf minus the listed exceptions, or the
   Other_Default_Ignorable_Code_Point bit, or the Variation_Selector bit;
   the other is the Default_Ignorable_Code_Point bit of unicode_properties.
   The two are compared at the end.
   NOTE(review): the declarations, the action on mismatch and the returned
   value are not visible in this excerpt -- confirm.  */
2846 is_property_default_ignorable_code_point (unsigned int ch)
2849 (is_category_Cf (ch)
2850 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2851 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
2852 /* For some reason, the following are not listed as having property
2853 Default_Ignorable_Code_Point. */
2854 && !(ch == 0x110BD))
2855 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2856 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2858 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2860 if (result1 != result2)
2865 /* See PropList.txt, UCD.html. */
2867 is_property_other_default_ignorable_code_point (unsigned int ch)
2869 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2872 /* See PropList.txt, UCD.html. */
2874 is_property_deprecated (unsigned int ch)
2876 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2879 /* See PropList.txt, UCD.html. */
2881 is_property_logical_order_exception (unsigned int ch)
2883 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2886 /* See PropList.txt, UCD.html. */
2888 is_property_variation_selector (unsigned int ch)
2890 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Return true if CH lies in one of the three private use ranges
   U+E000..U+F8FF, U+F0000..U+FFFFD, U+100000..U+10FFFD.
   The ranges were determined through
   "grep 'Private Use,' UnicodeData-3.1.0.txt".
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Return true if CH is in category Cn and is not a noncharacter.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2910 /* See PropList.txt, UCD.html,
2911 DerivedCoreProperties.txt, UCD.html. */
/* Predicate for the Uppercase property.  One value ends with the
   Other_Uppercase bit (its leading terms are not visible in this excerpt);
   the other is the Uppercase bit (PROP_UPPERCASE) of unicode_properties.
   The two are compared at the end.
   NOTE(review): the action on mismatch and the returned value are not
   visible here -- confirm.  */
2913 is_property_uppercase (unsigned int ch)
2917 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2919 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2921 if (result1 != result2)
2926 /* See PropList.txt, UCD.html. */
2928 is_property_other_uppercase (unsigned int ch)
2930 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2933 /* See PropList.txt, UCD.html,
2934 DerivedCoreProperties.txt, UCD.html. */
/* Predicate for the Lowercase property.  One value ends with the
   Other_Lowercase bit (its leading terms are not visible in this excerpt);
   the other is the Lowercase bit (PROP_LOWERCASE) of unicode_properties.
   The two are compared at the end.
   NOTE(review): the action on mismatch and the returned value are not
   visible here -- confirm.  */
2936 is_property_lowercase (unsigned int ch)
2940 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2942 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2944 if (result1 != result2)
2949 /* See PropList.txt, UCD.html. */
2951 is_property_other_lowercase (unsigned int ch)
2953 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Return true if CH is in general category Lt.  See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  if (is_category_Lt (ch))
    return true;
  return false;
}
2963 /* See DerivedCoreProperties.txt. */
/* Predicate for the Cased property.  result1 is derived: lowercase, or
   uppercase, or general category Lt.  result2 is the Cased bit (PROP_CASED)
   of unicode_properties.  The two are compared at the end.
   NOTE(review): the action on mismatch and the returned value are not
   visible in this excerpt -- confirm.  */
2965 is_property_cased (unsigned int ch)
2967 bool result1 = (is_property_lowercase (ch)
2968 || is_property_uppercase (ch)
2969 || is_category_Lt (ch));
2970 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
2972 if (result1 != result2)
2977 /* See DerivedCoreProperties.txt. */
/* Predicate for the Case_Ignorable property.  result1 is derived: word
   break property MidLetter or MidNumLet, or general category Mn, Me, Cf, Lm
   or Sk.  result2 is the Case_Ignorable bit (PROP_CASE_IGNORABLE) of
   unicode_properties.  The two are compared at the end.
   NOTE(review): the action on mismatch and the returned value are not
   visible in this excerpt -- confirm.  */
2979 is_property_case_ignorable (unsigned int ch)
2981 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
2982 || is_category_Mn (ch)
2983 || is_category_Me (ch)
2984 || is_category_Cf (ch)
2985 || is_category_Lm (ch)
2986 || is_category_Sk (ch));
2987 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
2989 if (result1 != result2)
2994 /* See DerivedCoreProperties.txt. */
/* Predicate for Changes_When_Lowercased.  Here result1 is the table bit
   (PROP_CHANGES_WHEN_LOWERCASED) and result2 is derived from
   unicode_attributes: the character has a name and a lowercase mapping that
   is neither NONE nor the character itself.  The two are compared at the
   end.
   NOTE(review): the action on mismatch and the returned value are not
   visible in this excerpt -- confirm.  */
2996 is_property_changes_when_lowercased (unsigned int ch)
2998 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
2999 bool result2 = (unicode_attributes[ch].name != NULL
3000 && unicode_attributes[ch].lower != NONE
3001 && unicode_attributes[ch].lower != ch);
3003 if (result1 != result2)
3008 /* See DerivedCoreProperties.txt. */
3010 is_property_changes_when_uppercased (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3015 /* See DerivedCoreProperties.txt. */
3017 is_property_changes_when_titlecased (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3022 /* See DerivedCoreProperties.txt. */
3024 is_property_changes_when_casefolded (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3029 /* See DerivedCoreProperties.txt. */
3031 is_property_changes_when_casemapped (unsigned int ch)
3033 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3036 /* See PropList.txt, UCD.html. */
3038 is_property_soft_dotted (unsigned int ch)
3040 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3043 /* See DerivedCoreProperties.txt, UCD.html. */
3045 is_property_id_start (unsigned int ch)
3047 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3050 /* See PropList.txt, UCD.html. */
3052 is_property_other_id_start (unsigned int ch)
3054 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3057 /* See DerivedCoreProperties.txt, UCD.html. */
3059 is_property_id_continue (unsigned int ch)
3061 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3064 /* See PropList.txt, UCD.html. */
3066 is_property_other_id_continue (unsigned int ch)
3068 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3071 /* See DerivedCoreProperties.txt, UCD.html. */
3073 is_property_xid_start (unsigned int ch)
3075 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3078 /* See DerivedCoreProperties.txt, UCD.html. */
3080 is_property_xid_continue (unsigned int ch)
3082 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3085 /* See PropList.txt, UCD.html. */
3087 is_property_pattern_white_space (unsigned int ch)
3089 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3092 /* See PropList.txt, UCD.html. */
3094 is_property_pattern_syntax (unsigned int ch)
3096 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3099 /* See PropList.txt, UCD.html. */
3101 is_property_join_control (unsigned int ch)
3103 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3106 /* See DerivedCoreProperties.txt, UCD.html. */
3108 is_property_grapheme_base (unsigned int ch)
3110 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3113 /* See DerivedCoreProperties.txt, UCD.html. */
3115 is_property_grapheme_extend (unsigned int ch)
3117 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3120 /* See PropList.txt, UCD.html. */
3122 is_property_other_grapheme_extend (unsigned int ch)
3124 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3127 /* See DerivedCoreProperties.txt, UCD.html. */
3129 is_property_grapheme_link (unsigned int ch)
3131 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3134 /* See PropList.txt, UCD.html. */
3136 is_property_bidi_control (unsigned int ch)
3138 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3141 /* See PropList-3.0.1.txt. */
3143 is_property_bidi_left_to_right (unsigned int ch)
3145 return (get_bidi_category (ch) == UC_BIDI_L);
3148 /* See PropList-3.0.1.txt. */
3150 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3152 return (get_bidi_category (ch) == UC_BIDI_R);
3155 /* See PropList-3.0.1.txt. */
3157 is_property_bidi_arabic_right_to_left (unsigned int ch)
3159 return (get_bidi_category (ch) == UC_BIDI_AL);
3162 /* See PropList-3.0.1.txt. */
3164 is_property_bidi_european_digit (unsigned int ch)
3166 return (get_bidi_category (ch) == UC_BIDI_EN);
3169 /* See PropList-3.0.1.txt. */
3171 is_property_bidi_eur_num_separator (unsigned int ch)
3173 return (get_bidi_category (ch) == UC_BIDI_ES);
3176 /* See PropList-3.0.1.txt. */
3178 is_property_bidi_eur_num_terminator (unsigned int ch)
3180 return (get_bidi_category (ch) == UC_BIDI_ET);
3183 /* See PropList-3.0.1.txt. */
3185 is_property_bidi_arabic_digit (unsigned int ch)
3187 return (get_bidi_category (ch) == UC_BIDI_AN);
3190 /* See PropList-3.0.1.txt. */
3192 is_property_bidi_common_separator (unsigned int ch)
3194 return (get_bidi_category (ch) == UC_BIDI_CS);
3197 /* See PropList-3.0.1.txt. */
3199 is_property_bidi_block_separator (unsigned int ch)
3201 return (get_bidi_category (ch) == UC_BIDI_B);
3204 /* See PropList-3.0.1.txt. */
3206 is_property_bidi_segment_separator (unsigned int ch)
3208 return (get_bidi_category (ch) == UC_BIDI_S);
3211 /* See PropList-3.0.1.txt. */
3213 is_property_bidi_whitespace (unsigned int ch)
3215 return (get_bidi_category (ch) == UC_BIDI_WS);
3218 /* See PropList-3.0.1.txt. */
3220 is_property_bidi_non_spacing_mark (unsigned int ch)
3222 return (get_bidi_category (ch) == UC_BIDI_NSM);
3225 /* See PropList-3.0.1.txt. */
3227 is_property_bidi_boundary_neutral (unsigned int ch)
3229 return (get_bidi_category (ch) == UC_BIDI_BN);
3232 /* See PropList-3.0.1.txt. */
3234 is_property_bidi_pdf (unsigned int ch)
3236 return (get_bidi_category (ch) == UC_BIDI_PDF);
3239 /* See PropList-3.0.1.txt. */
3241 is_property_bidi_embedding_or_override (unsigned int ch)
3243 int category = get_bidi_category (ch);
3244 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3245 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3248 /* See PropList-3.0.1.txt. */
3250 is_property_bidi_other_neutral (unsigned int ch)
3252 return (get_bidi_category (ch) == UC_BIDI_ON);
3255 /* See PropList.txt, UCD.html. */
3257 is_property_hex_digit (unsigned int ch)
3259 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3262 /* See PropList.txt, UCD.html. */
3264 is_property_ascii_hex_digit (unsigned int ch)
3266 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3269 /* See Unicode 3.0 book, section 4.10,
3270 PropList.txt, UCD.html. */
3272 is_property_ideographic (unsigned int ch)
3274 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3277 /* See PropList.txt, UCD.html. */
3279 is_property_unified_ideograph (unsigned int ch)
3281 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3284 /* See PropList.txt, UCD.html. */
3286 is_property_radical (unsigned int ch)
3288 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3291 /* See PropList.txt, UCD.html. */
3293 is_property_ids_binary_operator (unsigned int ch)
3295 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3298 /* See PropList.txt, UCD.html. */
3300 is_property_ids_trinary_operator (unsigned int ch)
3302 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3305 /* See PropList-3.0.1.txt. */
3307 is_property_zero_width (unsigned int ch)
3309 return is_category_Cf (ch)
3310 || (unicode_attributes[ch].name != NULL
3311 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Return true if CH is in general category Zs.  See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  if (is_category_Zs (ch))
    return true;
  return false;
}
/* Return true if CH is one of the explicitly listed non-breaking code
   points.  The original note ties this list to the characters' line
   breaking property.  See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3345 /* See PropList-3.0.1.txt. */
/* Predicate for ISO control characters.  One value tests whether the
   character has a name equal to "<control>"; the other is
   is_category_Cc (ch).  The two are compared at the end.
   NOTE(review): the declarations, the action on mismatch and the returned
   value are not visible in this excerpt -- confirm.  */
3347 is_property_iso_control (unsigned int ch)
3350 (unicode_attributes[ch].name != NULL
3351 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3353 is_category_Cc (ch);
3355 if (result1 != result2)
3360 /* See PropList-3.0.1.txt. */
/* Predicate for format control characters.  The visible conditions require
   general category Cf, bidi category UC_BIDI_BN, and that CH is not a join
   control.
   NOTE(review): the expression is not closed in this excerpt, so further
   conditions may follow -- confirm against the complete file.  */
3362 is_property_format_control (unsigned int ch)
3364 return (is_category_Cf (ch)
3365 && get_bidi_category (ch) == UC_BIDI_BN
3366 && !is_property_join_control (ch)
3370 /* See PropList.txt, UCD.html. */
3372 is_property_dash (unsigned int ch)
3374 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3377 /* See PropList.txt, UCD.html. */
3379 is_property_hyphen (unsigned int ch)
3381 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Return true if is_category_P holds for CH.  See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  if (is_category_P (ch))
    return true;
  return false;
}
/* Return true if CH is in general category Zl.  See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  if (is_category_Zl (ch))
    return true;
  return false;
}
/* Return true if CH is in general category Zp.  See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  if (is_category_Zp (ch))
    return true;
  return false;
}
3405 /* See PropList.txt, UCD.html. */
3407 is_property_quotation_mark (unsigned int ch)
3409 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3412 /* See PropList.txt, UCD.html. */
3414 is_property_sentence_terminal (unsigned int ch)
3416 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3419 /* See PropList.txt, UCD.html. */
3421 is_property_terminal_punctuation (unsigned int ch)
3423 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Return true if CH is in general category Sc.  See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  if (is_category_Sc (ch))
    return true;
  return false;
}
3433 /* See Unicode 3.0 book, section 4.9,
3434 PropList.txt, UCD.html,
3435 DerivedCoreProperties.txt, UCD.html. */
/* Predicate for the Math property.  One value ends with the Other_Math bit
   (its leading terms are not visible in this excerpt); the other is the
   Math bit (PROP_MATH) of unicode_properties.  The two are compared at the
   end.
   NOTE(review): the action on mismatch and the returned value are not
   visible here -- confirm.  */
3437 is_property_math (unsigned int ch)
3441 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3443 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3445 if (result1 != result2)
3450 /* See PropList.txt, UCD.html. */
3452 is_property_other_math (unsigned int ch)
3454 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3457 /* See PropList-3.0.1.txt. */
3459 is_property_paired_punctuation (unsigned int ch)
3461 return unicode_pairedpunctuation[ch];
3464 /* See PropList-3.0.1.txt. */
3466 is_property_left_of_pair (unsigned int ch)
3468 return unicode_leftofpair[ch];
3471 /* See PropList-3.0.1.txt. */
3473 is_property_combining (unsigned int ch)
3475 return (unicode_attributes[ch].name != NULL
3476 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3477 || is_category_Mc (ch)
3478 || is_category_Me (ch)
3479 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* Return true if CH has a name and its bidi category is UC_BIDI_NSM.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3492 /* See PropList-3.0.1.txt. */
3494 is_property_composite (unsigned int ch)
3496 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3497 logical in some sense. */
3498 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3500 if (unicode_attributes[ch].name != NULL
3501 && unicode_attributes[ch].decomposition != NULL)
3503 /* Test whether the decomposition contains more than one character,
3504 and the first is not a space. */
3505 const char *decomp = unicode_attributes[ch].decomposition;
3506 if (decomp[0] == '<')
3508 decomp = strchr (decomp, '>') + 1;
3509 if (decomp[0] == ' ')
3512 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Return true if CH is in general category Nd.  See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  if (is_category_Nd (ch))
    return true;
  return false;
}
3524 /* See PropList-3.0.1.txt. */
3526 is_property_numeric (unsigned int ch)
3528 return ((get_numeric_value (ch)).denominator > 0)
3529 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3530 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3533 /* See PropList.txt, UCD.html. */
3535 is_property_diacritic (unsigned int ch)
3537 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3540 /* See PropList.txt, UCD.html. */
3542 is_property_extender (unsigned int ch)
3544 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3547 /* See PropList-3.0.1.txt. */
/* Predicate for ignorable control characters.  The visible part accepts
   characters of general category Cc whose bidi category is UC_BIDI_BN, and
   characters of general category Cf.
   NOTE(review): the statement is not terminated in this excerpt, so a
   further condition may follow -- confirm against the complete file.  */
3549 is_property_ignorable_control (unsigned int ch)
3551 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3552 || is_category_Cf (ch))
3556 /* ------------------------------------------------------------------------- */
3558 /* Output all properties. */
/* For each property P listed below, PROPERTY(P) expands to three calls that
   all receive the predicate is_property_P:
     debug_output_predicate with the file name "unictype/pr_P.txt",
     output_predicate_test with "../tests/unictype/test-pr_P.c" and the
       expression text "uc_is_property_P (c)",
     output_predicate with "unictype/pr_P.h", the table name "u_property_P",
       the text "Properties" and VERSION.
   VERSION is forwarded to output_predicate only.
   NOTE(review): some PROPERTY rows appear to be missing from this excerpt
   (for example no row for uppercase or lowercase is visible) -- confirm
   against the complete file.  */
3560 output_properties (const char *version)
3562 #define PROPERTY(P) \
3563 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3564 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3565 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3566 PROPERTY(white_space)
3567 PROPERTY(alphabetic)
3568 PROPERTY(other_alphabetic)
3569 PROPERTY(not_a_character)
3570 PROPERTY(default_ignorable_code_point)
3571 PROPERTY(other_default_ignorable_code_point)
3572 PROPERTY(deprecated)
3573 PROPERTY(logical_order_exception)
3574 PROPERTY(variation_selector)
3575 PROPERTY(private_use)
3576 PROPERTY(unassigned_code_value)
3578 PROPERTY(other_uppercase)
3580 PROPERTY(other_lowercase)
3583 PROPERTY(case_ignorable)
3584 PROPERTY(changes_when_lowercased)
3585 PROPERTY(changes_when_uppercased)
3586 PROPERTY(changes_when_titlecased)
3587 PROPERTY(changes_when_casefolded)
3588 PROPERTY(changes_when_casemapped)
3589 PROPERTY(soft_dotted)
3591 PROPERTY(other_id_start)
3592 PROPERTY(id_continue)
3593 PROPERTY(other_id_continue)
3595 PROPERTY(xid_continue)
3596 PROPERTY(pattern_white_space)
3597 PROPERTY(pattern_syntax)
3598 PROPERTY(join_control)
3599 PROPERTY(grapheme_base)
3600 PROPERTY(grapheme_extend)
3601 PROPERTY(other_grapheme_extend)
3602 PROPERTY(grapheme_link)
3603 PROPERTY(bidi_control)
3604 PROPERTY(bidi_left_to_right)
3605 PROPERTY(bidi_hebrew_right_to_left)
3606 PROPERTY(bidi_arabic_right_to_left)
3607 PROPERTY(bidi_european_digit)
3608 PROPERTY(bidi_eur_num_separator)
3609 PROPERTY(bidi_eur_num_terminator)
3610 PROPERTY(bidi_arabic_digit)
3611 PROPERTY(bidi_common_separator)
3612 PROPERTY(bidi_block_separator)
3613 PROPERTY(bidi_segment_separator)
3614 PROPERTY(bidi_whitespace)
3615 PROPERTY(bidi_non_spacing_mark)
3616 PROPERTY(bidi_boundary_neutral)
3618 PROPERTY(bidi_embedding_or_override)
3619 PROPERTY(bidi_other_neutral)
3621 PROPERTY(ascii_hex_digit)
3622 PROPERTY(ideographic)
3623 PROPERTY(unified_ideograph)
3625 PROPERTY(ids_binary_operator)
3626 PROPERTY(ids_trinary_operator)
3627 PROPERTY(zero_width)
3630 PROPERTY(iso_control)
3631 PROPERTY(format_control)
3634 PROPERTY(punctuation)
3635 PROPERTY(line_separator)
3636 PROPERTY(paragraph_separator)
3637 PROPERTY(quotation_mark)
3638 PROPERTY(sentence_terminal)
3639 PROPERTY(terminal_punctuation)
3640 PROPERTY(currency_symbol)
3642 PROPERTY(other_math)
3643 PROPERTY(paired_punctuation)
3644 PROPERTY(left_of_pair)
3647 PROPERTY(decimal_digit)
3651 PROPERTY(ignorable_control)
3655 /* ========================================================================= */
3657 /* Arabic Shaping. */
/* Joining_Type enumerators and the per-code-point table that stores them.
   The table has 0x110000 uint8_t entries.  fill_arabicshaping first sets
   every entry to (uint8_t)~(uint8_t)0 and then stores one of these values
   for each parsed line; output_joining_type_test compares entries against
   that same all-ones value.
   NOTE(review): the enclosing enum declaration is not visible in this
   excerpt.  */
3661 UC_JOINING_TYPE_U, /* Non_Joining */
3662 UC_JOINING_TYPE_T, /* Transparent */
3663 UC_JOINING_TYPE_C, /* Join_Causing */
3664 UC_JOINING_TYPE_L, /* Left_Joining */
3665 UC_JOINING_TYPE_R, /* Right_Joining */
3666 UC_JOINING_TYPE_D /* Dual_Joining */
3669 static uint8_t unicode_joining_type[0x110000];
/* Joining_Group enumerators and the per-code-point table that stores them.
   The table has 0x110000 uint8_t entries.  fill_arabicshaping initializes
   every entry to UC_JOINING_GROUP_NONE and then stores the value matched
   from the joining group field of each parsed line.
   NOTE(review): the enclosing enum declaration is not visible in this
   excerpt.  */
3673 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
3674 UC_JOINING_GROUP_AIN, /* Ain */
3675 UC_JOINING_GROUP_ALAPH, /* Alaph */
3676 UC_JOINING_GROUP_ALEF, /* Alef */
3677 UC_JOINING_GROUP_BEH, /* Beh */
3678 UC_JOINING_GROUP_BETH, /* Beth */
3679 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
3680 UC_JOINING_GROUP_DAL, /* Dal */
3681 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
3682 UC_JOINING_GROUP_E, /* E */
3683 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
3684 UC_JOINING_GROUP_FE, /* Fe */
3685 UC_JOINING_GROUP_FEH, /* Feh */
3686 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
3687 UC_JOINING_GROUP_GAF, /* Gaf */
3688 UC_JOINING_GROUP_GAMAL, /* Gamal */
3689 UC_JOINING_GROUP_HAH, /* Hah */
3690 UC_JOINING_GROUP_HE, /* He */
3691 UC_JOINING_GROUP_HEH, /* Heh */
3692 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
3693 UC_JOINING_GROUP_HETH, /* Heth */
3694 UC_JOINING_GROUP_KAF, /* Kaf */
3695 UC_JOINING_GROUP_KAPH, /* Kaph */
3696 UC_JOINING_GROUP_KHAPH, /* Khaph */
3697 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
3698 UC_JOINING_GROUP_LAM, /* Lam */
3699 UC_JOINING_GROUP_LAMADH, /* Lamadh */
3700 UC_JOINING_GROUP_MEEM, /* Meem */
3701 UC_JOINING_GROUP_MIM, /* Mim */
3702 UC_JOINING_GROUP_NOON, /* Noon */
3703 UC_JOINING_GROUP_NUN, /* Nun */
3704 UC_JOINING_GROUP_NYA, /* Nya */
3705 UC_JOINING_GROUP_PE, /* Pe */
3706 UC_JOINING_GROUP_QAF, /* Qaf */
3707 UC_JOINING_GROUP_QAPH, /* Qaph */
3708 UC_JOINING_GROUP_REH, /* Reh */
3709 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
3710 UC_JOINING_GROUP_SAD, /* Sad */
3711 UC_JOINING_GROUP_SADHE, /* Sadhe */
3712 UC_JOINING_GROUP_SEEN, /* Seen */
3713 UC_JOINING_GROUP_SEMKATH, /* Semkath */
3714 UC_JOINING_GROUP_SHIN, /* Shin */
3715 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
3716 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
3717 UC_JOINING_GROUP_TAH, /* Tah */
3718 UC_JOINING_GROUP_TAW, /* Taw */
3719 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
3720 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
3721 UC_JOINING_GROUP_TETH, /* Teth */
3722 UC_JOINING_GROUP_WAW, /* Waw */
3723 UC_JOINING_GROUP_YEH, /* Yeh */
3724 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
3725 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
3726 UC_JOINING_GROUP_YUDH, /* Yudh */
3727 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
3728 UC_JOINING_GROUP_ZAIN, /* Zain */
3729 UC_JOINING_GROUP_ZHAIN /* Zhain */
3732 static uint8_t unicode_joining_group[0x110000];
/* Reads ARABICSHAPING_FILENAME and fills unicode_joining_type[] and
   unicode_joining_group[].
   Visible steps: the file is opened for reading; every table entry is reset
   ((uint8_t)~(uint8_t)0 for the joining type, UC_JOINING_GROUP_NONE for the
   joining group); lines of at most 100 characters are read; each data line
   is split into a hex code point and three ';'-separated fields (schematic
   name, joining type, joining group); the last two fields are mapped to
   enumerators and stored at the code point's index.
   NOTE(review): the loop structure, the handling of empty and '#' lines,
   the declarations of buf, lineno, joining_type and joining_group, and the
   actions after each error message are not visible in this excerpt --
   confirm against the complete file.  */
3735 fill_arabicshaping (const char *arabicshaping_filename)
3741 stream = fopen (arabicshaping_filename, "r");
3744 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
3748 for (i = 0; i < 0x110000; i++)
3750 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3751 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3758 char separator1[100+1];
3759 char padding1[100+1];
3760 char schematic_name[100+1];
3761 char separator2[100+1];
3762 char padding2[100+1];
3763 char joining_type_name[100+1];
3764 char separator3[100+1];
3765 char padding3[100+1];
3766 char joining_group_name[100+1];
3771 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
3774 if (buf[0] == '\0' || buf[0] == '#')
/* All ten conversions must succeed: the code point plus three groups of
   separator, padding and field text.  */
3777 if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
3778 &i, separator1, padding1, schematic_name, separator2,
3779 padding2, joining_type_name, separator3, padding3,
3780 joining_group_name) != 10)
3782 fprintf (stderr, "parse error in '%s':%d\n",
3783 arabicshaping_filename, lineno);
/* #name + 16 skips the 16-character prefix "UC_JOINING_TYPE_", so the field
   is compared against the bare suffix (U, T, C, L, R or D).  */
3789 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3791 TRY(UC_JOINING_TYPE_U)
3792 TRY(UC_JOINING_TYPE_T)
3793 TRY(UC_JOINING_TYPE_C)
3794 TRY(UC_JOINING_TYPE_L)
3795 TRY(UC_JOINING_TYPE_R)
3796 TRY(UC_JOINING_TYPE_D)
3800 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3801 joining_type_name, arabicshaping_filename, lineno);
3805 /* Remove trailing spaces. */
3806 while (joining_group_name[0] != '\0'
3807 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3808 joining_group_name[strlen (joining_group_name) - 1] = '\0';
/* The group names are matched exactly as the strings below: upper case with
   spaces, except for "No_Joining_Group".  */
3810 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3812 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3813 TRY(UC_JOINING_GROUP_AIN, "AIN")
3814 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
3815 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
3816 TRY(UC_JOINING_GROUP_BEH, "BEH")
3817 TRY(UC_JOINING_GROUP_BETH, "BETH")
3818 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
3819 TRY(UC_JOINING_GROUP_DAL, "DAL")
3820 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
3821 TRY(UC_JOINING_GROUP_E, "E")
3822 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
3823 TRY(UC_JOINING_GROUP_FE, "FE")
3824 TRY(UC_JOINING_GROUP_FEH, "FEH")
3825 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
3826 TRY(UC_JOINING_GROUP_GAF, "GAF")
3827 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
3828 TRY(UC_JOINING_GROUP_HAH, "HAH")
3829 TRY(UC_JOINING_GROUP_HE, "HE")
3830 TRY(UC_JOINING_GROUP_HEH, "HEH")
3831 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
3832 TRY(UC_JOINING_GROUP_HETH, "HETH")
3833 TRY(UC_JOINING_GROUP_KAF, "KAF")
3834 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
3835 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
3836 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
3837 TRY(UC_JOINING_GROUP_LAM, "LAM")
3838 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
3839 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
3840 TRY(UC_JOINING_GROUP_MIM, "MIM")
3841 TRY(UC_JOINING_GROUP_NOON, "NOON")
3842 TRY(UC_JOINING_GROUP_NUN, "NUN")
3843 TRY(UC_JOINING_GROUP_NYA, "NYA")
3844 TRY(UC_JOINING_GROUP_PE, "PE")
3845 TRY(UC_JOINING_GROUP_QAF, "QAF")
3846 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
3847 TRY(UC_JOINING_GROUP_REH, "REH")
3848 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
3849 TRY(UC_JOINING_GROUP_SAD, "SAD")
3850 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
3851 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
3852 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
3853 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
3854 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
3855 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
3856 TRY(UC_JOINING_GROUP_TAH, "TAH")
3857 TRY(UC_JOINING_GROUP_TAW, "TAW")
3858 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
3859 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
3860 TRY(UC_JOINING_GROUP_TETH, "TETH")
3861 TRY(UC_JOINING_GROUP_WAW, "WAW")
3862 TRY(UC_JOINING_GROUP_YEH, "YEH")
3863 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
3864 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
3865 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
3866 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
3867 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
3868 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
3872 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
3873 joining_group_name, arabicshaping_filename, lineno);
3877 unicode_joining_type[i] = joining_type;
3878 unicode_joining_group[i] = joining_group;
3881 if (ferror (stream) || fclose (stream))
3883 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
3888 /* Convert a Joining_Type value to a C identifier. */
/* Each TRY(value) returns the enumerator's own spelling as a string literal
   (#value) when JOINING_TYPE equals that enumerator, so the result is text
   such as "UC_JOINING_TYPE_U".
   NOTE(review): what happens when no enumerator matches is not visible in
   this excerpt -- confirm.  */
3890 joining_type_as_c_identifier (int joining_type)
3892 #define TRY(value) if (joining_type == value) return #value;
3893 TRY(UC_JOINING_TYPE_U)
3894 TRY(UC_JOINING_TYPE_T)
3895 TRY(UC_JOINING_TYPE_C)
3896 TRY(UC_JOINING_TYPE_L)
3897 TRY(UC_JOINING_TYPE_R)
3898 TRY(UC_JOINING_TYPE_D)
/* Write to FILENAME a generated list of "{ 0xXXXX, UC_JOINING_TYPE_x }"
   entries, one for every code point whose unicode_joining_type[] entry
   differs from the "no value" marker (uint8_t)~(uint8_t)0.  Entries are
   separated by ",\n".  VERSION is presumably the Unicode version inserted
   into the header comment (the argument line is not visible in this
   extract).  Open and write failures are reported on stderr.  */
3904 output_joining_type_test (const char *filename, const char *version)
3910 stream = fopen (filename, "w");
3913 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3917 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3918 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
3919 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Visit every Unicode code point, in increasing order.  */
3923 for (ch = 0; ch < 0x110000; ch++)
3925 int value = unicode_joining_type[ch];
/* Skip code points that carry the all-ones "no value" marker.  */
3927 if (value != (uint8_t)~(uint8_t)0)
3930 fprintf (stream, ",\n");
3931 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
3936 fprintf (stream, "\n");
/* fclose is part of the check so that flush errors are detected too.  */
3938 if (ferror (stream) || fclose (stream))
3940 fprintf (stderr, "error writing to '%s'\n", filename);
/* Parameters for a sparse 3-level table type named joining_type_table with
   uint8_t elements; unset elements read as the all-ones byte.  The code
   below uses joining_type_table_init/_add/_finalize, so these macros are
   presumably consumed by a template header included right after them
   (that include is not visible in this extract).  Allocation is mapped
   to plain malloc/realloc.  */
3945 /* Construction of sparse 3-level tables. */
3946 #define TABLE joining_type_table
3947 #define ELEMENT uint8_t
3948 #define DEFAULT (uint8_t)~(uint8_t)0
3949 #define xmalloc malloc
3950 #define xrealloc realloc
/* Write to FILENAME a C source fragment defining the 3-level lookup table
   u_joining_type, which maps every code point to its entry in
   unicode_joining_type[].  The fragment holds five header #defines
   followed by a struct with level1 (int), level2 (short) and level3
   (unsigned char, two 4-bit entries per byte) arrays.  VERSION is
   presumably the Unicode version inserted into the header comment.
   Open and write failures are reported on stderr.  */
3954 output_joining_type (const char *filename, const char *version)
3958 struct joining_type_table t;
3959 unsigned int level1_offset, level2_offset, level3_offset;
3960 uint8_t *level3_packed;
3962 stream = fopen (filename, "w");
3965 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3969 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3970 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
3971 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Build the table: every code point is added, including those holding
   the all-ones "no value" marker.  */
3976 joining_type_table_init (&t);
3978 for (ch = 0; ch < 0x110000; ch++)
3980 uint8_t value = unicode_joining_type[ch];
3982 joining_type_table_add (&t, ch, value);
3985 joining_type_table_finalize (&t);
/* t.result layout as used below: 5 uint32_t header words, then the level1
   array, then the level2 array (both uint32_t entries), then level3.
   The three expressions that follow are the byte offsets of level1,
   level2 and level3 (the assignment targets are on lines not visible
   in this extract).  */
3987 /* Offsets in t.result, in memory of this process. */
3989 5 * sizeof (uint32_t);
3991 5 * sizeof (uint32_t)
3992 + t.level1_size * sizeof (uint32_t);
3994 5 * sizeof (uint32_t)
3995 + t.level1_size * sizeof (uint32_t)
3996 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
3998 for (i = 0; i < 5; i++)
3999 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4000 ((uint32_t *) t.result)[i]);
4001 fprintf (stream, "static const\n");
4002 fprintf (stream, "struct\n");
4003 fprintf (stream, " {\n");
4004 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4005 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4006 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4007 (1 << t.p) * 4 / 8);
4008 fprintf (stream, " }\n");
4009 fprintf (stream, "u_joining_type =\n");
4010 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  Each entry is either -1 or
   the level2 index derived from the stored byte offset; the condition
   selecting between the two is on a line not visible in this extract.  */
4011 fprintf (stream, " {");
4012 if (t.level1_size > 8)
4013 fprintf (stream, "\n ");
4014 for (i = 0; i < t.level1_size; i++)
4017 if (i > 0 && (i % 8) == 0)
4018 fprintf (stream, "\n ");
4019 offset = ((uint32_t *) (t.result + level1_offset))[i];
4021 fprintf (stream, " %5d", -1);
4023 fprintf (stream, " %5zu",
4024 (offset - level2_offset) / sizeof (uint32_t));
4025 if (i+1 < t.level1_size)
4026 fprintf (stream, ",");
4028 if (t.level1_size > 8)
4029 fprintf (stream, "\n ");
4030 fprintf (stream, " },\n");
/* Level 2: same scheme; offsets are converted to level3 element indices
   (one byte per unpacked element).  */
4031 fprintf (stream, " {");
4032 if (t.level2_size << t.q > 8)
4033 fprintf (stream, "\n ");
4034 for (i = 0; i < t.level2_size << t.q; i++)
4037 if (i > 0 && (i % 8) == 0)
4038 fprintf (stream, "\n ");
4039 offset = ((uint32_t *) (t.result + level2_offset))[i];
4041 fprintf (stream, " %5d", -1);
4043 fprintf (stream, " %5zu",
4044 (offset - level3_offset) / sizeof (uint8_t));
4045 if (i+1 < t.level2_size << t.q)
4046 fprintf (stream, ",");
4048 if (t.level2_size << t.q > 8)
4049 fprintf (stream, "\n ");
4050 fprintf (stream, " },\n");
/* Level 3: two entries share one output byte.  Entry i goes to byte i/2,
   in the low nibble when i is even and the high nibble when i is odd.
   Values are masked to 4 bits, so the all-ones marker becomes 0x0f.
   NOTE(review): the calloc result is used without a visible NULL check.  */
4051 /* Pack the level3 array. Each entry needs 4 bits only. */
4053 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4054 for (i = 0; i < t.level3_size << t.p; i++)
4056 unsigned int j = (i * 4) / 8;
4057 unsigned int k = (i * 4) % 8;
4058 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4059 level3_packed[j] |= (value << k);
/* Emit the packed bytes, eight per output line.  */
4061 fprintf (stream, " {");
4062 if ((t.level3_size << t.p) * 4 / 8 > 8)
4063 fprintf (stream, "\n ");
4064 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4066 if (i > 0 && (i % 8) == 0)
4067 fprintf (stream, "\n ");
4068 fprintf (stream, " 0x%02x", level3_packed[i]);
4069 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4070 fprintf (stream, ",");
4072 if ((t.level3_size << t.p) * 4 / 8 > 8)
4073 fprintf (stream, "\n ");
4074 fprintf (stream, " }\n");
4075 free (level3_packed);
4076 fprintf (stream, "};\n");
/* fclose is part of the check so that flush errors are detected too.  */
4078 if (ferror (stream) || fclose (stream))
4080 fprintf (stderr, "error writing to '%s'\n", filename);
4085 /* Convert a Joining_Group value to a C identifier. */
4087 joining_group_as_c_identifier (int joining_group)
4089 #define TRY(value) if (joining_group == value) return #value;
4090 TRY(UC_JOINING_GROUP_NONE)
4091 TRY(UC_JOINING_GROUP_AIN)
4092 TRY(UC_JOINING_GROUP_ALAPH)
4093 TRY(UC_JOINING_GROUP_ALEF)
4094 TRY(UC_JOINING_GROUP_BEH)
4095 TRY(UC_JOINING_GROUP_BETH)
4096 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4097 TRY(UC_JOINING_GROUP_DAL)
4098 TRY(UC_JOINING_GROUP_DALATH_RISH)
4099 TRY(UC_JOINING_GROUP_E)
4100 TRY(UC_JOINING_GROUP_FARSI_YEH)
4101 TRY(UC_JOINING_GROUP_FE)
4102 TRY(UC_JOINING_GROUP_FEH)
4103 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4104 TRY(UC_JOINING_GROUP_GAF)
4105 TRY(UC_JOINING_GROUP_GAMAL)
4106 TRY(UC_JOINING_GROUP_HAH)
4107 TRY(UC_JOINING_GROUP_HE)
4108 TRY(UC_JOINING_GROUP_HEH)
4109 TRY(UC_JOINING_GROUP_HEH_GOAL)
4110 TRY(UC_JOINING_GROUP_HETH)
4111 TRY(UC_JOINING_GROUP_KAF)
4112 TRY(UC_JOINING_GROUP_KAPH)
4113 TRY(UC_JOINING_GROUP_KHAPH)
4114 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4115 TRY(UC_JOINING_GROUP_LAM)
4116 TRY(UC_JOINING_GROUP_LAMADH)
4117 TRY(UC_JOINING_GROUP_MEEM)
4118 TRY(UC_JOINING_GROUP_MIM)
4119 TRY(UC_JOINING_GROUP_NOON)
4120 TRY(UC_JOINING_GROUP_NUN)
4121 TRY(UC_JOINING_GROUP_NYA)
4122 TRY(UC_JOINING_GROUP_PE)
4123 TRY(UC_JOINING_GROUP_QAF)
4124 TRY(UC_JOINING_GROUP_QAPH)
4125 TRY(UC_JOINING_GROUP_REH)
4126 TRY(UC_JOINING_GROUP_REVERSED_PE)
4127 TRY(UC_JOINING_GROUP_SAD)
4128 TRY(UC_JOINING_GROUP_SADHE)
4129 TRY(UC_JOINING_GROUP_SEEN)
4130 TRY(UC_JOINING_GROUP_SEMKATH)
4131 TRY(UC_JOINING_GROUP_SHIN)
4132 TRY(UC_JOINING_GROUP_SWASH_KAF)
4133 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4134 TRY(UC_JOINING_GROUP_TAH)
4135 TRY(UC_JOINING_GROUP_TAW)
4136 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4137 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4138 TRY(UC_JOINING_GROUP_TETH)
4139 TRY(UC_JOINING_GROUP_WAW)
4140 TRY(UC_JOINING_GROUP_YEH)
4141 TRY(UC_JOINING_GROUP_YEH_BARREE)
4142 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4143 TRY(UC_JOINING_GROUP_YUDH)
4144 TRY(UC_JOINING_GROUP_YUDH_HE)
4145 TRY(UC_JOINING_GROUP_ZAIN)
4146 TRY(UC_JOINING_GROUP_ZHAIN)
/* Write to FILENAME a generated list of "{ 0xXXXX, UC_JOINING_GROUP_x }"
   entries, one for every code point whose unicode_joining_group[] entry
   is not UC_JOINING_GROUP_NONE.  Entries are separated by ",\n".  VERSION
   is presumably the Unicode version inserted into the header comment (the
   argument line is not visible in this extract).  Open and write failures
   are reported on stderr.  */
4152 output_joining_group_test (const char *filename, const char *version)
4158 stream = fopen (filename, "w");
4161 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4165 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4166 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4167 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Visit every Unicode code point, in increasing order.  */
4171 for (ch = 0; ch < 0x110000; ch++)
4173 int value = unicode_joining_group[ch];
/* Only code points with an actual joining group are listed.  */
4175 if (value != UC_JOINING_GROUP_NONE)
4178 fprintf (stream, ",\n");
4179 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4184 fprintf (stream, "\n");
/* fclose is part of the check so that flush errors are detected too.  */
4186 if (ferror (stream) || fclose (stream))
4188 fprintf (stderr, "error writing to '%s'\n", filename);
4194 output_joining_group (const char *filename, const char *version)
4197 unsigned int ch_min, ch_max, ch, i;
4199 stream = fopen (filename, "w");
4202 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4206 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4207 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4208 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4212 for (ch = 0; ch < 0x110000; ch++)
4213 if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
4220 for (ch = 0x10FFFF; ch > 0; ch--)
4221 if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
4227 if (!(ch_min <= ch_max))
4230 /* If the interval [ch_min, ch_max] is too large, we should better use a
4232 if (!(ch_max - ch_min < 0x200))
4235 fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
4236 fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
4237 ch_max + 1, ch_min);
4238 fprintf (stream, "{");
4239 for (i = 0; i <= ch_max - ch_min; i++)
4245 fprintf (stream, "\n ");
4246 s = joining_group_as_c_identifier (unicode_joining_group[ch]);
4247 fprintf (stream, " %s", s);
4248 if (i+1 <= ch_max - ch_min)
4250 fprintf (stream, ",");
4251 if (((i+1) % 2) != 0)
4252 fprintf (stream, "%*s", 38 - (int) strlen (s), "");
4255 fprintf (stream, "\n");
4256 fprintf (stream, "};\n");
4258 if (ferror (stream) || fclose (stream))
4260 fprintf (stderr, "error writing to '%s'\n", filename);
4265 /* ========================================================================= */
/* Names of the scripts seen so far; fill_scripts appends each new name in
   order of first appearance.  */
4269 static const char *scripts[256];
/* Number of valid entries in scripts[].  */
4270 static unsigned int numscripts;
/* For each code point, the index of its script in scripts[], or the
   all-ones byte (uint8_t)~(uint8_t)0 if it belongs to none.  */
4272 static uint8_t unicode_scripts[0x110000];
/* Read the script assignments from SCRIPTS_FILENAME into scripts[],
   numscripts and unicode_scripts[].  Every unicode_scripts[] entry starts
   as the all-ones "no script" marker.  Empty lines and lines starting
   with '#' are tested for specially (presumably skipped).  Each remaining
   line must be "XXXX..YYYY ; Name" or "XXXX ; Name"; Name extends up to
   the first space.  A code point assigned twice triggers a diagnostic on
   stderr and the later assignment wins.  Open, parse and read failures
   are reported on stderr.  */
4275 fill_scripts (const char *scripts_filename)
4280 stream = fopen (scripts_filename, "r");
4283 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Mark every code point as having no script yet.  */
4289 for (i = 0; i < 0x110000; i++)
4290 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
4295 unsigned int i1, i2;
4296 char padding[200+1];
4297 char scriptname[200+1];
/* Read one line (at most 200 characters) and consume its newline.  */
4300 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4303 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the %[ ;] and %[^ ] conversions have no field width; they
   are bounded only through the 200-character limit on buf, assuming buf
   is declared with at least 201 bytes -- declaration not visible.  */
4306 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
4308 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
4310 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the known scripts, newest first.  The loop
   condition script >= 0 requires script to have a signed type
   (declaration not visible).  */
4320 for (script = numscripts - 1; script >= 0; script--)
4321 if (strcmp (scripts[script], scriptname) == 0)
/* Unknown name: append a private copy to scripts[].  */
4325 scripts[numscripts] = strdup (scriptname);
4326 script = numscripts;
/* Guard against exceeding the 256 slots of scripts[]; an index also has
   to stay distinct from the all-ones marker byte.  */
4328 if (numscripts == 256)
/* NOTE(review): no check that i2 < 0x110000 is visible before indexing.  */
4332 for (i = i1; i <= i2; i++)
4334 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
4335 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
4336 unicode_scripts[i] = script;
/* fclose is part of the check so that close errors are detected too.  */
4340 if (ferror (stream) || fclose (stream))
4342 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
/* Parameters for a sparse 3-level table type named script_table with
   uint8_t elements; unset elements read as the all-ones byte.  The code
   below uses script_table_init/_add/_finalize, so these macros are
   presumably consumed by a template header included right after them
   (that include is not visible in this extract).  Allocation is mapped
   to plain malloc/realloc.  */
4347 /* Construction of sparse 3-level tables. */
4348 #define TABLE script_table
4349 #define ELEMENT uint8_t
4350 #define DEFAULT (uint8_t)~(uint8_t)0
4351 #define xmalloc malloc
4352 #define xrealloc realloc
/* Write the file "unictype/scripts.h".  It contains, in this order:
   one uc_interval_t array script_<name>_intervals per script, the
   uc_script_t array scripts[] describing all scripts, and the 3-level
   lookup table u_script mapping each code point to its script index.
   VERSION is presumably the Unicode version inserted into the header
   comment.  Open and write failures are reported on stderr.  */
4356 output_scripts (const char *version)
4358 const char *filename = "unictype/scripts.h";
4360 unsigned int ch, s, i;
4361 struct script_table t;
4362 unsigned int level1_offset, level2_offset, level3_offset;
4366 const char *lowercase_name;
4369 scriptinfo_t scriptinfo[256];
4371 stream = fopen (filename, "w");
4374 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4378 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4379 fprintf (stream, "/* Unicode scripts. */\n");
4380 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Derive, for each script, a name used in generated identifiers, from a
   copy of the script name with ASCII uppercase letters converted (the
   conversion statement itself is not visible in this extract).
   NOTE(review): no free of these copies is visible in this function.  */
4383 for (s = 0; s < numscripts; s++)
4385 char *lcp = strdup (scripts[s]);
4388 for (cp = lcp; *cp != '\0'; cp++)
4389 if (*cp >= 'A' && *cp <= 'Z')
4392 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script.  Each maximal run of consecutive
   code points of script s yields either one "{ start, 1, 1 }" entry or a
   "{ start, 1, 0 }, { end, 0, 1 }" pair.  */
4395 for (s = 0; s < numscripts; s++)
4397 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4398 scriptinfo[s].lowercase_name);
4399 fprintf (stream, "{\n");
4401 for (ch = 0; ch < 0x110000; ch++)
4402 if (unicode_scripts[ch] == s)
/* Extend the run for as long as the next code point has the same script.  */
4408 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4413 fprintf (stream, ",\n");
4415 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4417 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4421 fprintf (stream, "\n");
4422 fprintf (stream, "};\n");
/* Emit the array of script descriptors: interval count, interval array
   and the script name with its original spelling.  */
4425 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4426 fprintf (stream, "{\n");
4427 for (s = 0; s < numscripts; s++)
4429 fprintf (stream, " {\n");
4430 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4431 scriptinfo[s].lowercase_name);
4432 fprintf (stream, " script_%s_intervals,\n",
4433 scriptinfo[s].lowercase_name);
4434 fprintf (stream, " \"%s\"\n", scripts[s]);
4435 fprintf (stream, " }");
4436 if (s+1 < numscripts)
4437 fprintf (stream, ",");
4438 fprintf (stream, "\n");
4440 fprintf (stream, "};\n");
/* Build the code point -> script index table.  Code points without a
   script are not added.  The inner declaration of s hides the s declared
   at the top of the function.  */
4444 script_table_init (&t);
4446 for (ch = 0; ch < 0x110000; ch++)
4448 unsigned int s = unicode_scripts[ch];
4449 if (s != (uint8_t)~(uint8_t)0)
4450 script_table_add (&t, ch, s);
4453 script_table_finalize (&t);
/* t.result layout as used below: 5 uint32_t header words, then the level1
   array, then the level2 array (both uint32_t entries), then level3.
   The three expressions that follow are the byte offsets of level1,
   level2 and level3 (the assignment targets are on lines not visible
   in this extract).  */
4455 /* Offsets in t.result, in memory of this process. */
4457 5 * sizeof (uint32_t);
4459 5 * sizeof (uint32_t)
4460 + t.level1_size * sizeof (uint32_t);
4462 5 * sizeof (uint32_t)
4463 + t.level1_size * sizeof (uint32_t)
4464 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
4466 for (i = 0; i < 5; i++)
4467 fprintf (stream, "#define script_header_%d %d\n", i,
4468 ((uint32_t *) t.result)[i]);
4469 fprintf (stream, "static const\n");
4470 fprintf (stream, "struct\n");
4471 fprintf (stream, " {\n");
4472 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4473 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4474 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4475 fprintf (stream, " }\n");
4476 fprintf (stream, "u_script =\n");
4477 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  Each entry is either -1 or
   the level2 index derived from the stored byte offset; the condition
   selecting between the two is on a line not visible in this extract.  */
4478 fprintf (stream, " {");
4479 if (t.level1_size > 8)
4480 fprintf (stream, "\n ");
4481 for (i = 0; i < t.level1_size; i++)
4484 if (i > 0 && (i % 8) == 0)
4485 fprintf (stream, "\n ");
4486 offset = ((uint32_t *) (t.result + level1_offset))[i];
4488 fprintf (stream, " %5d", -1);
4490 fprintf (stream, " %5zu",
4491 (offset - level2_offset) / sizeof (uint32_t));
4492 if (i+1 < t.level1_size)
4493 fprintf (stream, ",");
4495 if (t.level1_size > 8)
4496 fprintf (stream, "\n ");
4497 fprintf (stream, " },\n");
/* Level 2: same scheme; offsets are converted to level3 element indices.  */
4498 fprintf (stream, " {");
4499 if (t.level2_size << t.q > 8)
4500 fprintf (stream, "\n ");
4501 for (i = 0; i < t.level2_size << t.q; i++)
4504 if (i > 0 && (i % 8) == 0)
4505 fprintf (stream, "\n ");
4506 offset = ((uint32_t *) (t.result + level2_offset))[i];
4508 fprintf (stream, " %5d", -1);
4510 fprintf (stream, " %5zu",
4511 (offset - level3_offset) / sizeof (uint8_t));
4512 if (i+1 < t.level2_size << t.q)
4513 fprintf (stream, ",");
4515 if (t.level2_size << t.q > 8)
4516 fprintf (stream, "\n ");
4517 fprintf (stream, " },\n");
/* Level 3: one byte per code point, written unpacked as decimal numbers.  */
4518 fprintf (stream, " {");
4519 if (t.level3_size << t.p > 8)
4520 fprintf (stream, "\n ");
4521 for (i = 0; i < t.level3_size << t.p; i++)
4523 if (i > 0 && (i % 8) == 0)
4524 fprintf (stream, "\n ");
4525 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4526 if (i+1 < t.level3_size << t.p)
4527 fprintf (stream, ",");
4529 if (t.level3_size << t.p > 8)
4530 fprintf (stream, "\n ");
4531 fprintf (stream, " }\n");
4532 fprintf (stream, "};\n");
/* fclose is part of the check so that flush errors are detected too.  */
4534 if (ferror (stream) || fclose (stream))
4536 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the gperf input file "unictype/scripts_byname.gperf".  After the
   gperf declarations, it holds one "name, index" keyword line per entry
   of scripts[], where index is the position in scripts[].  VERSION is
   presumably the Unicode version inserted into the header comment.
   Open and write failures are reported on stderr.  */
4542 output_scripts_byname (const char *version)
4544 const char *filename = "unictype/scripts_byname.gperf";
4548 stream = fopen (filename, "w");
4551 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4555 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4556 fprintf (stream, "/* Unicode scripts. */\n");
4557 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* gperf declarations.  "%%" in a format string yields a single '%' in
   the output, so these lines come out as %struct-type etc.  */
4559 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4560 fprintf (stream, "%%struct-type\n");
4561 fprintf (stream, "%%language=ANSI-C\n");
4562 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4563 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4564 fprintf (stream, "%%readonly-tables\n");
4565 fprintf (stream, "%%global-table\n");
4566 fprintf (stream, "%%define word-array-name script_names\n");
4567 fprintf (stream, "%%pic\n");
4568 fprintf (stream, "%%define string-pool-name script_stringpool\n");
/* The "%%" separator line that starts the keyword section.  */
4569 fprintf (stream, "%%%%\n");
4570 for (s = 0; s < numscripts; s++)
4571 fprintf (stream, "%s, %u\n", scripts[s], s);
/* fclose is part of the check so that flush errors are detected too.  */
4573 if (ferror (stream) || fclose (stream))
4575 fprintf (stderr, "error writing to '%s'\n", filename);
4580 /* ========================================================================= */
/* One block: the code point range [start, end] (both bounds are compared
   inclusively by the lookup functions) and the block's name.  The typedef
   name is on a line not visible in this extract; the array below uses
   block_t.  */
4584 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* The blocks read by fill_blocks, in file order.  */
4586 static block_t blocks[256];
/* Number of valid entries in blocks[].  */
4587 static unsigned int numblocks;
/* Read the block definitions from BLOCKS_FILENAME into blocks[] and
   numblocks.  Empty lines and lines starting with '#' are tested for
   specially (presumably skipped).  Each remaining line must have the form
   "XXXX..YYYY; Name"; Name extends up to a carriage return or the end of
   the line, so it may contain spaces.  The blocks must appear in
   increasing, non-overlapping order.  Open, parse and read failures are
   reported on stderr.  */
4590 fill_blocks (const char *blocks_filename)
4594 stream = fopen (blocks_filename, "r");
4597 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
4604 unsigned int i1, i2;
4605 char padding[200+1];
4606 char blockname[200+1];
/* Read one line (at most 200 characters) and consume its newline.  */
4608 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4611 if (buf[0] == '\0' || buf[0] == '#')
/* NOTE(review): the %[ ;] and %[^\r] conversions have no field width; they
   are bounded only through the 200-character limit on buf, assuming buf
   is declared with at least 201 bytes -- declaration not visible.  */
4614 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4616 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* Store the block; the name is kept as a private copy.  */
4619 blocks[numblocks].start = i1;
4620 blocks[numblocks].end = i2;
4621 blocks[numblocks].name = strdup (blockname);
/* The binary searches over blocks[] depend on this ordering.  */
4622 /* It must be sorted. */
4623 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* Guard against exceeding the 256 slots of blocks[].  */
4626 if (numblocks == 256)
/* fclose is part of the check so that close errors are detected too.  */
4630 if (ferror (stream) || fclose (stream))
4632 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4637 /* Return the smallest block index among the blocks for characters >= ch. */
4639 block_first_index (unsigned int ch)
4641 /* Binary search. */
4642 unsigned int lo = 0;
4643 unsigned int hi = numblocks;
4645 All blocks[i], i < lo, have blocks[i].end < ch,
4646 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4649 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4650 if (blocks[mid].end < ch)
4658 /* Return the largest block index among the blocks for characters <= ch,
4661 block_last_index (unsigned int ch)
4663 /* Binary search. */
4664 unsigned int lo = 0;
4665 unsigned int hi = numblocks;
4667 All blocks[i], i < lo, have blocks[i].start <= ch,
4668 all blocks[i], i >= hi, have blocks[i].start > ch. */
4671 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4672 if (blocks[mid].start <= ch)
/* Write the file "unictype/blocks.h".  It contains the array blocks[] of
   all blocks, and the array blocks_level1[] which holds, for every group
   of 2^shift consecutive code points below the threshold, the pair
   (block_first_index, block_last_index) delimiting the blocks[] entries
   that can overlap that group.  For code points at or above the
   threshold, a single such pair is emitted as the macros
   blocks_upper_first_index and blocks_upper_last_index.  VERSION is
   presumably the Unicode version inserted into the header comment.
   Open and write failures are reported on stderr.  */
4681 output_blocks (const char *version)
4683 const char *filename = "unictype/blocks.h";
4684 const unsigned int shift = 8; /* bits to shift away for array access */
4685 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
4690 stream = fopen (filename, "w");
4693 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4697 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4698 fprintf (stream, "/* Unicode blocks. */\n");
4699 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* The block list itself: start, end and name of every block.  */
4702 fprintf (stream, "static const uc_block_t blocks[] =\n");
4703 fprintf (stream, "{\n");
4704 for (i = 0; i < numblocks; i++)
4706 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4707 blocks[i].end, blocks[i].name);
4708 if (i+1 < numblocks)
4709 fprintf (stream, ",");
4710 fprintf (stream, "\n");
4712 fprintf (stream, "};\n");
/* First-level index: two uint8_t values per group of 2^shift code points.
   NOTE(review): the unsigned values below are printed with %d.  */
4713 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4714 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4715 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4716 threshold >> shift);
4717 fprintf (stream, "{\n");
4718 for (i1 = 0; i1 < (threshold >> shift); i1++)
/* Index bounds for the group's first and last code point.  */
4720 unsigned int first_index = block_first_index (i1 << shift);
4721 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4722 fprintf (stream, " %3d, %3d", first_index, last_index);
4723 if (i1+1 < (threshold >> shift))
4724 fprintf (stream, ",");
4725 fprintf (stream, "\n");
4727 fprintf (stream, "};\n");
/* Index bounds covering everything from the threshold up to 0x10FFFF.  */
4728 fprintf (stream, "#define blocks_upper_first_index %d\n",
4729 block_first_index (threshold));
4730 fprintf (stream, "#define blocks_upper_last_index %d\n",
4731 block_last_index (0x10FFFF));
/* fclose is part of the check so that flush errors are detected too.  */
4733 if (ferror (stream) || fclose (stream))
4735 fprintf (stderr, "error writing to '%s'\n", filename);
4740 /* ========================================================================= */
4742 /* C and Java syntax. */
/* Identifier syntax categories, as returned by c_ident_category and
   java_ident_category.  The enclosing declaration (presumably an enum) is
   on lines not visible in this extract.  */
4746 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4747 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4748 UC_IDENTIFIER_INVALID, /* not valid */
4749 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* ISO C 99 section 6.4.(3).
   Return true if CH is a white-space character of the C language: space,
   horizontal tab, new-line, carriage return, vertical tab or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
/* Classify CH for use in C identifiers.  ASCII digits are
   UC_IDENTIFIER_VALID (allowed, but not as first character); ASCII
   letters and '_' are UC_IDENTIFIER_START.  A code point matched by the
   long list of ranges below is UC_IDENTIFIER_START as well; everything
   else is UC_IDENTIFIER_INVALID.  The script labels added below are
   derived from the code point ranges.  Several lines of the list
   (presumably single code point tests) are not visible in this
   extract.  */
4763 /* ISO C 99 section 6.4.2.1 and appendix D. */
4765 c_ident_category (unsigned int ch)
4767 /* Section 6.4.2.1. */
4768 if (ch >= '0' && ch <= '9')
4769 return UC_IDENTIFIER_VALID;
4770 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4771 return UC_IDENTIFIER_START;
/* Latin */
4777 || (ch >= 0x00C0 && ch <= 0x00D6)
4778 || (ch >= 0x00D8 && ch <= 0x00F6)
4779 || (ch >= 0x00F8 && ch <= 0x01F5)
4780 || (ch >= 0x01FA && ch <= 0x0217)
4781 || (ch >= 0x0250 && ch <= 0x02A8)
4782 || (ch >= 0x1E00 && ch <= 0x1E9B)
4783 || (ch >= 0x1EA0 && ch <= 0x1EF9)
/* Greek */
4787 || (ch >= 0x0388 && ch <= 0x038A)
4789 || (ch >= 0x038E && ch <= 0x03A1)
4790 || (ch >= 0x03A3 && ch <= 0x03CE)
4791 || (ch >= 0x03D0 && ch <= 0x03D6)
4796 || (ch >= 0x03E2 && ch <= 0x03F3)
4797 || (ch >= 0x1F00 && ch <= 0x1F15)
4798 || (ch >= 0x1F18 && ch <= 0x1F1D)
4799 || (ch >= 0x1F20 && ch <= 0x1F45)
4800 || (ch >= 0x1F48 && ch <= 0x1F4D)
4801 || (ch >= 0x1F50 && ch <= 0x1F57)
4805 || (ch >= 0x1F5F && ch <= 0x1F7D)
4806 || (ch >= 0x1F80 && ch <= 0x1FB4)
4807 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4808 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4809 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4810 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4811 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4812 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4813 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4814 || (ch >= 0x1FF6 && ch <= 0x1FFC)
/* Cyrillic */
4816 || (ch >= 0x0401 && ch <= 0x040C)
4817 || (ch >= 0x040E && ch <= 0x044F)
4818 || (ch >= 0x0451 && ch <= 0x045C)
4819 || (ch >= 0x045E && ch <= 0x0481)
4820 || (ch >= 0x0490 && ch <= 0x04C4)
4821 || (ch >= 0x04C7 && ch <= 0x04C8)
4822 || (ch >= 0x04CB && ch <= 0x04CC)
4823 || (ch >= 0x04D0 && ch <= 0x04EB)
4824 || (ch >= 0x04EE && ch <= 0x04F5)
4825 || (ch >= 0x04F8 && ch <= 0x04F9)
/* Armenian */
4827 || (ch >= 0x0531 && ch <= 0x0556)
4828 || (ch >= 0x0561 && ch <= 0x0587)
/* Hebrew */
4830 || (ch >= 0x05B0 && ch <= 0x05B9)
4831 || (ch >= 0x05BB && ch <= 0x05BD)
4833 || (ch >= 0x05C1 && ch <= 0x05C2)
4834 || (ch >= 0x05D0 && ch <= 0x05EA)
4835 || (ch >= 0x05F0 && ch <= 0x05F2)
/* Arabic */
4837 || (ch >= 0x0621 && ch <= 0x063A)
4838 || (ch >= 0x0640 && ch <= 0x0652)
4839 || (ch >= 0x0670 && ch <= 0x06B7)
4840 || (ch >= 0x06BA && ch <= 0x06BE)
4841 || (ch >= 0x06C0 && ch <= 0x06CE)
4842 || (ch >= 0x06D0 && ch <= 0x06DC)
4843 || (ch >= 0x06E5 && ch <= 0x06E8)
4844 || (ch >= 0x06EA && ch <= 0x06ED)
/* Devanagari */
4846 || (ch >= 0x0901 && ch <= 0x0903)
4847 || (ch >= 0x0905 && ch <= 0x0939)
4848 || (ch >= 0x093E && ch <= 0x094D)
4849 || (ch >= 0x0950 && ch <= 0x0952)
4850 || (ch >= 0x0958 && ch <= 0x0963)
/* Bengali */
4852 || (ch >= 0x0981 && ch <= 0x0983)
4853 || (ch >= 0x0985 && ch <= 0x098C)
4854 || (ch >= 0x098F && ch <= 0x0990)
4855 || (ch >= 0x0993 && ch <= 0x09A8)
4856 || (ch >= 0x09AA && ch <= 0x09B0)
4858 || (ch >= 0x09B6 && ch <= 0x09B9)
4859 || (ch >= 0x09BE && ch <= 0x09C4)
4860 || (ch >= 0x09C7 && ch <= 0x09C8)
4861 || (ch >= 0x09CB && ch <= 0x09CD)
4862 || (ch >= 0x09DC && ch <= 0x09DD)
4863 || (ch >= 0x09DF && ch <= 0x09E3)
4864 || (ch >= 0x09F0 && ch <= 0x09F1)
/* Gurmukhi */
4867 || (ch >= 0x0A05 && ch <= 0x0A0A)
4868 || (ch >= 0x0A0F && ch <= 0x0A10)
4869 || (ch >= 0x0A13 && ch <= 0x0A28)
4870 || (ch >= 0x0A2A && ch <= 0x0A30)
4871 || (ch >= 0x0A32 && ch <= 0x0A33)
4872 || (ch >= 0x0A35 && ch <= 0x0A36)
4873 || (ch >= 0x0A38 && ch <= 0x0A39)
4874 || (ch >= 0x0A3E && ch <= 0x0A42)
4875 || (ch >= 0x0A47 && ch <= 0x0A48)
4876 || (ch >= 0x0A4B && ch <= 0x0A4D)
4877 || (ch >= 0x0A59 && ch <= 0x0A5C)
/* Gujarati */
4881 || (ch >= 0x0A81 && ch <= 0x0A83)
4882 || (ch >= 0x0A85 && ch <= 0x0A8B)
4884 || (ch >= 0x0A8F && ch <= 0x0A91)
4885 || (ch >= 0x0A93 && ch <= 0x0AA8)
4886 || (ch >= 0x0AAA && ch <= 0x0AB0)
4887 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4888 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4889 || (ch >= 0x0ABD && ch <= 0x0AC5)
4890 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4891 || (ch >= 0x0ACB && ch <= 0x0ACD)
/* Oriya */
4895 || (ch >= 0x0B01 && ch <= 0x0B03)
4896 || (ch >= 0x0B05 && ch <= 0x0B0C)
4897 || (ch >= 0x0B0F && ch <= 0x0B10)
4898 || (ch >= 0x0B13 && ch <= 0x0B28)
4899 || (ch >= 0x0B2A && ch <= 0x0B30)
4900 || (ch >= 0x0B32 && ch <= 0x0B33)
4901 || (ch >= 0x0B36 && ch <= 0x0B39)
4902 || (ch >= 0x0B3E && ch <= 0x0B43)
4903 || (ch >= 0x0B47 && ch <= 0x0B48)
4904 || (ch >= 0x0B4B && ch <= 0x0B4D)
4905 || (ch >= 0x0B5C && ch <= 0x0B5D)
4906 || (ch >= 0x0B5F && ch <= 0x0B61)
/* Tamil */
4908 || (ch >= 0x0B82 && ch <= 0x0B83)
4909 || (ch >= 0x0B85 && ch <= 0x0B8A)
4910 || (ch >= 0x0B8E && ch <= 0x0B90)
4911 || (ch >= 0x0B92 && ch <= 0x0B95)
4912 || (ch >= 0x0B99 && ch <= 0x0B9A)
4914 || (ch >= 0x0B9E && ch <= 0x0B9F)
4915 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4916 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4917 || (ch >= 0x0BAE && ch <= 0x0BB5)
4918 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4919 || (ch >= 0x0BBE && ch <= 0x0BC2)
4920 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4921 || (ch >= 0x0BCA && ch <= 0x0BCD)
/* Telugu */
4923 || (ch >= 0x0C01 && ch <= 0x0C03)
4924 || (ch >= 0x0C05 && ch <= 0x0C0C)
4925 || (ch >= 0x0C0E && ch <= 0x0C10)
4926 || (ch >= 0x0C12 && ch <= 0x0C28)
4927 || (ch >= 0x0C2A && ch <= 0x0C33)
4928 || (ch >= 0x0C35 && ch <= 0x0C39)
4929 || (ch >= 0x0C3E && ch <= 0x0C44)
4930 || (ch >= 0x0C46 && ch <= 0x0C48)
4931 || (ch >= 0x0C4A && ch <= 0x0C4D)
4932 || (ch >= 0x0C60 && ch <= 0x0C61)
/* Kannada */
4934 || (ch >= 0x0C82 && ch <= 0x0C83)
4935 || (ch >= 0x0C85 && ch <= 0x0C8C)
4936 || (ch >= 0x0C8E && ch <= 0x0C90)
4937 || (ch >= 0x0C92 && ch <= 0x0CA8)
4938 || (ch >= 0x0CAA && ch <= 0x0CB3)
4939 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4940 || (ch >= 0x0CBE && ch <= 0x0CC4)
4941 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4942 || (ch >= 0x0CCA && ch <= 0x0CCD)
4944 || (ch >= 0x0CE0 && ch <= 0x0CE1)
/* Malayalam */
4946 || (ch >= 0x0D02 && ch <= 0x0D03)
4947 || (ch >= 0x0D05 && ch <= 0x0D0C)
4948 || (ch >= 0x0D0E && ch <= 0x0D10)
4949 || (ch >= 0x0D12 && ch <= 0x0D28)
4950 || (ch >= 0x0D2A && ch <= 0x0D39)
4951 || (ch >= 0x0D3E && ch <= 0x0D43)
4952 || (ch >= 0x0D46 && ch <= 0x0D48)
4953 || (ch >= 0x0D4A && ch <= 0x0D4D)
4954 || (ch >= 0x0D60 && ch <= 0x0D61)
/* Thai */
4956 || (ch >= 0x0E01 && ch <= 0x0E3A)
4957 || (ch >= 0x0E40 && ch <= 0x0E5B)
/* Lao */
4959 || (ch >= 0x0E81 && ch <= 0x0E82)
4961 || (ch >= 0x0E87 && ch <= 0x0E88)
4964 || (ch >= 0x0E94 && ch <= 0x0E97)
4965 || (ch >= 0x0E99 && ch <= 0x0E9F)
4966 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4969 || (ch >= 0x0EAA && ch <= 0x0EAB)
4970 || (ch >= 0x0EAD && ch <= 0x0EAE)
4971 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4972 || (ch >= 0x0EBB && ch <= 0x0EBD)
4973 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4975 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4976 || (ch >= 0x0EDC && ch <= 0x0EDD)
/* Tibetan */
4979 || (ch >= 0x0F18 && ch <= 0x0F19)
4983 || (ch >= 0x0F3E && ch <= 0x0F47)
4984 || (ch >= 0x0F49 && ch <= 0x0F69)
4985 || (ch >= 0x0F71 && ch <= 0x0F84)
4986 || (ch >= 0x0F86 && ch <= 0x0F8B)
4987 || (ch >= 0x0F90 && ch <= 0x0F95)
4989 || (ch >= 0x0F99 && ch <= 0x0FAD)
4990 || (ch >= 0x0FB1 && ch <= 0x0FB7)
/* Georgian */
4993 || (ch >= 0x10A0 && ch <= 0x10C5)
4994 || (ch >= 0x10D0 && ch <= 0x10F6)
/* Hiragana */
4996 || (ch >= 0x3041 && ch <= 0x3093)
4997 || (ch >= 0x309B && ch <= 0x309C)
/* Katakana */
4999 || (ch >= 0x30A1 && ch <= 0x30F6)
5000 || (ch >= 0x30FB && ch <= 0x30FC)
/* Bopomofo */
5002 || (ch >= 0x3105 && ch <= 0x312C)
5003 /* CJK Unified Ideographs */
5004 || (ch >= 0x4E00 && ch <= 0x9FA5)
/* Hangul */
5006 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* Digits of the scripts above */
5008 || (ch >= 0x0660 && ch <= 0x0669)
5009 || (ch >= 0x06F0 && ch <= 0x06F9)
5010 || (ch >= 0x0966 && ch <= 0x096F)
5011 || (ch >= 0x09E6 && ch <= 0x09EF)
5012 || (ch >= 0x0A66 && ch <= 0x0A6F)
5013 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5014 || (ch >= 0x0B66 && ch <= 0x0B6F)
5015 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5016 || (ch >= 0x0C66 && ch <= 0x0C6F)
5017 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5018 || (ch >= 0x0D66 && ch <= 0x0D6F)
5019 || (ch >= 0x0E50 && ch <= 0x0E59)
5020 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5021 || (ch >= 0x0F20 && ch <= 0x0F33)
5022 /* Special characters */
5025 || (ch >= 0x02B0 && ch <= 0x02B8)
5027 || (ch >= 0x02BD && ch <= 0x02C1)
5028 || (ch >= 0x02D0 && ch <= 0x02D1)
5029 || (ch >= 0x02E0 && ch <= 0x02E4)
5035 || (ch >= 0x203F && ch <= 0x2040)
5038 || (ch >= 0x210A && ch <= 0x2113)
5040 || (ch >= 0x2118 && ch <= 0x211D)
5044 || (ch >= 0x212A && ch <= 0x2131)
5045 || (ch >= 0x2133 && ch <= 0x2138)
5046 || (ch >= 0x2160 && ch <= 0x2182)
5047 || (ch >= 0x3005 && ch <= 0x3007)
5048 || (ch >= 0x3021 && ch <= 0x3029)
/* A code point matched by the list above may start an identifier.  */
5050 return UC_IDENTIFIER_START;
5051 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is a white-space character of the Java language:
   space, horizontal tab, form-feed, or one of the line terminator
   characters new-line and carriage return.  Unlike in C, vertical tab is
   not included.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5063 /* The Java Language Specification, 3rd edition, §3.8.
5064 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
5065 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5067 java_ident_category (unsigned int ch)
5069 /* FIXME: Check this against Sun's JDK implementation. */
5070 if (is_category_L (ch) /* = Character.isLetter(ch) */
5071 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5072 || is_category_Sc (ch) /* currency symbol */
5073 || is_category_Pc (ch) /* connector punctuation */
5075 return UC_IDENTIFIER_START;
5076 if (is_category_Nd (ch) /* digit */
5077 || is_category_Mc (ch) /* combining mark */
5078 || is_category_Mn (ch) /* non-spacing mark */
5080 return UC_IDENTIFIER_VALID;
5081 if ((ch >= 0x0000 && ch <= 0x0008)
5082 || (ch >= 0x000E && ch <= 0x001B)
5083 || (ch >= 0x007F && ch <= 0x009F)
5084 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5086 return UC_IDENTIFIER_IGNORABLE;
5087 return UC_IDENTIFIER_INVALID;
/* Parameters for a sparse 3-level table type named identsyntax_table with
   uint8_t elements; unset elements read as UC_IDENTIFIER_INVALID.  The
   code below uses identsyntax_table_init/_add/_finalize, so these macros
   are presumably consumed by a template header included right after them
   (that include is not visible in this extract).  Allocation is mapped
   to plain malloc/realloc.  */
5090 /* Construction of sparse 3-level tables. */
5091 #define TABLE identsyntax_table
5092 #define ELEMENT uint8_t
5093 #define DEFAULT UC_IDENTIFIER_INVALID
5094 #define xmalloc malloc
5095 #define xrealloc realloc
5098 /* Output an identifier syntax categorization in a three-level bitmap. */
/* Writes to FILENAME a C source fragment defining a constant struct called
   NAME.  For every code point below 0x110000 PREDICATE is called; results
   other than UC_IDENTIFIER_INVALID are added to an identsyntax_table, which
   is then finalized and dumped as the arrays level1, level2 and level3.
   The Unicode version is printed in the heading comment (its argument line
   is not visible in this listing; presumably VERSION).
   NOTE(review): several short lines (declarations, braces, error exits,
   the assignments' left-hand sides) are missing from this listing.  */
5100 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5104 struct identsyntax_table t;
5105 unsigned int level1_offset, level2_offset, level3_offset;
5107 stream = fopen (filename, "w");
5110 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5114 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5115 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5116 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5121 identsyntax_table_init (&t);
/* Collect the non-default classification of every code point.  */
5123 for (ch = 0; ch < 0x110000; ch++)
5125 int syntaxcode = predicate (ch);
5126 if (syntaxcode != UC_IDENTIFIER_INVALID)
5127 identsyntax_table_add (&t, ch, syntaxcode);
5130 identsyntax_table_finalize (&t);
5132 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: a header of five uint32_t
   words, then t.level1_size words, then (t.level2_size << t.q) words.  */
5134 5 * sizeof (uint32_t);
5136 5 * sizeof (uint32_t)
5137 + t.level1_size * sizeof (uint32_t);
5139 5 * sizeof (uint32_t)
5140 + t.level1_size * sizeof (uint32_t)
5141 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.
   NOTE(review): the values are uint32_t but are printed with %d.  */
5143 for (i = 0; i < 5; i++)
5144 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5145 ((uint32_t *) t.result)[i]);
5146 fprintf (stream, "static const\n");
5147 fprintf (stream, "struct\n");
5148 fprintf (stream, " {\n");
5149 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5150 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5151 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5152 (1 << t.p) * 2 / 16);
5153 fprintf (stream, " }\n");
5154 fprintf (stream, "%s =\n", name);
5155 fprintf (stream, "{\n");
/* level1: eight entries per output line; an entry is either -1 or the
   level2 position expressed in uint32_t units relative to level2_offset
   (the condition selecting between the two is not visible here).  */
5156 fprintf (stream, " {");
5157 if (t.level1_size > 8)
5158 fprintf (stream, "\n ");
5159 for (i = 0; i < t.level1_size; i++)
5162 if (i > 0 && (i % 8) == 0)
5163 fprintf (stream, "\n ");
5164 offset = ((uint32_t *) (t.result + level1_offset))[i];
5166 fprintf (stream, " %5d", -1);
5168 fprintf (stream, " %5zu",
5169 (offset - level2_offset) / sizeof (uint32_t));
5170 if (i+1 < t.level1_size)
5171 fprintf (stream, ",");
5173 if (t.level1_size > 8)
5174 fprintf (stream, "\n ");
5175 fprintf (stream, " },\n");
/* level2: same layout; offsets are relative to level3_offset and counted
   in uint8_t units.  */
5176 fprintf (stream, " {");
5177 if (t.level2_size << t.q > 8)
5178 fprintf (stream, "\n ");
5179 for (i = 0; i < t.level2_size << t.q; i++)
5182 if (i > 0 && (i % 8) == 0)
5183 fprintf (stream, "\n ");
5184 offset = ((uint32_t *) (t.result + level2_offset))[i];
5186 fprintf (stream, " %5d", -1);
5188 fprintf (stream, " %5zu",
5189 (offset - level3_offset) / sizeof (uint8_t));
5190 if (i+1 < t.level2_size << t.q)
5191 fprintf (stream, ",");
5193 if (t.level2_size << t.q > 8)
5194 fprintf (stream, "\n ");
5195 fprintf (stream, " },\n");
5196 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Eight consecutive one-byte entries are merged into one 16-bit word,
   entry k of the group occupying bits 2*k and 2*k+1.  */
5197 fprintf (stream, " {");
5198 if ((t.level3_size << t.p) * 2 / 16 > 8)
5199 fprintf (stream, "\n ");
5200 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5202 if (i > 0 && (i % 8) == 0)
5203 fprintf (stream, "\n ");
5204 fprintf (stream, " 0x%04x",
5205 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5206 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5207 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5208 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5209 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5210 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5211 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5212 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5213 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5214 fprintf (stream, ",");
5216 if ((t.level3_size << t.p) * 2 / 16 > 8)
5217 fprintf (stream, "\n ");
5218 fprintf (stream, " }\n");
5219 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported on stderr.  */
5221 if (ferror (stream) || fclose (stream))
5223 fprintf (stderr, "error writing to '%s'\n", filename);
5229 output_ident_properties (const char *version)
5231 #define PROPERTY(P) \
5232 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5233 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5234 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5235 PROPERTY(c_whitespace)
5236 PROPERTY(java_whitespace)
5239 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5240 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5243 /* ========================================================================= */
5245 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
5246 glibc/localedata/locales/i18n file, generated by
5247 glibc/localedata/gen-unicode-ctype.c. */
5249 /* Character mappings. */
5252 to_upper (unsigned int ch)
5254 if (unicode_attributes[ch].name != NULL
5255 && unicode_attributes[ch].upper != NONE)
5256 return unicode_attributes[ch].upper;
5262 to_lower (unsigned int ch)
5264 if (unicode_attributes[ch].name != NULL
5265 && unicode_attributes[ch].lower != NONE)
5266 return unicode_attributes[ch].lower;
5272 to_title (unsigned int ch)
5274 if (unicode_attributes[ch].name != NULL
5275 && unicode_attributes[ch].title != NONE)
5276 return unicode_attributes[ch].title;
5281 /* Character class properties. */
/* A character counts as uppercase exactly when lowercasing changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character counts as lowercase when uppercasing changes it.
   <U00DF> is lowercase too, but has no simple to_upper mapping, so it is
   accepted explicitly.  */
static bool
is_lower (unsigned int ch)
{
  if (ch == 0x00DF)
    return true;
  return to_upper (ch) != ch;
}
5298 is_alpha (unsigned int ch)
5300 return (unicode_attributes[ch].name != NULL
5301 && ((unicode_attributes[ch].category[0] == 'L'
5302 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5303 <U0E2F>, <U0E46> should belong to is_punct. */
5304 && (ch != 0x0E2F) && (ch != 0x0E46))
5305 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5306 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5308 || (ch >= 0x0E34 && ch <= 0x0E3A)
5309 || (ch >= 0x0E47 && ch <= 0x0E4E)
5310 /* Avoid warning for <U0345>. */
5312 /* Avoid warnings for <U2160>..<U217F>. */
5313 || (unicode_attributes[ch].category[0] == 'N'
5314 && unicode_attributes[ch].category[1] == 'l')
5315 /* Avoid warnings for <U24B6>..<U24E9>. */
5316 || (unicode_attributes[ch].category[0] == 'S'
5317 && unicode_attributes[ch].category[1] == 'o'
5318 && strstr (unicode_attributes[ch].name, " LETTER ")
5320 /* Consider all the non-ASCII digits as alphabetic.
5321 ISO C 99 forbids us to have them in category "digit",
5322 but we want iswalnum to return true on them. */
5323 || (unicode_attributes[ch].category[0] == 'N'
5324 && unicode_attributes[ch].category[1] == 'd'
5325 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH belongs to the "digit" class, i.e. is one of the ten
   ASCII decimal digits.  Non-ASCII decimal digits are deliberately left to
   is_alpha.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  /* Disabled alternative: every character of general category Nd.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* The "outdigit" class consists of the ASCII digits U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
/* The "alnum" class is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
5363 is_blank (unsigned int ch)
5365 return (ch == 0x0009 /* '\t' */
5366 /* Category Zs without mention of "<noBreak>" */
5367 || (unicode_attributes[ch].name != NULL
5368 && unicode_attributes[ch].category[0] == 'Z'
5369 && unicode_attributes[ch].category[1] == 's'
5370 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5374 is_space (unsigned int ch)
5376 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5377 should treat it like a punctuation character, not like a space. */
5378 return (ch == 0x0020 /* ' ' */
5379 || ch == 0x000C /* '\f' */
5380 || ch == 0x000A /* '\n' */
5381 || ch == 0x000D /* '\r' */
5382 || ch == 0x0009 /* '\t' */
5383 || ch == 0x000B /* '\v' */
5384 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5385 || (unicode_attributes[ch].name != NULL
5386 && unicode_attributes[ch].category[0] == 'Z'
5387 && (unicode_attributes[ch].category[1] == 'l'
5388 || unicode_attributes[ch].category[1] == 'p'
5389 || (unicode_attributes[ch].category[1] == 's'
5390 && !strstr (unicode_attributes[ch].decomposition,
5395 is_cntrl (unsigned int ch)
5397 return (unicode_attributes[ch].name != NULL
5398 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5399 /* Categories Zl and Zp */
5400 || (unicode_attributes[ch].category[0] == 'Z'
5401 && (unicode_attributes[ch].category[1] == 'l'
5402 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH belongs to the "xdigit" class, i.e. is one of the 22
   ASCII hexadecimal digit characters.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  /* Disabled alternative: follows whatever is_digit accepts.  */
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
5428 is_graph (unsigned int ch)
5430 return (unicode_attributes[ch].name != NULL
5431 && strcmp (unicode_attributes[ch].name, "<control>")
5436 is_print (unsigned int ch)
5438 return (unicode_attributes[ch].name != NULL
5439 && strcmp (unicode_attributes[ch].name, "<control>")
5440 /* Categories Zl and Zp */
5441 && !(unicode_attributes[ch].name != NULL
5442 && unicode_attributes[ch].category[0] == 'Z'
5443 && (unicode_attributes[ch].category[1] == 'l'
5444 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH belongs to the "punct" class.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  /* Disabled alternative: general category P only.  It would leave out
     characters such as <U0E2F> and <U0E46>, which is_alpha excludes on the
     grounds that they belong to is_punct.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
5460 /* Output all properties. */
/* PROPERTY(P) expands to three calls for the predicate is_P: a debug
   listing in unictype/ctype_P.txt, a test source in
   ../tests/unictype/test-ctype_P.c and a header unictype/ctype_P.h that
   defines u_is_P; VERSION is forwarded to output_predicate.
   NOTE(review): the PROPERTY (...) invocations and the end of the function
   are not visible in this listing.  */
5462 output_old_ctype (const char *version)
5464 #define PROPERTY(P) \
5465 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5466 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5467 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5486 is_combining (unsigned int ch)
5488 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5489 file. In 3.0.1 it was identical to the union of the general categories
5490 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5491 PropList.txt file, so we take the latter definition. */
5492 return (unicode_attributes[ch].name != NULL
5493 && unicode_attributes[ch].category[0] == 'M'
5494 && (unicode_attributes[ch].category[1] == 'n'
5495 || unicode_attributes[ch].category[1] == 'c'
5496 || unicode_attributes[ch].category[1] == 'e'));
5500 is_combining_level3 (unsigned int ch)
5502 return is_combining (ch)
5503 && !(unicode_attributes[ch].combining[0] != '\0'
5504 && unicode_attributes[ch].combining[0] != '0'
5505 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<UXXXX>" with four
   hexadecimal digits for values below 0x10000, "<UXXXXXXXX>" with eight
   digits otherwise.
   The result is stored in a static buffer that the next call overwrites, so
   it must be used or copied before ucs_symbol is called again.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest result: '<', 'U', eight hex digits, '>' = 11 characters.  */
  static char buf[11+1];

  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for an interval of Unicode characters:
   the symbol of LOW, the separator "..", and the symbol of HIGH.
   The result is stored in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each plus the 2-character
     separator.  */
  static char buf[24+1];

  /* ucs_symbol reuses one static buffer, so each result is copied out
     before the next call is made.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
5530 /* Output a character class (= property) table. */
/* Writes to STREAM the line(s) for class CLASSNAME: FUNC is evaluated once
   for every code point below 0x110000, and each maximal run of code points
   for which it is true is printed as a single symbol or as a symbol range.
   Items are separated by ';', and a line is continued with "/" when the
   next item would exceed max_column.
   NOTE(review): table[] is a 0x110000-byte automatic array.
   NOTE(review): declarations, braces and the run-scanning statements
   between the visible lines are missing from this listing.  */
5533 output_charclass (FILE *stream, const char *classname,
5534 bool (*func) (unsigned int))
5536 char table[0x110000];
5538 bool need_semicolon;
5539 const int max_column = 75;
/* Cache the predicate result for every code point.  */
5542 for (i = 0; i < 0x110000; i++)
5543 table[i] = (int) func (i);
5545 fprintf (stream, "%s ", classname);
5546 need_semicolon = false;
/* The loop has no increment clause; i is advanced inside the body.  */
5548 for (i = 0; i < 0x110000; )
5554 unsigned int low, high;
5560 while (i < 0x110000 && table[i]);
5564 strcpy (buf, ucs_symbol (low));
5566 strcpy (buf, ucs_symbol_range (low, high));
5570 fprintf (stream, ";");
5574 if (column + strlen (buf) > max_column)
5576 fprintf (stream, "/\n ");
5580 fprintf (stream, "%s", buf);
5581 column += strlen (buf);
5582 need_semicolon = true;
5585 fprintf (stream, "\n");
5588 /* Output a character mapping table. */
/* Writes to STREAM the line(s) for mapping MAPNAME.  A code point below
   0x110000 is marked in table[] when FUNC maps it to a different value;
   the item text built in buf contains the symbol of the code point and the
   symbol of FUNC's result.  Items are separated by ';', and a line is
   continued with "/" when the next item would exceed max_column.
   NOTE(review): table[] is a 0x110000-byte automatic array.
   NOTE(review): declarations, braces and the statements that start and
   punctuate each item are missing from this listing.  */
5591 output_charmap (FILE *stream, const char *mapname,
5592 unsigned int (*func) (unsigned int))
5594 char table[0x110000];
5596 bool need_semicolon;
5597 const int max_column = 75;
5600 for (i = 0; i < 0x110000; i++)
5601 table[i] = (func (i) != i);
5603 fprintf (stream, "%s ", mapname);
5604 need_semicolon = false;
5606 for (i = 0; i < 0x110000; i++)
5612 strcat (buf, ucs_symbol (i));
5614 strcat (buf, ucs_symbol (func (i)));
5619 fprintf (stream, ";");
5623 if (column + strlen (buf) > max_column)
5625 fprintf (stream, "/\n ");
5629 fprintf (stream, "%s", buf);
5630 column += strlen (buf);
5631 need_semicolon = true;
5633 fprintf (stream, "\n");
5636 /* Output the width table. */
/* Takes the output STREAM; called by output_tables just before
   "END LC_CTYPE" is written.
   NOTE(review): no statement of this function's body is visible in this
   listing -- confirm whether it writes anything at all.  */
5639 output_widthmap (FILE *stream)
5643 /* Output the tables to the given file. */
/* Writes to FILENAME a locale source consisting of an LC_IDENTIFICATION
   section (title, revision VERSION, current date, ...) and an LC_CTYPE
   section with every character class and character mapping.  Before the
   LC_CTYPE section is written, all code points are checked against the
   class restrictions quoted in the comments below; violations are reported
   on stderr and do not stop the output.
   NOTE(review): declarations, braces, error exits and the
   "fprintf (stderr," openers of several diagnostics are missing from this
   listing.  */
5646 output_tables (const char *filename, const char *version)
5651 stream = fopen (filename, "w");
5654 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5658 fprintf (stream, "escape_char /\n");
5659 fprintf (stream, "comment_char %%\n");
5660 fprintf (stream, "\n");
5661 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
5663 fprintf (stream, "\n");
5665 fprintf (stream, "LC_IDENTIFICATION\n");
5666 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5667 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5668 fprintf (stream, "address \"\"\n");
5669 fprintf (stream, "contact \"\"\n");
5670 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5671 fprintf (stream, "tel \"\"\n");
5672 fprintf (stream, "fax \"\"\n");
5673 fprintf (stream, "language \"\"\n");
5674 fprintf (stream, "territory \"Earth\"\n");
5675 fprintf (stream, "revision \"%s\"\n", version);
/* The date is the current time, formatted in UTC as YYYY-MM-DD.  */
5680 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5681 fprintf (stream, "date \"%s\"\n", date);
5683 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5684 fprintf (stream, "END LC_IDENTIFICATION\n");
5685 fprintf (stream, "\n");
5687 /* Verifications. */
5688 for (ch = 0; ch < 0x110000; ch++)
5690 /* toupper restriction: "Only characters specified for the keywords
5691 lower and upper shall be specified." */
5692 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5694 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5695 ucs_symbol (ch), ch, to_upper (ch));
5697 /* tolower restriction: "Only characters specified for the keywords
5698 lower and upper shall be specified." */
5699 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5701 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5702 ucs_symbol (ch), ch, to_lower (ch));
5704 /* alpha restriction: "Characters classified as either upper or lower
5705 shall automatically belong to this class." */
5706 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5707 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5709 /* alpha restriction: "No character specified for the keywords cntrl,
5710 digit, punct or space shall be specified." */
5711 if (is_alpha (ch) && is_cntrl (ch))
5712 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5713 if (is_alpha (ch) && is_digit (ch))
5714 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5715 if (is_alpha (ch) && is_punct (ch))
5716 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5717 if (is_alpha (ch) && is_space (ch))
5718 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5720 /* space restriction: "No character specified for the keywords upper,
5721 lower, alpha, digit, graph or xdigit shall be specified."
5722 upper, lower, alpha already checked above. */
5723 if (is_space (ch) && is_digit (ch))
5724 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5725 if (is_space (ch) && is_graph (ch))
5726 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5727 if (is_space (ch) && is_xdigit (ch))
5728 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5730 /* cntrl restriction: "No character specified for the keywords upper,
5731 lower, alpha, digit, punct, graph, print or xdigit shall be
5732 specified." upper, lower, alpha already checked above. */
5733 if (is_cntrl (ch) && is_digit (ch))
5734 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5735 if (is_cntrl (ch) && is_punct (ch))
5736 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5737 if (is_cntrl (ch) && is_graph (ch))
5738 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5739 if (is_cntrl (ch) && is_print (ch))
5740 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5741 if (is_cntrl (ch) && is_xdigit (ch))
5742 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5744 /* punct restriction: "No character specified for the keywords upper,
5745 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5746 be specified." upper, lower, alpha, cntrl already checked above. */
5747 if (is_punct (ch) && is_digit (ch))
5748 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5749 if (is_punct (ch) && is_xdigit (ch))
5750 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5751 if (is_punct (ch) && (ch == 0x0020))
5752 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5754 /* graph restriction: "No character specified for the keyword cntrl
5755 shall be specified." Already checked above. */
5757 /* print restriction: "No character specified for the keyword cntrl
5758 shall be specified." Already checked above. */
5760 /* graph - print relation: differ only in the <space> character.
5761 How is this possible if there are more than one space character?!
5762 I think susv2/xbd/locale.html should speak of "space characters",
5763 not "space character". */
/* Note the asymmetry: the first test accepts any is_space character, the
   second only U+0020.  */
5764 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5766 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5767 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5769 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section proper: classes first, then mappings.  */
5772 fprintf (stream, "LC_CTYPE\n");
5773 output_charclass (stream, "upper", is_upper);
5774 output_charclass (stream, "lower", is_lower);
5775 output_charclass (stream, "alpha", is_alpha);
5776 output_charclass (stream, "digit", is_digit);
5777 output_charclass (stream, "outdigit", is_outdigit);
5778 output_charclass (stream, "blank", is_blank);
5779 output_charclass (stream, "space", is_space);
5780 output_charclass (stream, "cntrl", is_cntrl);
5781 output_charclass (stream, "punct", is_punct);
5782 output_charclass (stream, "xdigit", is_xdigit);
5783 output_charclass (stream, "graph", is_graph);
5784 output_charclass (stream, "print", is_print);
5785 output_charclass (stream, "class \"combining\";", is_combining);
5786 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5787 output_charmap (stream, "toupper", to_upper);
5788 output_charmap (stream, "tolower", to_lower);
5789 output_charmap (stream, "map \"totitle\";", to_title);
5790 output_widthmap (stream);
5791 fprintf (stream, "END LC_CTYPE\n");
/* A stream error or a failing fclose is reported on stderr.  */
5793 if (ferror (stream) || fclose (stream))
5795 fprintf (stderr, "error writing to '%s'\n", filename);
5802 /* ========================================================================= */
5804 /* The width property from the EastAsianWidth.txt file.
5805 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by code point.  fill_width presets every assigned code point to
   "N" before the entries read from the file are stored.  */
5806 const char * unicode_width[0x110000];
5808 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
   file WIDTH_FILENAME.  Each data line is split into three fields: the code
   point or "first..last" range (up to ';'), the width value (up to ' ') and
   the rest of the line.  The width value is duplicated into every entry of
   unicode_width[] that the first field designates.
   NOTE(review): the value parsed from the first field indexes
   unicode_width[] without a visible check against 0x110000, and the strdup
   results are not checked for NULL.
   NOTE(review): declarations, braces, the read loop header and the error
   exits are missing from this listing.  */
5811 fill_width (const char *width_filename)
5815 char field0[FIELDLEN];
5816 char field1[FIELDLEN];
5817 char field2[FIELDLEN];
/* Default: assigned code points (those with a name) get "N", all others
   NULL.  */
5820 for (i = 0; i < 0x110000; i++)
5821 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5823 stream = fopen (width_filename, "r");
5826 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Consume the remainder of the current line.  */
5841 do c = getc (stream); while (c != EOF && c != '\n');
5845 n = getfield (stream, field0, ';');
5846 n += getfield (stream, field1, ' ');
5847 n += getfield (stream, field2, '\n');
5852 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5855 i = strtoul (field0, NULL, 16);
5856 if (strstr (field0, "..") != NULL)
5858 /* Deal with a range. */
5859 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5861 unicode_width[i] = strdup (field1);
5865 /* Single character line. */
5866 unicode_width[i] = strdup (field1);
5870 if (ferror (stream) || fclose (stream))
5872 fprintf (stderr, "error reading from '%s'\n", width_filename);
5877 /* ========================================================================= */
5879 /* Non-spacing attribute and width. */
5881 /* The non-spacing attribute table consists of:
5882 - Non-spacing characters; generated from PropList.txt or
5883 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
5884 - Format control characters; generated from
5885 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
5886 - Zero width characters; generated from
5887 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
5891 is_nonspacing (unsigned int ch)
5893 return (unicode_attributes[ch].name != NULL
5894 && (get_bidi_category (ch) == UC_BIDI_NSM
5895 || is_category_Cc (ch) || is_category_Cf (ch)
5896 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes to FILENAME two C arrays describing is_nonspacing:
   nonspacing_table_data, a bitmap with one bit per code point for every
   0x200-code-point block that has an index, and nonspacing_table_ind,
   which gives each block's index into the data (ind[] for the block).
   Blocks are numbered in increasing order with next_ind; the block at
   0xe0000 is skipped because it is handled by code.
   NOTE(review): declarations, braces and several assignments (among them
   the ones maintaining i_max, bits and the trivial-block index) are
   missing from this listing.  */
5900 output_nonspacing_property (const char *filename)
5903 int ind[0x110000 / 0x200];
5908 stream = fopen (filename, "w");
5911 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* First pass: find the blocks that contain a non-spacing character.  */
5916 for (i = 0; i < 0x110000 / 0x200; i++)
5918 bool nontrivial = false;
5921 if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
5922 for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
5923 if (is_nonspacing (ch))
5929 ind[i] = next_ind++;
5934 fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
/* Second pass: dump 64 bytes (8 rows of 8 bytes, 8 code points per byte)
   for every block with a non-negative index.  */
5937 for (i = 0; i < 0x110000 / 0x200; i++)
5939 bool nontrivial = (ind[i] >= 0);
5945 fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
5946 for (j = 0; j < 8; j++)
5950 fprintf (stream, " ");
5951 for (k = 0; k < 8; k++)
5954 unsigned char bits = 0;
5956 for (l = 0; l < 8; l++)
5958 unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
5960 if (is_nonspacing (ch))
/* The very last byte of the array is followed by a space instead of a
   comma.  */
5963 fprintf (stream, " 0x%02x%c", bits,
5964 ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
5966 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5967 i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
5972 fprintf (stream, "};\n");
/* Round the number of index entries up to a multiple of 8 so that the
   index array is printed in full rows.  */
5974 i_max = ((i_max + 8 - 1) / 8) * 8;
5975 fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
5980 for (j = 0; j < i_max / 8; j++)
5984 fprintf (stream, " ");
5985 for (k = 0; k < 8; k++)
5988 fprintf (stream, " %2d%c", ind[i],
5989 j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
5991 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5992 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
5995 fprintf (stream, "};\n");
5997 if (ferror (stream) || fclose (stream))
5999 fprintf (stderr, "error writing to '%s'\n", filename);
6004 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
/* The tests are made in this order: unassigned code points (Private Use
   and the CJK ideograph ranges are singled out), Cc control characters
   below U+00A0, non-spacing characters, East Asian width "W" or "F", East
   Asian width "H", and finally the range U+00A1..U+FFFF.
   NOTE(review): the return statement belonging to each test is not visible
   in this listing, so which of the five values each test yields has to be
   confirmed against the full source.  */
6006 symbolic_width (unsigned int ch)
6008 /* Test for unassigned character. */
6009 if (is_property_unassigned_code_value (ch))
6011 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6012 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6014 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6015 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6016 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6017 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6018 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6024 /* Test for non-spacing or control character. */
6025 if (is_category_Cc (ch) && ch < 0x00A0)
6027 if (is_nonspacing (ch))
6029 /* Test for double-width character. */
6030 if (unicode_width[ch] != NULL
6031 && (strcmp (unicode_width[ch], "W") == 0
6032 || strcmp (unicode_width[ch], "F") == 0))
6034 /* Test for half-width character. */
6035 if (unicode_width[ch] != NULL
6036 && strcmp (unicode_width[ch], "H") == 0)
6039 /* In ancient CJK encodings, Cyrillic and most other characters are
6040 double-width as well. */
6041 if (ch >= 0x00A1 && ch < 0x10000)
/* Writes to FILENAME one line per maximal interval of consecutive code
   points that share the same non-zero symbolic_width value: either
   "XXXX<TAB><TAB>c" for a single code point or "XXXX..YYYY<TAB>c" for a
   range, where c is the width character.  Code points whose width is 0 are
   skipped and do not end an interval.
   NOTE(review): interval_value is compared at the first iteration; its
   initialisation is not visible in this listing -- confirm that it is set
   to 0 before the loop.  */
6047 output_width_property_test (const char *filename)
6050 unsigned int interval_start, interval_end, ch;
6051 char interval_value;
6053 stream = fopen (filename, "w");
6056 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6061 interval_start = interval_end = 0; /* avoid GCC warning */
6062 for (ch = 0; ch < 0x110000; ch++)
6064 char value = symbolic_width (ch);
6065 if (value != 0) /* skip Cc control characters and unassigned characters */
6067 if (value == interval_value)
6068 /* Extend the interval. */
6072 /* Terminate the interval. */
6073 if (interval_value != 0)
6075 if (interval_end == interval_start)
6076 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6078 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
6080 /* Start a new interval. */
6081 interval_start = interval_end = ch;
6082 interval_value = value;
6086 /* Terminate the last interval. */
6087 if (interval_value != 0)
6089 if (interval_end == interval_start)
6090 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6092 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
6095 if (ferror (stream) || fclose (stream))
6097 fprintf (stderr, "error writing to '%s'\n", filename);
6102 /* ========================================================================= */
6104 /* Line breaking classification.
6105 Updated for Unicode TR #14 revision 26. */
/* Enumerators for the line breaking classes.  get_lbp uses each value as a
   bit position, (int64_t) 1 << LBP_xx.  The values are assigned explicitly
   and are not in declaration order: 0..24 and 25..32 form two groups, see
   the comment on the next line.
   NOTE(review): the enum keyword and the surrounding braces are not
   visible in this listing.  */
6109 /* Values >= 25 are resolved at run time. */
6110 LBP_BK = 25, /* mandatory break */
6111 /*LBP_CR, carriage return - not used here because it's a DOSism */
6112 /*LBP_LF, line feed - not used here because it's a DOSism */
6113 LBP_CM = 26, /* attached characters and combining marks */
6114 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
6115 /*LBP_SG, surrogates - not used here because they are not characters */
6116 LBP_WJ = 0, /* word joiner */
6117 LBP_ZW = 27, /* zero width space */
6118 LBP_GL = 1, /* non-breaking (glue) */
6119 LBP_SP = 28, /* space */
6120 LBP_B2 = 2, /* break opportunity before and after */
6121 LBP_BA = 3, /* break opportunity after */
6122 LBP_BB = 4, /* break opportunity before */
6123 LBP_HY = 5, /* hyphen */
6124 LBP_CB = 29, /* contingent break opportunity */
6125 LBP_CL = 6, /* closing punctuation */
6126 LBP_CP = 7, /* closing parenthesis */
6127 LBP_EX = 8, /* exclamation/interrogation */
6128 LBP_IN = 9, /* inseparable */
6129 LBP_NS = 10, /* non starter */
6130 LBP_OP = 11, /* opening punctuation */
6131 LBP_QU = 12, /* ambiguous quotation */
6132 LBP_IS = 13, /* infix separator (numeric) */
6133 LBP_NU = 14, /* numeric */
6134 LBP_PO = 15, /* postfix (numeric) */
6135 LBP_PR = 16, /* prefix (numeric) */
6136 LBP_SY = 17, /* symbols allowing breaks */
6137 LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
6138 LBP_AL = 18, /* ordinary alphabetic and symbol characters */
6139 LBP_H2 = 19, /* Hangul LV syllable */
6140 LBP_H3 = 20, /* Hangul LVT syllable */
6141 LBP_ID = 21, /* ideographic */
6142 LBP_JL = 22, /* Hangul L Jamo */
6143 LBP_JV = 23, /* Hangul V Jamo */
6144 LBP_JT = 24, /* Hangul T Jamo */
6145 LBP_SA = 31, /* complex context (South East Asian) */
6146 LBP_XX = 32 /* unknown */
6149 /* Returns the line breaking classification for ch, as a bit mask. */
6151 get_lbp (unsigned int ch)
6155 if (unicode_attributes[ch].name != NULL)
6157 /* mandatory break */
6158 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6159 || ch == 0x000C /* form feed */
6160 || ch == 0x000B /* line tabulation */
6161 || ch == 0x2028 /* LINE SEPARATOR */
6162 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6163 attr |= (int64_t) 1 << LBP_BK;
6165 if (ch == 0x2060 /* WORD JOINER */
6166 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6167 attr |= (int64_t) 1 << LBP_WJ;
6169 /* zero width space */
6170 if (ch == 0x200B /* ZERO WIDTH SPACE */)
6171 attr |= (int64_t) 1 << LBP_ZW;
6173 /* non-breaking (glue) */
6174 if (ch == 0x00A0 /* NO-BREAK SPACE */
6175 || ch == 0x202F /* NARROW NO-BREAK SPACE */
6176 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6177 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6178 || ch == 0x2007 /* FIGURE SPACE */
6179 || ch == 0x2011 /* NON-BREAKING HYPHEN */
6180 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6181 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6182 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6183 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6184 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6185 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6186 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6187 attr |= (int64_t) 1 << LBP_GL;
6190 if (ch == 0x0020 /* SPACE */)
6191 attr |= (int64_t) 1 << LBP_SP;
6193 /* break opportunity before and after */
6194 if (ch == 0x2014 /* EM DASH */)
6195 attr |= (int64_t) 1 << LBP_B2;
6197 /* break opportunity after */
6198 if (/* Breaking Spaces */
6199 ch == 0x1680 /* OGHAM SPACE MARK */
6200 || ch == 0x2000 /* EN QUAD */
6201 || ch == 0x2001 /* EM QUAD */
6202 || ch == 0x2002 /* EN SPACE */
6203 || ch == 0x2003 /* EM SPACE */
6204 || ch == 0x2004 /* THREE-PER-EM SPACE */
6205 || ch == 0x2005 /* FOUR-PER-EM SPACE */
6206 || ch == 0x2006 /* SIX-PER-EM SPACE */
6207 || ch == 0x2008 /* PUNCTUATION SPACE */
6208 || ch == 0x2009 /* THIN SPACE */
6209 || ch == 0x200A /* HAIR SPACE */
6210 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6212 || ch == 0x0009 /* tab */
6213 /* Conditional Hyphens */
6214 || ch == 0x00AD /* SOFT HYPHEN */
6215 /* Breaking Hyphens */
6216 || ch == 0x058A /* ARMENIAN HYPHEN */
6217 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6218 || ch == 0x2010 /* HYPHEN */
6219 || ch == 0x2012 /* FIGURE DASH */
6220 || ch == 0x2013 /* EN DASH */
6221 /* Visible Word Dividers */
6222 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6223 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6224 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6225 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6226 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6227 || ch == 0x2027 /* HYPHENATION POINT */
6228 || ch == 0x007C /* VERTICAL LINE */
6229 /* Historic Word Separators */
6230 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6231 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6232 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6233 || ch == 0x2056 /* THREE DOT PUNCTUATION */
6234 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6235 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6236 || ch == 0x205A /* TWO DOT PUNCTUATION */
6237 || ch == 0x205B /* FOUR DOT MARK */
6238 || ch == 0x205D /* TRICOLON */
6239 || ch == 0x205E /* VERTICAL FOUR DOTS */
6240 || ch == 0x2E19 /* PALM BRANCH */
6241 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6242 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6243 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6244 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
6245 || ch == 0x2E30 /* RING POINT */
6246 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6247 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6248 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6249 || ch == 0x10102 /* AEGEAN CHECK MARK */
6250 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6251 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6252 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6253 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6255 || ch == 0x0964 /* DEVANAGARI DANDA */
6256 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6257 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6258 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6259 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6260 || ch == 0x104B /* MYANMAR SIGN SECTION */
6261 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6262 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6263 || ch == 0x17D4 /* KHMER SIGN KHAN */
6264 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6265 || ch == 0x1B5E /* BALINESE CARIK SIKI */
6266 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6267 || ch == 0xA8CE /* SAURASHTRA DANDA */
6268 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6269 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6270 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6271 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6272 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6273 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6275 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6276 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6277 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6278 || ch == 0x0FBE /* TIBETAN KU RU KHA */
6279 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6280 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6281 /* Other Terminating Punctuation */
6282 || ch == 0x1804 /* MONGOLIAN COLON */
6283 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6284 || ch == 0x1B5A /* BALINESE PANTI */
6285 || ch == 0x1B5B /* BALINESE PAMADA */
6286 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6287 || ch == 0x1B60 /* BALINESE PAMENENG */
6288 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6289 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6290 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6291 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6292 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6293 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6294 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6295 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6296 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6297 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6298 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6299 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6300 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6301 || ch == 0xA60D /* VAI COMMA */
6302 || ch == 0xA60F /* VAI QUESTION MARK */
6303 || ch == 0xA92E /* KAYAH LI SIGN CWI */
6304 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6305 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6306 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6307 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6308 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6309 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6310 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6311 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6312 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6313 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6314 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6315 || ch == 0xA6F3 /* BAMUM FULL STOP */
6316 || ch == 0xA6F4 /* BAMUM COLON */
6317 || ch == 0xA6F5 /* BAMUM COMMA */
6318 || ch == 0xA6F6 /* BAMUM SEMICOLON */
6319 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6320 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6321 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6322 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6323 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6324 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6325 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6326 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6327 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6328 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6329 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6330 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6331 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6332 || ch == 0x11047 /* BRAHMI DANDA */
6333 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6334 || ch == 0x110BE /* KAITHI SECTION MARK */
6335 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6336 || ch == 0x110C0 /* KAITHI DANDA */
6337 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6338 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6339 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6340 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
6341 attr |= (int64_t) 1 << LBP_BA;
6343 /* break opportunity before */
6344 if (ch == 0x00B4 /* ACUTE ACCENT */
6345 || ch == 0x1FFD /* GREEK OXIA */
6346 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6347 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6348 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6349 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6350 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6351 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6352 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6353 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6354 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6355 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6356 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6357 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6358 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6359 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6360 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6361 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6362 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
6363 attr |= (int64_t) 1 << LBP_BB;
6366 if (ch == 0x002D /* HYPHEN-MINUS */)
6367 attr |= (int64_t) 1 << LBP_HY;
6369 /* contingent break opportunity */
6370 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6371 attr |= (int64_t) 1 << LBP_CB;
6373 /* closing parenthesis */
6374 if (ch == 0x0029 /* RIGHT PARENTHESIS */
6375 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6376 attr |= (int64_t) 1 << LBP_CP;
6378 /* closing punctuation */
6379 if ((unicode_attributes[ch].category[0] == 'P'
6380 && unicode_attributes[ch].category[1] == 'e'
6381 && !(attr & ((int64_t) 1 << LBP_CP)))
6382 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6383 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6384 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6385 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6386 || ch == 0xFE50 /* SMALL COMMA */
6387 || ch == 0xFE52 /* SMALL FULL STOP */
6388 || ch == 0xFF0C /* FULLWIDTH COMMA */
6389 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6390 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6391 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6392 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6393 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6394 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6395 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6396 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6397 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6398 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6399 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6400 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
6401 attr |= (int64_t) 1 << LBP_CL;
6403 /* exclamation/interrogation */
6404 if (ch == 0x0021 /* EXCLAMATION MARK */
6405 || ch == 0x003F /* QUESTION MARK */
6406 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6407 || ch == 0x061B /* ARABIC SEMICOLON */
6408 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6409 || ch == 0x061F /* ARABIC QUESTION MARK */
6410 || ch == 0x06D4 /* ARABIC FULL STOP */
6411 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6412 || ch == 0x0F0D /* TIBETAN MARK SHAD */
6413 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6414 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6415 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6416 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6417 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6418 || ch == 0x1802 /* MONGOLIAN COMMA */
6419 || ch == 0x1803 /* MONGOLIAN FULL STOP */
6420 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6421 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6422 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6423 || ch == 0x1945 /* LIMBU QUESTION MARK */
6424 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6425 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6426 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6427 || ch == 0x2CFE /* COPTIC FULL STOP */
6428 || ch == 0x2E2E /* REVERSED QUESTION MARK */
6429 || ch == 0xA60E /* VAI FULL STOP */
6430 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6431 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6432 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6433 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6434 || ch == 0xFE56 /* SMALL QUESTION MARK */
6435 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6436 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6437 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
6438 attr |= (int64_t) 1 << LBP_EX;
6441 if (ch == 0x2024 /* ONE DOT LEADER */
6442 || ch == 0x2025 /* TWO DOT LEADER */
6443 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6444 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
6445 attr |= (int64_t) 1 << LBP_IN;
6448 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6449 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6450 || ch == 0x203D /* INTERROBANG */
6451 || ch == 0x2047 /* DOUBLE QUESTION MARK */
6452 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6453 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6454 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6455 || ch == 0x301C /* WAVE DASH */
6456 || ch == 0x303C /* MASU MARK */
6457 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6458 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6459 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6460 || ch == 0x309D /* HIRAGANA ITERATION MARK */
6461 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6462 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6463 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6464 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6465 || ch == 0x30FD /* KATAKANA ITERATION MARK */
6466 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6467 || ch == 0xA015 /* YI SYLLABLE WU */
6468 || ch == 0xFE54 /* SMALL SEMICOLON */
6469 || ch == 0xFE55 /* SMALL COLON */
6470 || ch == 0xFF1A /* FULLWIDTH COLON */
6471 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6472 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6473 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6474 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6475 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6476 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6477 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6478 attr |= (int64_t) 1 << LBP_NS;
6480 /* opening punctuation */
6481 if ((unicode_attributes[ch].category[0] == 'P'
6482 && unicode_attributes[ch].category[1] == 's')
6483 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6484 || ch == 0x00BF /* INVERTED QUESTION MARK */
6485 || ch == 0x2E18 /* INVERTED INTERROBANG */
6486 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6487 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6488 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6489 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6490 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6491 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6492 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
6493 attr |= (int64_t) 1 << LBP_OP;
6495 /* ambiguous quotation */
6496 if ((unicode_attributes[ch].category[0] == 'P'
6497 && (unicode_attributes[ch].category[1] == 'f'
6498 || unicode_attributes[ch].category[1] == 'i'))
6499 || ch == 0x0022 /* QUOTATION MARK */
6500 || ch == 0x0027 /* APOSTROPHE */
6501 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6502 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6503 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6504 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6505 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6506 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6507 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6508 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6509 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6510 || ch == 0x2E0B /* RAISED SQUARE */)
6511 attr |= (int64_t) 1 << LBP_QU;
6513 /* infix separator (numeric) */
6514 if (ch == 0x002C /* COMMA */
6515 || ch == 0x002E /* FULL STOP */
6516 || ch == 0x003A /* COLON */
6517 || ch == 0x003B /* SEMICOLON */
6518 || ch == 0x037E /* GREEK QUESTION MARK */
6519 || ch == 0x0589 /* ARMENIAN FULL STOP */
6520 || ch == 0x060C /* ARABIC COMMA */
6521 || ch == 0x060D /* ARABIC DATE SEPARATOR */
6522 || ch == 0x07F8 /* NKO COMMA */
6523 || ch == 0x2044 /* FRACTION SLASH */
6524 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6525 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6526 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6527 attr |= (int64_t) 1 << LBP_IS;
6530 if ((unicode_attributes[ch].category[0] == 'N'
6531 && unicode_attributes[ch].category[1] == 'd'
6532 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
6533 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
6534 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6535 attr |= (int64_t) 1 << LBP_NU;
6537 /* postfix (numeric) */
6538 if (ch == 0x0025 /* PERCENT SIGN */
6539 || ch == 0x00A2 /* CENT SIGN */
6540 || ch == 0x00B0 /* DEGREE SIGN */
6541 || ch == 0x060B /* AFGHANI SIGN */
6542 || ch == 0x066A /* ARABIC PERCENT SIGN */
6543 || ch == 0x2030 /* PER MILLE SIGN */
6544 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
6545 || ch == 0x2032 /* PRIME */
6546 || ch == 0x2033 /* DOUBLE PRIME */
6547 || ch == 0x2034 /* TRIPLE PRIME */
6548 || ch == 0x2035 /* REVERSED PRIME */
6549 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
6550 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
6551 || ch == 0x20A7 /* PESETA SIGN */
6552 || ch == 0x2103 /* DEGREE CELSIUS */
6553 || ch == 0x2109 /* DEGREE FAHRENHEIT */
6554 || ch == 0xFDFC /* RIAL SIGN */
6555 || ch == 0xFE6A /* SMALL PERCENT SIGN */
6556 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
6557 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
6558 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6559 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6560 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6561 || ch == 0x09F2 /* BENGALI RUPEE MARK */
6562 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
6563 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6564 || ch == 0x0D79 /* MALAYALAM DATE MARK */
6565 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
6566 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
6567 attr |= (int64_t) 1 << LBP_PO;
6569 /* prefix (numeric) */
6570 if ((unicode_attributes[ch].category[0] == 'S'
6571 && unicode_attributes[ch].category[1] == 'c')
6572 || ch == 0x002B /* PLUS SIGN */
6573 || ch == 0x005C /* REVERSE SOLIDUS */
6574 || ch == 0x00B1 /* PLUS-MINUS SIGN */
6575 || ch == 0x2116 /* NUMERO SIGN */
6576 || ch == 0x2212 /* MINUS SIGN */
6577 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
6578 if (!(attr & ((int64_t) 1 << LBP_PO)))
6579 attr |= (int64_t) 1 << LBP_PR;
6581 /* symbols allowing breaks */
6582 if (ch == 0x002F /* SOLIDUS */)
6583 attr |= (int64_t) 1 << LBP_SY;
6585 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
6586 attr |= (int64_t) 1 << LBP_H2;
6588 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
6589 attr |= (int64_t) 1 << LBP_H3;
6591 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
6592 attr |= (int64_t) 1 << LBP_JL;
6594 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
6595 attr |= (int64_t) 1 << LBP_JV;
6597 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
6598 attr |= (int64_t) 1 << LBP_JT;
6600 /* complex context (South East Asian) */
6601 if (((unicode_attributes[ch].category[0] == 'C'
6602 && unicode_attributes[ch].category[1] == 'f')
6603 || (unicode_attributes[ch].category[0] == 'L'
6604 && (unicode_attributes[ch].category[1] == 'm'
6605 || unicode_attributes[ch].category[1] == 'o'))
6606 || (unicode_attributes[ch].category[0] == 'M'
6607 && (unicode_attributes[ch].category[1] == 'c'
6608 || unicode_attributes[ch].category[1] == 'n')
6609 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6610 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6611 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
6612 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6613 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6614 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6615 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6616 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6617 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6618 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
6619 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6620 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6621 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6622 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6623 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6624 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
6625 attr |= (int64_t) 1 << LBP_SA;
6627 /* attached characters and combining marks */
6628 if ((unicode_attributes[ch].category[0] == 'M'
6629 && (unicode_attributes[ch].category[1] == 'c'
6630 || unicode_attributes[ch].category[1] == 'e'
6631 || unicode_attributes[ch].category[1] == 'n'))
6632 || (unicode_attributes[ch].category[0] == 'C'
6633 && (unicode_attributes[ch].category[1] == 'c'
6634 || unicode_attributes[ch].category[1] == 'f')
6635 && ch != 0x110BD /* KAITHI NUMBER SIGN */))
6636 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6637 attr |= (int64_t) 1 << LBP_CM;
6640 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6641 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6642 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6643 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6644 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6645 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6646 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6647 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6648 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6649 || ch == 0xFE62 /* SMALL PLUS SIGN */
6650 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6651 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6652 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6653 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6654 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
6655 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
6656 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
6657 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
6658 || (ch >= 0x3000 && ch <= 0x33FF
6659 && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
6660 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6661 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
6662 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
6663 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
6664 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
6665 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
6666 || ch == 0xFE45 /* SESAME DOT */
6667 || ch == 0xFE46 /* WHITE SESAME DOT */
6668 || ch == 0xFE49 /* DASHED OVERLINE */
6669 || ch == 0xFE4A /* CENTRELINE OVERLINE */
6670 || ch == 0xFE4B /* WAVY OVERLINE */
6671 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
6672 || ch == 0xFE4D /* DASHED LOW LINE */
6673 || ch == 0xFE4E /* CENTRELINE LOW LINE */
6674 || ch == 0xFE4F /* WAVY LOW LINE */
6675 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
6676 || ch == 0xFE58 /* SMALL EM DASH */
6677 || ch == 0xFE5F /* SMALL NUMBER SIGN */
6678 || ch == 0xFE60 /* SMALL AMPERSAND */
6679 || ch == 0xFE61 /* SMALL ASTERISK */
6680 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
6681 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
6682 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
6683 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
6684 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
6685 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
6686 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
6687 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
6688 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
6689 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
6690 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
6691 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
6692 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
6693 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
6694 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
6695 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
6696 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
6697 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
6698 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
6699 || ch == 0xFF5E /* FULLWIDTH TILDE */
6700 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
6701 || ch == 0xFFE3 /* FULLWIDTH MACRON */
6702 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
6703 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6704 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
6705 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
6706 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
6707 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
6708 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
6709 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
6711 /* ambiguous (ideograph) ? */
6712 if ((unicode_width[ch] != NULL
6713 && unicode_width[ch][0] == 'A'
6715 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6716 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
6717 attr |= (int64_t) 1 << LBP_AI;
6719 attr |= (int64_t) 1 << LBP_ID;
6722 /* ordinary alphabetic and symbol characters */
6723 if ((unicode_attributes[ch].category[0] == 'L'
6724 && (unicode_attributes[ch].category[1] == 'u'
6725 || unicode_attributes[ch].category[1] == 'l'
6726 || unicode_attributes[ch].category[1] == 't'
6727 || unicode_attributes[ch].category[1] == 'm'
6728 || unicode_attributes[ch].category[1] == 'o'))
6729 || (unicode_attributes[ch].category[0] == 'S'
6730 && (unicode_attributes[ch].category[1] == 'm'
6731 || unicode_attributes[ch].category[1] == 'k'
6732 || unicode_attributes[ch].category[1] == 'o'))
6733 || (unicode_attributes[ch].category[0] == 'N'
6734 && (unicode_attributes[ch].category[1] == 'l'
6735 || unicode_attributes[ch].category[1] == 'o'))
6736 || (unicode_attributes[ch].category[0] == 'P'
6737 && (unicode_attributes[ch].category[1] == 'c'
6738 || unicode_attributes[ch].category[1] == 'd'
6739 || unicode_attributes[ch].category[1] == 'o'))
6740 || ch == 0x0600 /* ARABIC NUMBER SIGN */
6741 || ch == 0x0601 /* ARABIC SIGN SANAH */
6742 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
6743 || ch == 0x0603 /* ARABIC SIGN SAFHA */
6744 || ch == 0x06DD /* ARABIC END OF AYAH */
6745 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
6746 || ch == 0x2061 /* FUNCTION APPLICATION */
6747 || ch == 0x2062 /* INVISIBLE TIMES */
6748 || ch == 0x2063 /* INVISIBLE SEPARATOR */
6749 || ch == 0x2064 /* INVISIBLE PLUS */
6750 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6751 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
6752 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
6754 /* ambiguous (alphabetic) ? */
6755 if ((unicode_width[ch] != NULL
6756 && unicode_width[ch][0] == 'A'
6758 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
6759 && ch != 0x2022 /* BULLET */
6760 && ch != 0x203E /* OVERLINE */
6761 && ch != 0x2126 /* OHM SIGN */
6762 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
6763 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
6764 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
6765 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
6766 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
6767 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
6768 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
6769 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
6770 || ch == 0x00A7 /* SECTION SIGN */
6771 || ch == 0x00A8 /* DIAERESIS */
6772 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
6773 || ch == 0x00B2 /* SUPERSCRIPT TWO */
6774 || ch == 0x00B3 /* SUPERSCRIPT THREE */
6775 || ch == 0x00B6 /* PILCROW SIGN */
6776 || ch == 0x00B7 /* MIDDLE DOT */
6777 || ch == 0x00B8 /* CEDILLA */
6778 || ch == 0x00B9 /* SUPERSCRIPT ONE */
6779 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
6780 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
6781 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
6782 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
6783 || ch == 0x00D7 /* MULTIPLICATION SIGN */
6784 || ch == 0x00F7 /* DIVISION SIGN */
6785 || ch == 0x02C7 /* CARON */
6786 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
6787 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
6788 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
6789 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
6790 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
6791 || ch == 0x02D8 /* BREVE */
6792 || ch == 0x02D9 /* DOT ABOVE */
6793 || ch == 0x02DA /* RING ABOVE */
6794 || ch == 0x02DB /* OGONEK */
6795 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
6796 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6797 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
6798 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6799 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
6800 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
6801 || ch == 0x2616 /* WHITE SHOGI PIECE */
6802 || ch == 0x2617 /* BLACK SHOGI PIECE */)
6803 attr |= (int64_t) 1 << LBP_AI;
6805 attr |= (int64_t) 1 << LBP_AL;
6806 attr &= ~((int64_t) 1 << LBP_CM);
6811 /* Unassigned character. */
6812 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
6813 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
6814 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
6815 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
6816 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
6817 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6818 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
6819 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6820 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
6821 attr |= (int64_t) 1 << LBP_ID;
6826 attr |= (int64_t) 1 << LBP_XX;
6831 /* Output the line breaking properties in a human readable format.
   STREAM is the output stream to write to with fprintf().
   The loop visits every code point i with 0 <= i < 0x110000 and obtains
   its property bit set from get_lbp (i).  A code point whose bit set is
   exactly the single bit LBP_XX produces no output.  For every other code
   point one line is written: "0x" followed by the code point in upper-case
   hexadecimal, zero-padded to at least four digits, then " LBP_xx" for
   each bit that is set, in the order of the PRINT_BIT invocations below,
   then a newline.  */
6833 debug_output_lbp (FILE *stream)
6837 for (i = 0; i < 0x110000; i++)
6839 int64_t attr = get_lbp (i);
/* "<<" binds tighter than "!=", so this compares attr against the
   one-bit mask for LBP_XX.  */
6840 if (attr != (int64_t) 1 << LBP_XX)
6842 fprintf (stream, "0x%04X", i);
/* PRINT_BIT writes a space followed by the stringized name of BIT when
   bit number BIT is set in ATTR.  */
6843 #define PRINT_BIT(attr,bit) \
6844 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
6845 PRINT_BIT(attr,LBP_BK);
6846 PRINT_BIT(attr,LBP_CM);
6847 PRINT_BIT(attr,LBP_WJ);
6848 PRINT_BIT(attr,LBP_ZW);
6849 PRINT_BIT(attr,LBP_GL);
6850 PRINT_BIT(attr,LBP_SP);
6851 PRINT_BIT(attr,LBP_B2);
6852 PRINT_BIT(attr,LBP_BA);
6853 PRINT_BIT(attr,LBP_BB);
6854 PRINT_BIT(attr,LBP_HY);
6855 PRINT_BIT(attr,LBP_CB);
6856 PRINT_BIT(attr,LBP_CL);
6857 PRINT_BIT(attr,LBP_CP);
6858 PRINT_BIT(attr,LBP_EX);
6859 PRINT_BIT(attr,LBP_IN);
6860 PRINT_BIT(attr,LBP_NS);
6861 PRINT_BIT(attr,LBP_OP);
6862 PRINT_BIT(attr,LBP_QU);
6863 PRINT_BIT(attr,LBP_IS);
6864 PRINT_BIT(attr,LBP_NU);
6865 PRINT_BIT(attr,LBP_PO);
6866 PRINT_BIT(attr,LBP_PR);
6867 PRINT_BIT(attr,LBP_SY);
6868 PRINT_BIT(attr,LBP_AI);
6869 PRINT_BIT(attr,LBP_AL);
6870 PRINT_BIT(attr,LBP_H2);
6871 PRINT_BIT(attr,LBP_H3);
6872 PRINT_BIT(attr,LBP_ID);
6873 PRINT_BIT(attr,LBP_JL);
6874 PRINT_BIT(attr,LBP_JV);
6875 PRINT_BIT(attr,LBP_JT);
6876 PRINT_BIT(attr,LBP_SA);
6877 PRINT_BIT(attr,LBP_XX);
/* Terminate the line of the current code point.  */
6879 fprintf (stream, "\n");
/* Write the listing produced by debug_output_lbp () to the file named
   FILENAME, which is opened with mode "w".
   Diagnostics go to stderr: "cannot open ... for writing" for the open
   failure case, and "error writing to ..." when ferror () reports a stream
   error or fclose () returns nonzero.  */
6885 debug_output_lbrk_tables (const char *filename)
6889 stream = fopen (filename, "w");
6892 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6896 debug_output_lbp (stream);
/* fclose () is only attempted when ferror () found no error; a nonzero
   result from either is reported.  */
6898 if (ferror (stream) || fclose (stream))
6900 fprintf (stderr, "error writing to '%s'\n", filename);
6905 /* The line breaking property from the LineBreak.txt file.
   Indexed by code point, 0 <= index < 0x110000.  Each element holds one
   LBP_* value; fill_org_lbp () sets every element to LBP_XX before it
   stores the values read from the file.  */
6906 int unicode_org_lbp[0x110000];
6908 /* Stores in unicode_org_lbp[] the line breaking property from the
6909 LineBreak.txt file.
   LINEBREAK_FILENAME is the name of that file; it is opened with mode "r".
   Every element of unicode_org_lbp[] is first set to LBP_XX.  Each data
   line is split into three fields by getfield (): field0 up to ';' (a
   hexadecimal code point, or a range written "first..last"), field1 up to
   ' ' (the property value name) and field2 up to the newline.  The
   property value is then stored for the single code point or range.
   Diagnostics for a failed fopen, a short line, an unknown property value
   and a read error are written to stderr.  */
6911 fill_org_lbp (const char *linebreak_filename)
6915 char field0[FIELDLEN];
6916 char field1[FIELDLEN];
6917 char field2[FIELDLEN];
/* Default for code points that the file does not mention.  */
6920 for (i = 0; i < 0x110000; i++)
6921 unicode_org_lbp[i] = LBP_XX;
6923 stream = fopen (linebreak_filename, "r");
6926 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume characters up to and including the next newline, or up to EOF.  */
6942 do c = getc (stream); while (c != EOF && c != '\n');
/* n accumulates the three getfield () results.
   NOTE(review): presumably the third argument is the field delimiter and
   the result counts the fields read -- confirm against getfield ().  */
6946 n = getfield (stream, field0, ';');
6947 n += getfield (stream, field1, ' ');
6948 n += getfield (stream, field2, '\n');
6953 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY (LBP_xx) compares field1 with the stringized enumerator name minus
   its first four characters (the "LBP_" prefix), e.g. "BK" for LBP_BK,
   and on a match selects that enumerator as the value.  */
6957 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* These LineBreak.txt value names are mapped onto other LBP_* values
   rather than matched by name: LF, CR and NL onto LBP_BK, SG onto
   LBP_XX.  */
6993 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
6994 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
6995 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
6996 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
6999 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7000 field1, linebreak_filename, lineno);
/* field0 starts with a hexadecimal code point; strtoul () stops at the
   first character that is not a hexadecimal digit.
   NOTE(review): no check that i (or j below) is less than 0x110000 is
   visible before these values index unicode_org_lbp[] -- confirm that
   malformed input cannot cause an out-of-bounds store.  */
7003 i = strtoul (field0, NULL, 16);
7004 if (strstr (field0, "..") != NULL)
7006 /* Deal with a range. */
/* The upper bound is the hexadecimal number following the "..".  */
7007 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7009 unicode_org_lbp[i] = value;
7013 /* Single character line. */
7014 unicode_org_lbp[i] = value;
/* fclose () is only attempted when ferror () found no error; a nonzero
   result from either is reported.  */
7018 if (ferror (stream) || fclose (stream))
7020 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7025 /* Output the line breaking properties in a human readable format.
   This variant prints the values stored in unicode_org_lbp[], i.e. the
   properties taken from the LineBreak.txt file.
   STREAM is the output stream to write to with fprintf().
   The loop visits every code point i with 0 <= i < 0x110000.  An output
   line consists of "0x" followed by the code point in upper-case
   hexadecimal, zero-padded to at least four digits, then " LBP_xx" for
   the property stored for that code point, then a newline.  */
7027 debug_output_org_lbp (FILE *stream)
7031 for (i = 0; i < 0x110000; i++)
/* Each array element is a single LBP_* value, not a bit set.  */
7033 int attr = unicode_org_lbp[i];
7036 fprintf (stream, "0x%04X", i);
/* PRINT_BIT writes a space followed by the stringized name of BIT when
   ATTR is equal to BIT.  */
7037 #define PRINT_BIT(attr,bit) \
7038 if (attr == bit) fprintf (stream, " " #bit);
7039 PRINT_BIT(attr,LBP_BK);
7040 PRINT_BIT(attr,LBP_CM);
7041 PRINT_BIT(attr,LBP_WJ);
7042 PRINT_BIT(attr,LBP_ZW);
7043 PRINT_BIT(attr,LBP_GL);
7044 PRINT_BIT(attr,LBP_SP);
7045 PRINT_BIT(attr,LBP_B2);
7046 PRINT_BIT(attr,LBP_BA);
7047 PRINT_BIT(attr,LBP_BB);
7048 PRINT_BIT(attr,LBP_HY);
7049 PRINT_BIT(attr,LBP_CB);
7050 PRINT_BIT(attr,LBP_CL);
7051 PRINT_BIT(attr,LBP_CP);
7052 PRINT_BIT(attr,LBP_EX);
7053 PRINT_BIT(attr,LBP_IN);
7054 PRINT_BIT(attr,LBP_NS);
7055 PRINT_BIT(attr,LBP_OP);
7056 PRINT_BIT(attr,LBP_QU);
7057 PRINT_BIT(attr,LBP_IS);
7058 PRINT_BIT(attr,LBP_NU);
7059 PRINT_BIT(attr,LBP_PO);
7060 PRINT_BIT(attr,LBP_PR);
7061 PRINT_BIT(attr,LBP_SY);
7062 PRINT_BIT(attr,LBP_AI);
7063 PRINT_BIT(attr,LBP_AL);
7064 PRINT_BIT(attr,LBP_H2);
7065 PRINT_BIT(attr,LBP_H3);
7066 PRINT_BIT(attr,LBP_ID);
7067 PRINT_BIT(attr,LBP_JL);
7068 PRINT_BIT(attr,LBP_JV);
7069 PRINT_BIT(attr,LBP_JT);
7070 PRINT_BIT(attr,LBP_SA);
7071 PRINT_BIT(attr,LBP_XX);
/* Terminate the line of the current code point.  */
7073 fprintf (stream, "\n");
/* Write the listing produced by debug_output_org_lbp () to the file named
   FILENAME, which is opened with mode "w".
   Diagnostics go to stderr: "cannot open ... for writing" for the open
   failure case, and "error writing to ..." when ferror () reports a stream
   error or fclose () returns nonzero.  */
7079 debug_output_org_lbrk_tables (const char *filename)
7083 stream = fopen (filename, "w");
7086 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7090 debug_output_org_lbp (stream);
/* fclose () is only attempted when ferror () found no error; a nonzero
   result from either is reported.  */
7092 if (ferror (stream) || fclose (stream))
7094 fprintf (stderr, "error writing to '%s'\n", filename);
7099 /* Construction of sparse 3-level tables. */
/* Parameters of the lbp_table instance: elements are unsigned char and the
   default element is LBP_XX.  xmalloc/xrealloc are mapped directly onto
   malloc/realloc, so these macros add no out-of-memory handling.
   NOTE(review): presumably consumed by a generic 3-level table template
   included right after this point; the include is not visible here.  */
7100 #define TABLE lbp_table
7101 #define ELEMENT unsigned char
7102 #define DEFAULT LBP_XX
7103 #define xmalloc malloc
7104 #define xrealloc realloc
/* Builds the line break property table for all code points and prints it as
   C source.  STREAM1 receives the lbrkprop_header_N macros, the lbrkprop_t
   typedef and the extern declaration of unilbrkprop; STREAM2 receives the
   definition of unilbrkprop with its three arrays level1, level2, level3.
   NOTE(review): the return type, local declarations, braces and the error
   actions are not visible in this excerpt.  */
7108 output_lbp (FILE *stream1, FILE *stream2)
7112 unsigned int level1_offset, level2_offset, level3_offset;
7116 lbp_table_init (&t);
7118 for (i = 0; i < 0x110000; i++)
7120 int64_t attr = get_lbp (i);
7122 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the expression is nonzero
   exactly when more than one bit is set.  */
7123 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is LBP_XX are not added to the table.  */
7126 if (attr != (int64_t) 1 << LBP_XX)
7128 unsigned int log2_attr;
/* Converts the one-bit mask into its bit index; the loop body is empty.  */
7129 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7131 lbp_table_add (&t, i, log2_attr);
7135 lbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past a header
   of 5 uint32_t words, past the level1 array, and past the level2 array.
   NOTE(review): the left-hand sides are not visible; presumably they are
   level1_offset, level2_offset and level3_offset in that order.  */
7138 5 * sizeof (uint32_t);
7140 5 * sizeof (uint32_t)
7141 + t.level1_size * sizeof (uint32_t);
7143 5 * sizeof (uint32_t)
7144 + t.level1_size * sizeof (uint32_t)
7145 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros.  */
7147 for (i = 0; i < 5; i++)
7148 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7149 ((uint32_t *) t.result)[i]);
7150 fprintf (stream1, "\n");
7151 fprintf (stream1, "typedef struct\n");
7152 fprintf (stream1, " {\n");
7153 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7154 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7155 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7156 fprintf (stream1, " }\n");
7157 fprintf (stream1, "lbrkprop_t;\n");
7158 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7160 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7161 fprintf (stream2, "{\n");
7162 fprintf (stream2, " {");
/* Level 1: eight entries per output line.  Each entry is printed either as
   -1 or as the level2 index obtained by rebasing the stored byte offset;
   the condition selecting between the two is not visible in this excerpt.  */
7163 if (t.level1_size > 8)
7164 fprintf (stream2, "\n ");
7165 for (i = 0; i < t.level1_size; i++)
7168 if (i > 0 && (i % 8) == 0)
7169 fprintf (stream2, "\n ");
7170 offset = ((uint32_t *) (t.result + level1_offset))[i];
7172 fprintf (stream2, " %5d", -1);
7174 fprintf (stream2, " %5zu",
7175 (offset - level2_offset) / sizeof (uint32_t));
7176 if (i+1 < t.level1_size)
7177 fprintf (stream2, ",");
7179 if (t.level1_size > 8)
7180 fprintf (stream2, "\n ");
7181 fprintf (stream2, " },\n");
7182 fprintf (stream2, " {");
/* Level 2: same layout; offsets are rebased to level3 and counted in
   unsigned char elements.  */
7183 if (t.level2_size << t.q > 8)
7184 fprintf (stream2, "\n ");
7185 for (i = 0; i < t.level2_size << t.q; i++)
7188 if (i > 0 && (i % 8) == 0)
7189 fprintf (stream2, "\n ");
7190 offset = ((uint32_t *) (t.result + level2_offset))[i];
7192 fprintf (stream2, " %5d", -1);
7194 fprintf (stream2, " %5zu",
7195 (offset - level3_offset) / sizeof (unsigned char));
7196 if (i+1 < t.level2_size << t.q)
7197 fprintf (stream2, ",");
7199 if (t.level2_size << t.q > 8)
7200 fprintf (stream2, "\n ");
7201 fprintf (stream2, " },\n");
7202 fprintf (stream2, " {");
/* Level 3: each stored value is printed symbolically; CASE maps a constant
   to its own spelling through stringification.  The switch statement and
   its CASE list are not visible in this excerpt.  */
7203 if (t.level3_size << t.p > 8)
7204 fprintf (stream2, "\n ");
7205 for (i = 0; i < t.level3_size << t.p; i++)
7207 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7208 const char *value_string;
7211 #define CASE(x) case x: value_string = #x; break;
7249 if (i > 0 && (i % 8) == 0)
7250 fprintf (stream2, "\n ");
7251 fprintf (stream2, " %s%s", value_string,
7252 (i+1 < t.level3_size << t.p ? "," : ""));
7254 if (t.level3_size << t.p > 8)
7255 fprintf (stream2, "\n ");
7256 fprintf (stream2, " }\n");
7257 fprintf (stream2, "};\n");
/* Creates the two line break table files FILENAME1 and FILENAME2.  Both
   files get the same banner and GPL notice; output_lbp then writes the
   declarations to the first stream and the table data to the second.
   VERSION is the Unicode version string inserted into the banner (the
   argument line of that fprintf is not visible in this excerpt).
   NOTE(review): the banner still names the generator "gen-lbrk", while the
   other table writers in this file say gen-uni-tables; changing the string
   would alter the generated files, so it is left untouched here.  */
7261 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
7263 const char *filenames[2];
7267 filenames[0] = filename1;
7268 filenames[1] = filename2;
/* Open both output files before anything is written.  */
7270 for (i = 0; i < 2; i++)
7272 streams[i] = fopen (filenames[i], "w");
7273 if (streams[i] == NULL)
7275 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical banner and licence text for both files.  */
7280 for (i = 0; i < 2; i++)
7282 FILE *stream = streams[i];
7284 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7285 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
7286 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
7288 fprintf (stream, "\n");
7290 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7291 still carries the GPL header), and it's gnulib-tool which replaces the
7292 GPL header with an LGPL header. */
7293 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
7294 fprintf (stream, "\n");
7295 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7296 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7297 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7298 fprintf (stream, " (at your option) any later version.\n");
7299 fprintf (stream, "\n");
7300 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7301 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7302 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7303 fprintf (stream, " GNU General Public License for more details.\n");
7304 fprintf (stream, "\n");
7305 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7306 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7307 fprintf (stream, "\n");
7310 output_lbp (streams[0], streams[1]);
/* Check and close each file separately so the diagnostic names the file
   that failed.  */
7312 for (i = 0; i < 2; i++)
7314 if (ferror (streams[i]) || fclose (streams[i]))
7316 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7322 /* ========================================================================= */
7324 /* Word break property.
7325 Updated for Unicode TR #29 revision 17. */
7327 /* Possible values of the Word_Break property. */
7342 WBP_EXTENDNUMLET = 7
7345 /* Returns the word breaking property for ch, as a bit mask. */
/* Bit number WBP_x is set in the result for every class x whose test below
   matches CH.  Later tests consult bits set by earlier ones (ALetter looks
   at the Katakana and Extend bits), so the order of the tests matters.
   NOTE(review): the declaration and return of attr, the CR and LF
   conditions, some continuation lines of the longer conditions and the
   condition guarding WBP_OTHER are not visible in this excerpt.  */
7347 get_wbp (unsigned int ch)
7351 if (unicode_attributes[ch].name != NULL)
7354 attr |= 1 << WBP_CR;
7357 attr |= 1 << WBP_LF;
7359 if (ch == 0x000B || ch == 0x000C
7361 || ch == 0x2028 || ch == 0x2029)
7362 attr |= 1 << WBP_NEWLINE;
/* Extend: Grapheme_Extend property or general category "Mc".  */
7364 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
7365 || (unicode_attributes[ch].category != NULL
7366 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
7367 attr |= 1 << WBP_EXTEND;
/* Format: general category "Cf" except U+200B, U+200C and U+200D.  */
7369 if (unicode_attributes[ch].category != NULL
7370 && strcmp (unicode_attributes[ch].category, "Cf") == 0
7371 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
7372 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana" plus a few explicitly listed code points.  */
7374 if ((unicode_scripts[ch] < numscripts
7375 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
7376 || (ch >= 0x3031 && ch <= 0x3035)
7377 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
7379 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic, but not ideographic, not already Katakana, not line
   break class SA, not script "Hiragana", and not already Extend.  */
7381 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
7383 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
7384 && (attr & (1 << WBP_KATAKANA)) == 0
7385 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
7386 && !(unicode_scripts[ch] < numscripts
7387 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
7388 && (attr & (1 << WBP_EXTEND)) == 0)
7389 attr |= 1 << WBP_ALETTER;
7391 if (is_WBP_MIDNUMLET (ch))
7392 attr |= 1 << WBP_MIDNUMLET;
7394 if (is_WBP_MIDLETTER (ch))
7395 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or one of the listed code points, excluding
   U+003A, U+FE13 and U+002E.  */
7397 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
7398 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
7400 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
7401 attr |= 1 << WBP_MIDNUM;
7403 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
7405 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category "Pc".  */
7407 if (unicode_attributes[ch].category != NULL
7408 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
7409 attr |= 1 << WBP_EXTENDNUMLET;
7414 attr |= 1 << WBP_OTHER;
7419 /* Output the word break property in a human readable format. */
/* Computes get_wbp for every code point 0 .. 0x10FFFF.  Code points whose
   mask is exactly the WBP_OTHER bit are skipped; for the others a line
   "0x<hex code point>" followed by the name of every set class is written
   to STREAM.  The values here are bit masks, so several names can appear on
   one line.  */
7421 debug_output_wbp (FILE *stream)
7425 for (i = 0; i < 0x110000; i++)
7427 int attr = get_wbp (i);
7428 if (attr != 1 << WBP_OTHER)
7430 fprintf (stream, "0x%04X", i);
7431 if (attr & (1 << WBP_CR))
7432 fprintf (stream, " CR");
7433 if (attr & (1 << WBP_LF))
7434 fprintf (stream, " LF");
7435 if (attr & (1 << WBP_NEWLINE))
7436 fprintf (stream, " Newline");
7437 if (attr & (1 << WBP_EXTEND))
7438 fprintf (stream, " Extend");
7439 if (attr & (1 << WBP_FORMAT))
7440 fprintf (stream, " Format");
7441 if (attr & (1 << WBP_KATAKANA))
7442 fprintf (stream, " Katakana");
7443 if (attr & (1 << WBP_ALETTER))
7444 fprintf (stream, " ALetter");
7445 if (attr & (1 << WBP_MIDNUMLET))
7446 fprintf (stream, " MidNumLet");
7447 if (attr & (1 << WBP_MIDLETTER))
7448 fprintf (stream, " MidLetter");
7449 if (attr & (1 << WBP_MIDNUM))
7450 fprintf (stream, " MidNum");
7451 if (attr & (1 << WBP_NUMERIC))
7452 fprintf (stream, " Numeric");
7453 if (attr & (1 << WBP_EXTENDNUMLET))
7454 fprintf (stream, " ExtendNumLet");
7455 fprintf (stream, "\n");
/* Opens FILENAME for writing and fills it with the dump produced by
   debug_output_wbp.  Failures are reported on stderr; the statements that
   follow each diagnostic are not visible in this excerpt.  */
7461 debug_output_wbrk_tables (const char *filename)
7465 stream = fopen (filename, "w");
7468 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7472 debug_output_wbp (stream);
/* A failing fclose is reported as a write error as well.  */
7474 if (ferror (stream) || fclose (stream))
7476 fprintf (stderr, "error writing to '%s'\n", filename);
7481 /* The word break property from the WordBreakProperty.txt file. */
/* Indexed by code point (0 .. 0x10FFFF).  Each element holds one WBP_*
   value, not a bit mask; fill_org_wbp resets every element to WBP_OTHER
   before reading the file.  */
7482 int unicode_org_wbp[0x110000];
7484 /* Stores in unicode_org_wbp[] the word break property from the
7485 WordBreakProperty.txt file. */
/* Every element is first reset to WBP_OTHER.  The file is then read one
   line at a time (at most 200 characters per line).  A data line is either
   "<hex>..<hex> ; <name>" or "<hex> ; <name>"; the property name is mapped
   to a WBP_* value and stored for every code point of the range.
   NOTE(review): the loop header, the handling of skipped lines, the setting
   of i2 for single code points and the actions after each diagnostic are
   not visible in this excerpt.  */
7487 fill_org_wbp (const char *wordbreakproperty_filename)
7492 for (i = 0; i < 0x110000; i++)
7493 unicode_org_wbp[i] = WBP_OTHER;
7495 stream = fopen (wordbreakproperty_filename, "r");
7498 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
7505 unsigned int i1, i2;
7506 char padding[200+1];
7507 char propname[200+1];
/* Reads one line without its newline; the trailing "\n" in the format also
   consumes any following whitespace, including blank lines.  */
7510 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Tests for an empty line or a '#' comment line.  */
7513 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  */
7516 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7518 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7520 fprintf (stderr, "parse error in '%s'\n",
7521 wordbreakproperty_filename);
/* PROP expands to "if (...) propvalue = value; else", so the list below
   forms one if/else chain ending in the "unknown property" diagnostic.  */
7526 #define PROP(name,value) \
7527 if (strcmp (propname, name) == 0) propvalue = value; else
7530 PROP ("Newline", WBP_NEWLINE)
7531 PROP ("Extend", WBP_EXTEND)
7532 PROP ("Format", WBP_FORMAT)
7533 PROP ("Katakana", WBP_KATAKANA)
7534 PROP ("ALetter", WBP_ALETTER)
7535 PROP ("MidNumLet", WBP_MIDNUMLET)
7536 PROP ("MidLetter", WBP_MIDLETTER)
7537 PROP ("MidNum", WBP_MIDNUM)
7538 PROP ("Numeric", WBP_NUMERIC)
7539 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7542 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
7543 wordbreakproperty_filename);
/* Range sanity check that guards the array writes below.  */
7546 if (!(i1 <= i2 && i2 < 0x110000))
7549 for (i = i1; i <= i2; i++)
7550 unicode_org_wbp[i] = propvalue;
7553 if (ferror (stream) || fclose (stream))
7555 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
7560 /* Output the word break property in a human readable format. */
/* Dumps unicode_org_wbp[]: for every code point whose stored value is not
   WBP_OTHER, writes "0x<hex code point>", the property name and a newline
   to STREAM.  A value that matches none of the listed names is printed
   as " ??".  */
7562 debug_output_org_wbp (FILE *stream)
7566 for (i = 0; i < 0x110000; i++)
7568 int propvalue = unicode_org_wbp[i];
7569 if (propvalue != WBP_OTHER)
7571 fprintf (stream, "0x%04X", i);
/* PROP compares for equality and ends in "else", so the list is a single
   if/else chain that prints exactly one name or the " ??" fallback.  */
7572 #define PROP(name,value) \
7573 if (propvalue == value) fprintf (stream, " " name); else
7576 PROP ("Newline", WBP_NEWLINE)
7577 PROP ("Extend", WBP_EXTEND)
7578 PROP ("Format", WBP_FORMAT)
7579 PROP ("Katakana", WBP_KATAKANA)
7580 PROP ("ALetter", WBP_ALETTER)
7581 PROP ("MidNumLet", WBP_MIDNUMLET)
7582 PROP ("MidLetter", WBP_MIDLETTER)
7583 PROP ("MidNum", WBP_MIDNUM)
7584 PROP ("Numeric", WBP_NUMERIC)
7585 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7587 fprintf (stream, " ??");
7588 fprintf (stream, "\n");
/* Opens FILENAME for writing and fills it with the dump produced by
   debug_output_org_wbp.  Failures are reported on stderr; the statements
   that follow each diagnostic are not visible in this excerpt.  */
7594 debug_output_org_wbrk_tables (const char *filename)
7598 stream = fopen (filename, "w");
7601 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7605 debug_output_org_wbp (stream);
/* A failing fclose is reported as a write error as well.  */
7607 if (ferror (stream) || fclose (stream))
7609 fprintf (stderr, "error writing to '%s'\n", filename);
7614 /* Construction of sparse 3-level tables. */
/* Parameters of the wbp_table instance: elements are unsigned char and the
   default element is WBP_OTHER.  xmalloc/xrealloc are mapped directly onto
   malloc/realloc, so these macros add no out-of-memory handling.
   NOTE(review): presumably consumed by a generic 3-level table template
   included right after this point; the include is not visible here.  */
7615 #define TABLE wbp_table
7616 #define ELEMENT unsigned char
7617 #define DEFAULT WBP_OTHER
7618 #define xmalloc malloc
7619 #define xrealloc realloc
/* Builds the word break property table for all code points and prints it to
   STREAM as C source: the wbrkprop_header_N macros, the wbrkprop_t typedef
   and the static definition of uniwbrkprop with its arrays level1, level2
   and level3.
   NOTE(review): the return type, local declarations, braces and the error
   actions are not visible in this excerpt.  */
7623 output_wbp (FILE *stream)
7627 unsigned int level1_offset, level2_offset, level3_offset;
7631 wbp_table_init (&t);
7633 for (i = 0; i < 0x110000; i++)
7635 int attr = get_wbp (i);
7637 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the expression is nonzero
   exactly when more than one bit is set.  */
7638 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is WBP_OTHER are not added to the table.  */
7641 if (attr != 1 << WBP_OTHER)
7643 unsigned int log2_attr;
/* Converts the one-bit mask into its bit index; the loop body is empty.  */
7644 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7646 wbp_table_add (&t, i, log2_attr);
7650 wbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past a header
   of 5 uint32_t words, past the level1 array, and past the level2 array.
   NOTE(review): the left-hand sides are not visible; presumably they are
   level1_offset, level2_offset and level3_offset in that order.  */
7653 5 * sizeof (uint32_t);
7655 5 * sizeof (uint32_t)
7656 + t.level1_size * sizeof (uint32_t);
7658 5 * sizeof (uint32_t)
7659 + t.level1_size * sizeof (uint32_t)
7660 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros.  */
7662 for (i = 0; i < 5; i++)
7663 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
7664 ((uint32_t *) t.result)[i]);
7665 fprintf (stream, "\n");
7666 fprintf (stream, "typedef struct\n");
7667 fprintf (stream, " {\n");
7668 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7669 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
7670 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7671 fprintf (stream, " }\n");
7672 fprintf (stream, "wbrkprop_t;\n");
7673 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
7674 fprintf (stream, "{\n");
7675 fprintf (stream, " {");
/* Level 1: eight entries per output line.  Each entry is printed either as
   -1 or as the level2 index obtained by rebasing the stored byte offset;
   the condition selecting between the two is not visible in this excerpt.  */
7676 if (t.level1_size > 8)
7677 fprintf (stream, "\n ");
7678 for (i = 0; i < t.level1_size; i++)
7681 if (i > 0 && (i % 8) == 0)
7682 fprintf (stream, "\n ");
7683 offset = ((uint32_t *) (t.result + level1_offset))[i];
7685 fprintf (stream, " %5d", -1);
7687 fprintf (stream, " %5zu",
7688 (offset - level2_offset) / sizeof (uint32_t));
7689 if (i+1 < t.level1_size)
7690 fprintf (stream, ",");
7692 if (t.level1_size > 8)
7693 fprintf (stream, "\n ");
7694 fprintf (stream, " },\n");
7695 fprintf (stream, " {");
/* Level 2: same layout; offsets are rebased to level3 and counted in
   unsigned char elements.  */
7696 if (t.level2_size << t.q > 8)
7697 fprintf (stream, "\n ");
7698 for (i = 0; i < t.level2_size << t.q; i++)
7701 if (i > 0 && (i % 8) == 0)
7702 fprintf (stream, "\n ");
7703 offset = ((uint32_t *) (t.result + level2_offset))[i];
7705 fprintf (stream, " %5d", -1);
7707 fprintf (stream, " %5zu",
7708 (offset - level3_offset) / sizeof (unsigned char));
7709 if (i+1 < t.level2_size << t.q)
7710 fprintf (stream, ",");
7712 if (t.level2_size << t.q > 8)
7713 fprintf (stream, "\n ");
7714 fprintf (stream, " },\n");
7715 fprintf (stream, " {");
/* Level 3: values are printed symbolically, four per output line (the
   other two levels use eight).  CASE maps a constant to its own spelling;
   the switch statement and part of its CASE list are not visible here.  */
7716 if (t.level3_size << t.p > 4)
7717 fprintf (stream, "\n ");
7718 for (i = 0; i < t.level3_size << t.p; i++)
7720 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7721 const char *value_string;
7724 #define CASE(x) case x: value_string = #x; break;
7733 CASE(WBP_MIDNUMLET);
7734 CASE(WBP_MIDLETTER);
7737 CASE(WBP_EXTENDNUMLET);
7742 if (i > 0 && (i % 4) == 0)
7743 fprintf (stream, "\n ");
7744 fprintf (stream, " %s%s", value_string,
7745 (i+1 < t.level3_size << t.p ? "," : ""));
7747 if (t.level3_size << t.p > 4)
7748 fprintf (stream, "\n ");
7749 fprintf (stream, " }\n");
7750 fprintf (stream, "};\n");
/* Creates the word break table file FILENAME: a banner, the GPL notice and
   then the table written by output_wbp.  VERSION is the Unicode version
   string inserted into the banner (the argument line of that fprintf is not
   visible in this excerpt).  Failures are reported on stderr.
   The banner now describes the file as word breaking properties; it used to
   repeat the line breaking text of the line break table writer.  */
7754 output_wbrk_tables (const char *filename, const char *version)
7758 stream = fopen (filename, "w");
7761 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7765 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7766 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
7767 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7769 fprintf (stream, "\n");
7771 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7772 still carries the GPL header), and it's gnulib-tool which replaces the
7773 GPL header with an LGPL header. */
7774 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
7775 fprintf (stream, "\n");
7776 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7777 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7778 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7779 fprintf (stream, " (at your option) any later version.\n");
7780 fprintf (stream, "\n");
7781 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7782 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7783 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7784 fprintf (stream, " GNU General Public License for more details.\n");
7785 fprintf (stream, "\n");
7786 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7787 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7788 fprintf (stream, "\n");
7790 output_wbp (stream);
/* A failing fclose is reported as a write error as well.  */
7792 if (ferror (stream) || fclose (stream))
7794 fprintf (stderr, "error writing to '%s'\n", filename);
7799 /* ========================================================================= */
7801 /* Grapheme break property.
7802 Updated for Unicode TR #29 revision 17. */
7804 /* Possible values of the Grapheme_Cluster_Break property. */
7813 GBP_SPACINGMARK = 6,
7821 /* Construction of sparse 3-level tables. */
/* Parameters of the gbp_table instance: elements are unsigned char and the
   default element is GBP_OTHER.  xmalloc/xrealloc are mapped directly onto
   malloc/realloc, so these macros add no out-of-memory handling.
   NOTE(review): presumably consumed by a generic 3-level table template
   included right after this point; the include is not visible here.  */
7822 #define TABLE gbp_table
7823 #define ELEMENT unsigned char
7824 #define DEFAULT GBP_OTHER
7825 #define xmalloc malloc
7826 #define xrealloc realloc
7829 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* Indexed by code point (0 .. 0x10FFFF).  Each element holds one GBP_*
   value; fill_org_gbp resets every element to GBP_OTHER before reading the
   file.  */
7830 int unicode_org_gbp[0x110000];
7832 /* Output the unit test data for the grapheme break property. */
/* Writes FILENAME with a banner and GPL notice followed by a comma
   separated list of "{ 0x<hex>, GBP_NAME }" entries taken from
   unicode_org_gbp[].  Consecutive code points with the same value are
   collapsed into one entry whose number is the code point just past the end
   of the run (ch + 1 is printed).
   NOTE(review): the body of the run-skipping loop, the switch with most of
   its CASE entries and the condition guarding the separator are not visible
   in this excerpt.  */
7834 output_gbp_test (const char *filename)
7840 stream = fopen (filename, "w");
7843 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7847 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7848 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
7849 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
7850 fprintf (stream, "\n");
7851 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7852 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7853 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7854 fprintf (stream, " (at your option) any later version.\n");
7855 fprintf (stream, "\n");
7856 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7857 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7858 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7859 fprintf (stream, " GNU General Public License for more details.\n");
7860 fprintf (stream, "\n");
7861 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7862 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7863 fprintf (stream, "\n");
7866 for (ch = 0; ch < 0x110000; ch++)
7868 int gbp = unicode_org_gbp[ch];
7869 const char *gbp_string;
/* Looks ahead while the next code point carries the same value.  */
7871 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
/* CASE maps a GBP_* constant to its own spelling via stringification.  */
7876 #define CASE(x) case x: gbp_string = #x; break;
7883 CASE (GBP_SPACINGMARK)
7895 fprintf (stream, ",\n");
7896 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
7900 fprintf (stream, "\n");
/* A failing fclose is reported as a write error as well.  */
7902 if (ferror (stream) || fclose (stream))
7904 fprintf (stderr, "error writing to '%s'\n", filename);
7909 /* Output the per-character grapheme break property table. */
/* Writes FILENAME with a banner naming the Unicode VERSION, the
   gbrkprop_header_N macros and a static const struct unigbrkprop holding
   the arrays level1, level2 and level3.  Unlike the other tables, every
   code point is added (no default is skipped), level2 is declared as short,
   and level3 packs two 4-bit values into each byte.
   NOTE(review): the return type, local declarations, braces and the error
   actions are not visible in this excerpt.  */
7911 output_gbp_table (const char *filename, const char *version)
7916 unsigned int level1_offset, level2_offset, level3_offset;
7918 stream = fopen (filename, "w");
7921 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7925 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7926 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
7927 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7932 gbp_table_init (&t);
7934 for (ch = 0; ch < 0x110000; ch++)
7935 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
7937 gbp_table_finalize (&t);
7939 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past a header of 5 uint32_t words, past level1 and past
   level2.  NOTE(review): the left-hand sides are not visible; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
7941 5 * sizeof (uint32_t);
7943 5 * sizeof (uint32_t)
7944 + t.level1_size * sizeof (uint32_t);
7946 5 * sizeof (uint32_t)
7947 + t.level1_size * sizeof (uint32_t)
7948 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros.  */
7950 for (i = 0; i < 5; i++)
7951 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
7952 ((uint32_t *) t.result)[i]);
7953 fprintf (stream, "static const\n");
7954 fprintf (stream, "struct\n");
7955 fprintf (stream, " {\n");
7956 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7957 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Half as many bytes as table elements, because of the nibble packing.  */
7958 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
7959 t.level3_size, t.p);
7960 fprintf (stream, " }\n");
7961 fprintf (stream, "unigbrkprop =\n");
7962 fprintf (stream, "{\n");
7963 fprintf (stream, " {");
/* Level 1: eight entries per output line, each printed as -1 or as a level2
   index; the selecting condition is not visible in this excerpt.  */
7964 if (t.level1_size > 8)
7965 fprintf (stream, "\n ");
7966 for (i = 0; i < t.level1_size; i++)
7969 if (i > 0 && (i % 8) == 0)
7970 fprintf (stream, "\n ");
7971 offset = ((uint32_t *) (t.result + level1_offset))[i];
7973 fprintf (stream, " %5d", -1);
7975 fprintf (stream, " %5zu",
7976 (offset - level2_offset) / sizeof (uint32_t));
7977 if (i+1 < t.level1_size)
7978 fprintf (stream, ",");
7980 if (t.level1_size > 8)
7981 fprintf (stream, "\n ");
7982 fprintf (stream, " },\n");
7983 fprintf (stream, " {");
/* Level 2: the offset into level3 is halved so that it indexes the packed
   bytes rather than the individual elements.  */
7984 if (t.level2_size << t.q > 8)
7985 fprintf (stream, "\n ");
7986 for (i = 0; i < t.level2_size << t.q; i++)
7989 if (i > 0 && (i % 8) == 0)
7990 fprintf (stream, "\n ");
7991 offset = ((uint32_t *) (t.result + level2_offset))[i];
7993 fprintf (stream, " %5d", -1);
7995 fprintf (stream, " %5zu",
7996 (offset - level3_offset) / sizeof (uint8_t) / 2);
7997 if (i+1 < t.level2_size << t.q)
7998 fprintf (stream, ",");
8000 if (t.level2_size << t.q > 8)
8001 fprintf (stream, "\n ");
8002 fprintf (stream, " },\n");
8003 fprintf (stream, " {");
/* Level 3: element 2*i goes into the low nibble and element 2*i+1 into the
   high nibble of output byte i.  */
8004 if (t.level3_size << t.p > 8)
8005 fprintf (stream, "\n ");
8006 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
8008 unsigned char *p = (unsigned char *) (t.result + level3_offset);
8009 unsigned char value0 = p[i * 2];
8010 unsigned char value1 = p[i * 2 + 1];
8011 if (i > 0 && (i % 8) == 0)
8012 fprintf (stream, "\n ");
8013 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
8014 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
8016 if (t.level3_size << t.p > 8)
8017 fprintf (stream, "\n ");
8018 fprintf (stream, " }\n");
8019 fprintf (stream, "};\n");
/* A failing fclose is reported as a write error as well.  */
8021 if (ferror (stream) || fclose (stream))
8023 fprintf (stderr, "error writing to '%s'\n", filename);
8028 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8029 GraphemeBreakProperty.txt file. */
/* Every element is first reset to GBP_OTHER.  The file is then read one
   line at a time (at most 200 characters per line).  A data line is either
   "<hex>..<hex> ; <name>" or "<hex> ; <name>"; the property name is mapped
   to a GBP_* value and stored for every code point of the range.  The
   "unknown property value" diagnostic includes the line number.
   NOTE(review): the loop header, the lineno bookkeeping, the handling of
   skipped lines, part of the PROP list and the actions after each
   diagnostic are not visible in this excerpt.  */
8031 fill_org_gbp (const char *graphemebreakproperty_filename)
8037 for (i = 0; i < 0x110000; i++)
8038 unicode_org_gbp[i] = GBP_OTHER;
8040 stream = fopen (graphemebreakproperty_filename, "r");
8043 fprintf (stderr, "error during fopen of '%s'\n",
8044 graphemebreakproperty_filename);
8051 unsigned int i1, i2;
8052 char padding[200+1];
8053 char propname[200+1];
/* Reads one line without its newline; the trailing "\n" in the format also
   consumes any following whitespace, including blank lines.  */
8057 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Tests for an empty line or a '#' comment line.  */
8060 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  */
8063 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8065 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8067 fprintf (stderr, "parse error in '%s'\n",
8068 graphemebreakproperty_filename);
/* PROP expands to "if (...) propvalue = value; else", so the list below
   forms one if/else chain ending in the "unknown property" diagnostic.  */
8073 #define PROP(name,value) \
8074 if (strcmp (propname, name) == 0) propvalue = value; else
8077 PROP ("Control", GBP_CONTROL)
8078 PROP ("Extend", GBP_EXTEND)
8079 PROP ("Prepend", GBP_PREPEND)
8080 PROP ("SpacingMark", GBP_SPACINGMARK)
8085 PROP ("LVT", GBP_LVT)
8088 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8089 graphemebreakproperty_filename, lineno);
/* Range sanity check that guards the array writes below.  */
8092 if (!(i1 <= i2 && i2 < 0x110000))
8095 for (i = i1; i <= i2; i++)
8096 unicode_org_gbp[i] = propvalue;
8099 if (ferror (stream) || fclose (stream))
8101 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8106 /* ========================================================================= */
8108 /* Composition and decomposition.
8109 Updated for Unicode TR #15 revision 33. */
8111 /* Maximum number of characters into which a single Unicode character can be decomposed. */
8113 #define MAX_DECOMP_LENGTH 18
/* Decomposition type tags.  get_decomposition starts from
   UC_DECOMP_CANONICAL, which is used when the decomposition field carries
   no "<tag>" prefix; each other constant corresponds to the tag quoted in
   its comment.
   NOTE(review): the enclosing enum declaration is not visible in this
   excerpt.  */
8117 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
8118 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
8119 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
8120 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
8121 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
8122 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
8123 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
8124 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
8125 UC_DECOMP_SUPER, /* <super> A superscript form. */
8126 UC_DECOMP_SUB, /* <sub> A subscript form. */
8127 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
8128 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
8129 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
8130 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
8131 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
8132 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
8133 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
8136 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8137 decompositions). Return the type, or -1 for none. */
/* CH is the code point to look up.  The decomposition field of
   unicode_attributes[ch] is parsed: an optional "<tag>" prefix selects the
   type (UC_DECOMP_CANONICAL when absent), and the hexadecimal code points
   that follow are stored in DECOMPOSED, at most MAX_DECOMP_LENGTH of them.
   NOTE(review): the return statements, the store through LENGTHP, several
   local declarations and the actions after each failed check are not
   visible in this excerpt.  */
8139 get_decomposition (unsigned int ch,
8140 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8142 const char *decomposition = unicode_attributes[ch].decomposition;
8144 if (decomposition != NULL && decomposition[0] != '\0')
8146 int type = UC_DECOMP_CANONICAL;
8147 unsigned int length;
8150 if (decomposition[0] == '<')
8155 rangle = strchr (decomposition + 1, '>');
/* typelen covers the tag including both angle brackets.  */
8158 typelen = rangle + 1 - decomposition;
/* TYPE compares the tag, brackets included, against one known spelling.  */
8159 #define TYPE(t1,t2) \
8160 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8163 TYPE ("<font>", UC_DECOMP_FONT)
8164 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8165 TYPE ("<initial>", UC_DECOMP_INITIAL)
8166 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8167 TYPE ("<final>", UC_DECOMP_FINAL)
8168 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8169 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8170 TYPE ("<super>", UC_DECOMP_SUPER)
8171 TYPE ("<sub>", UC_DECOMP_SUB)
8172 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8173 TYPE ("<wide>", UC_DECOMP_WIDE)
8174 TYPE ("<narrow>", UC_DECOMP_NARROW)
8175 TYPE ("<small>", UC_DECOMP_SMALL)
8176 TYPE ("<square>", UC_DECOMP_SQUARE)
8177 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8178 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* "%.*s" limits the output to the typelen characters of the tag.  The
   previous "%*s" only set a minimum field width and therefore printed the
   whole decomposition string.  */
8180 fprintf (stderr, "unknown decomposition type %.*s\n", (int)typelen, decomposition);
8184 decomposition = rangle + 1;
8185 if (decomposition[0] == ' ')
8188 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8190 decomposed[length] = strtoul (decomposition, &endptr, 16);
/* strtoul consumed nothing: tests for the end of the code point list.  */
8191 if (endptr == decomposition)
8193 decomposition = endptr;
8194 if (decomposition[0] == ' ')
/* Text left over after the loop means the list did not fit.  */
8197 if (*decomposition != '\0')
8198 /* MAX_DECOMP_LENGTH is too small. */
8208 /* Construction of sparse 3-level tables. */
/* Parameters of the decomp_table instance: elements are uint16_t and the
   default element is (uint16_t)(-1), i.e. all bits set.  xmalloc/xrealloc
   are mapped directly onto malloc/realloc, so these macros add no
   out-of-memory handling.
   NOTE(review): presumably consumed by a generic 3-level table template
   included right after this point; the include is not visible here.  */
8209 #define TABLE decomp_table
8210 #define ELEMENT uint16_t
8211 #define DEFAULT (uint16_t)(-1)
8212 #define xmalloc malloc
8213 #define xrealloc realloc
/* Prints the decomposition data as C source.  STREAM1 receives the
   declarations (extern gl_uninorm_decomp_chars_table, decomp_header_N
   macros, decomp_index_table_t typedef and extern declaration); STREAM2
   receives the definitions of gl_uninorm_decomp_chars_table and
   gl_uninorm_decomp_index_table.
   Index entry format (16 bits): bit 15 is 0 for a canonical decomposition
   and 1 otherwise, the low 15 bits are the offset into the chars table.
   Chars table format (3 bytes per decomposed character): bit 23 is set
   when another character follows, the decomposition type is stored at bit
   18 and up in the first character only, and the low 18 bits hold the code
   point.
   NOTE(review): the return type, some local declarations, the condition
   selecting which code points are emitted, the increment of offset and the
   actions after each failed range check are not visible in this excerpt.  */
8217 output_decomposition (FILE *stream1, FILE *stream2)
8219 struct decomp_table t;
8220 unsigned int level1_offset, level2_offset, level3_offset;
8221 unsigned int offset;
8227 decomp_table_init (&t);
8229 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8230 fprintf (stream1, "\n");
8231 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8234 for (ch = 0; ch < 0x110000; ch++)
8236 unsigned int length;
8237 unsigned int decomposed[MAX_DECOMP_LENGTH];
8238 int type = get_decomposition (ch, &length, decomposed);
/* The offset has to fit into the low 15 bits of an index entry.  */
8242 if (!(offset < (1 << 15)))
8244 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
8246 /* Produce length 3-bytes entries. */
8248 /* We would need a special representation of zero-length entries. */
8250 for (i = 0; i < length; i++)
8253 fprintf (stream2, ",");
8254 if ((offset % 4) == 0)
8255 fprintf (stream2, "\n ");
/* The code point has to fit into the low 18 bits of a 3-byte entry.  */
8256 if (!(decomposed[i] < (1 << 18)))
8258 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
8259 (((i+1 < length ? (1 << 23) : 0)
8260 | (i == 0 ? (type << 18) : 0)
8261 | decomposed[i]) >> 16) & 0xff,
8262 (decomposed[i] >> 8) & 0xff,
8263 decomposed[i] & 0xff);
8269 fprintf (stream2, "\n};\n");
8270 fprintf (stream2, "\n");
8272 decomp_table_finalize (&t);
/* Byte offsets into t.result: past a header of 5 uint32_t words, past
   level1 and past level2.  NOTE(review): the left-hand sides are not
   visible; presumably level1_offset, level2_offset and level3_offset in
   that order.  */
8275 5 * sizeof (uint32_t);
8277 5 * sizeof (uint32_t)
8278 + t.level1_size * sizeof (uint32_t);
8280 5 * sizeof (uint32_t)
8281 + t.level1_size * sizeof (uint32_t)
8282 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros.  */
8284 for (i = 0; i < 5; i++)
8285 fprintf (stream1, "#define decomp_header_%d %d\n", i,
8286 ((uint32_t *) t.result)[i]);
8287 fprintf (stream1, "\n");
8288 fprintf (stream1, "typedef struct\n");
8289 fprintf (stream1, " {\n");
8290 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8291 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8292 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
8293 fprintf (stream1, " }\n");
8294 fprintf (stream1, "decomp_index_table_t;\n");
8295 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8296 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8297 fprintf (stream2, "{\n");
8298 fprintf (stream2, " {");
/* Level 1: eight entries per output line, each printed as -1 or as a level2
   index; the selecting condition is not visible in this excerpt.  */
8299 if (t.level1_size > 8)
8300 fprintf (stream2, "\n ");
8301 for (i = 0; i < t.level1_size; i++)
8304 if (i > 0 && (i % 8) == 0)
8305 fprintf (stream2, "\n ");
8306 offset = ((uint32_t *) (t.result + level1_offset))[i];
8308 fprintf (stream2, " %5d", -1);
8310 fprintf (stream2, " %5zu",
8311 (offset - level2_offset) / sizeof (uint32_t));
8312 if (i+1 < t.level1_size)
8313 fprintf (stream2, ",");
8315 if (t.level1_size > 8)
8316 fprintf (stream2, "\n ");
8317 fprintf (stream2, " },\n");
8318 fprintf (stream2, " {");
/* Level 2: offsets are rebased to level3 and counted in uint16_t
   elements.  */
8319 if (t.level2_size << t.q > 8)
8320 fprintf (stream2, "\n ");
8321 for (i = 0; i < t.level2_size << t.q; i++)
8324 if (i > 0 && (i % 8) == 0)
8325 fprintf (stream2, "\n ");
8326 offset = ((uint32_t *) (t.result + level2_offset))[i];
8328 fprintf (stream2, " %5d", -1);
8330 fprintf (stream2, " %5zu",
8331 (offset - level3_offset) / sizeof (uint16_t));
8332 if (i+1 < t.level2_size << t.q)
8333 fprintf (stream2, ",");
8335 if (t.level2_size << t.q > 8)
8336 fprintf (stream2, "\n ");
8337 fprintf (stream2, " },\n");
8338 fprintf (stream2, " {");
/* Level 3: the default element (uint16_t)(-1) is printed as -1, every other
   value as its decimal number.  */
8339 if (t.level3_size << t.p > 8)
8340 fprintf (stream2, "\n ");
8341 for (i = 0; i < t.level3_size << t.p; i++)
8343 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
8344 if (i > 0 && (i % 8) == 0)
8345 fprintf (stream2, "\n ");
8346 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
8347 if (i+1 < t.level3_size << t.p)
8348 fprintf (stream2, ",");
8350 if (t.level3_size << t.p > 8)
8351 fprintf (stream2, "\n ");
8352 fprintf (stream2, " }\n");
8353 fprintf (stream2, "};\n");
/* Creates the two decomposition table files FILENAME1 and FILENAME2.  Both
   get the same three-line banner naming the Unicode VERSION (no licence
   text is written by the lines visible here); output_decomposition then
   writes the declarations to the first stream and the data to the second.
   NOTE(review): the argument line of the banner fprintf and the actions
   after each diagnostic are not visible in this excerpt.  */
8357 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
8359 const char *filenames[2];
8363 filenames[0] = filename1;
8364 filenames[1] = filename2;
/* Open both output files before anything is written.  */
8366 for (i = 0; i < 2; i++)
8368 streams[i] = fopen (filenames[i], "w");
8369 if (streams[i] == NULL)
8371 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
8376 for (i = 0; i < 2; i++)
8378 FILE *stream = streams[i];
8380 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8381 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
8382 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8384 fprintf (stream, "\n");
8387 output_decomposition (streams[0], streams[1]);
/* Check and close each file separately so the diagnostic names the file
   that failed.  */
8389 for (i = 0; i < 2; i++)
8391 if (ferror (streams[i]) || fclose (streams[i]))
8393 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
/* The "excluded from composition" property from the CompositionExclusions.txt
   file.  unicode_composition_exclusions[ch] is 1 if the code point ch is
   listed in that file, 0 otherwise.  */
char unicode_composition_exclusions[0x110000];

/* Fills unicode_composition_exclusions[] from the file
   COMPOSITIONEXCLUSIONS_FILENAME.
   Blank lines and lines whose first non-blank character is '#' are skipped.
   Every other line must start with a hexadecimal code point, which is then
   marked as excluded; whatever follows the number on the line is ignored.
   A file that cannot be opened or read, or a line that does not start with a
   hexadecimal number, is reported on stderr and ends the program with exit
   status 1.  A number that is not a code point (>= 0x110000) aborts.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  /* Room for 200 characters of a line, its newline, and the final NUL.  */
  char buf[200+1+1];
  FILE *stream;

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  memset (unicode_composition_exclusions, 0,
          sizeof (unicode_composition_exclusions));

  /* Read line by line.  Unlike fscanf with "%200[^\n]\n", fgets neither
     stops at an empty first line nor hands the tail of an overlong line back
     as if it were a line of its own.  */
  while (fgets (buf, sizeof (buf), stream) != NULL)
    {
      const char *line;
      unsigned int code;

      if (strchr (buf, '\n') == NULL)
        {
          /* The line did not fit into buf (or is an unterminated last
             line).  The code point, if any, is at the beginning, so the
             remainder can be dropped.  */
          int c;

          do
            c = getc (stream);
          while (c != EOF && c != '\n');
        }

      line = buf + strspn (buf, " \t\r\n\f\v");
      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      if (!(code < 0x110000))
        abort ();

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
/* Debugging aid: writes the canonical composition pairs to FILENAME as text.
   Every code point is run through get_decomposition; for a canonical
   decomposition that passes the filters below, one line with three
   hexadecimal numbers and the combining class of code2, separated by tabs,
   is written.
   A file that cannot be opened or written is reported on stderr.  */
8448 debug_output_composition_tables (const char *filename)
8453 stream = fopen (filename, "w");
8456 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Examine every code point of the Unicode code space.  */
8460 for (ch = 0; ch < 0x110000; ch++)
8462 unsigned int length;
8463 unsigned int decomposed[MAX_DECOMP_LENGTH];
8464 int type = get_decomposition (ch, &length, decomposed);
8466 if (type == UC_DECOMP_CANONICAL
8467 /* Consider only binary decompositions.
8468 Exclude singleton decompositions. */
/* code1 and code2 are the two parts of the decomposition; ch itself is the
   character they compose to.  */
8471 unsigned int code1 = decomposed[0];
8472 unsigned int code2 = decomposed[1];
8473 unsigned int combined = ch;
8475 /* Exclude decompositions where the first part is not a starter,
8476 i.e. is not of canonical combining class 0. */
8477 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8478 /* Exclude characters listed in CompositionExclusions.txt. */
8479 && !unicode_composition_exclusions[combined])
8481 /* The combined character must now also be a starter.
8483 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
8486 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
8490 unicode_attributes[code2].combining);
8495 if (ferror (stream) || fclose (stream))
8497 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes the canonical composition table for Unicode VERSION to FILENAME.
   The output is a gperf input file: a banner, a GPL notice, the gperf
   declarations, and after the %% separator one keyword line per composition
   pair.  The keyword is a 6-byte string: the three low-order bytes of code1
   followed by the three low-order bytes of code2, each group most
   significant byte first.
   The pairs are selected with the same filters as in
   debug_output_composition_tables.
   A file that cannot be opened or written is reported on stderr.  */
8503 output_composition_tables (const char *filename, const char *version)
8508 stream = fopen (filename, "w");
8511 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Banner naming the generator and the Unicode version.  */
8515 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8516 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
8517 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
8519 fprintf (stream, "\n");
8521 /* Put a GPL header on it. The gnulib module is under LGPL (although it
8522 still carries the GPL header), and it's gnulib-tool which replaces the
8523 GPL header with an LGPL header. */
8524 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
8525 fprintf (stream, "\n");
8526 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8527 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8528 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8529 fprintf (stream, " (at your option) any later version.\n");
8530 fprintf (stream, "\n");
8531 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8532 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8533 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8534 fprintf (stream, " GNU General Public License for more details.\n");
8535 fprintf (stream, "\n");
8536 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8537 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8538 fprintf (stream, "\n");
8540 /* The composition table is a set of mappings (code1, code2) -> combined,
8542 367 values for code1 (from 0x003C to 0x30FD),
8543 54 values for code2 (from 0x0300 to 0x309A).
8544 For a fixed code1, there are from 1 to 19 possible values for code2.
8545 For a fixed code2, there are from 1 to 117 possible values for code1.
8546 This is a very sparse matrix.
8548 We want an O(1) hash lookup.
8550 We could implement the hash lookup by mapping (code1, code2) to a linear
8551 combination mul1*code1 + mul2*code2, which is then used as an index into
8552 a 3-level table. But this leads to a table of size 37 KB.
8554 We use gperf to implement the hash lookup, giving it the 928 sets of
8555 4 bytes (code1, code2) as input. gperf generates a hash table of size
8556 1527, which is quite good (60% filled). It requires an auxiliary table
8557 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
/* Declarations section of the gperf input: the keyword slot is the 6-byte
   codes member, and the generated functions get gl_uninorm_compose_ names.  */
8559 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
8560 fprintf (stream, "%%struct-type\n");
8561 fprintf (stream, "%%language=ANSI-C\n");
8562 fprintf (stream, "%%define slot-name codes\n");
8563 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
8564 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
8565 fprintf (stream, "%%compare-lengths\n");
8566 fprintf (stream, "%%compare-strncmp\n");
8567 fprintf (stream, "%%readonly-tables\n");
8568 fprintf (stream, "%%omit-struct-type\n");
8569 fprintf (stream, "%%%%\n");
/* Keywords section: examine every code point of the Unicode code space.  */
8571 for (ch = 0; ch < 0x110000; ch++)
8573 unsigned int length;
8574 unsigned int decomposed[MAX_DECOMP_LENGTH];
8575 int type = get_decomposition (ch, &length, decomposed);
8577 if (type == UC_DECOMP_CANONICAL
8578 /* Consider only binary decompositions.
8579 Exclude singleton decompositions. */
8582 unsigned int code1 = decomposed[0];
8583 unsigned int code2 = decomposed[1];
8584 unsigned int combined = ch;
8586 /* Exclude decompositions where the first part is not a starter,
8587 i.e. is not of canonical combining class 0. */
8588 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8589 /* Exclude characters listed in CompositionExclusions.txt. */
8590 && !unicode_composition_exclusions[combined])
8592 /* The combined character must now also be a starter.
8594 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
8597 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
8598 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
8599 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
8605 if (ferror (stream) || fclose (stream))
8607 fprintf (stderr, "error writing to '%s'\n", filename);
8612 /* ========================================================================= */
8614 /* Output the test for a simple character mapping table to the given file. */
/* FILENAME is the C test source to create.  FUNCTION_NAME is the name of the
   function under test; it becomes the expansion of the MAP macro in the
   generated file.  FUNC is the reference mapping, evaluated here for each
   code point.  VERSION is the Unicode version named in the banner.
   The generated file consists of a banner with a GPL notice, an include of
   test-mapping-part1.h, table entries of the form { 0xCODE, 0xVALUE }
   separated by commas, the MAP definition, and an include of
   test-mapping-part2.h.
   A file that cannot be opened or written is reported on stderr.  */
8617 output_simple_mapping_test (const char *filename,
8618 const char *function_name,
8619 unsigned int (*func) (unsigned int),
8620 const char *version)
8626 stream = fopen (filename, "w");
8629 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8633 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8634 fprintf (stream, "/* Test the Unicode character mapping functions.\n");
8635 fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
8636 fprintf (stream, "\n");
8637 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8638 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8639 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8640 fprintf (stream, " (at your option) any later version.\n");
8641 fprintf (stream, "\n");
8642 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8643 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8644 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8645 fprintf (stream, " GNU General Public License for more details.\n");
8646 fprintf (stream, "\n");
8647 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8648 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8649 fprintf (stream, "\n");
8650 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
8652 fprintf (stream, "\n");
8653 fprintf (stream, "#include \"test-mapping-part1.h\"\n");
8654 fprintf (stream, "\n");
/* Table entries.  NOTE(review): the condition that selects which code points
   get an entry is on a line not shown in this excerpt.  */
8657 for (ch = 0; ch < 0x110000; ch++)
8659 unsigned int value = func (ch);
/* Entries are separated by a comma and a newline.  */
8664 fprintf (stream, ",\n");
8665 fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
8670 fprintf (stream, "\n");
8672 fprintf (stream, "\n");
/* Bind the generic test code in part 2 to the function under test.  */
8673 fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
8674 fprintf (stream, "#include \"test-mapping-part2.h\"\n");
8676 if (ferror (stream) || fclose (stream))
8678 fprintf (stderr, "error writing to '%s'\n", filename);
8683 /* Construction of sparse 3-level tables. */
/* The table type is named mapping_table and its elements are int32_t.
   Presumably these macros parameterize a template header included right
   after them (the include is not visible in this excerpt).  */
8684 #define TABLE mapping_table
8685 #define ELEMENT int32_t
/* Allocation maps directly to malloc and realloc.
   NOTE(review): these aliases add no out-of-memory check.  */
8687 #define xmalloc malloc
8688 #define xrealloc realloc
8691 /* Output a simple character mapping table to the given file. */
/* FUNC is evaluated for each code point ch, and the difference
   (int) FUNC (ch) - (int) ch is passed to mapping_table_add.  After
   mapping_table_finalize, the table is written to FILENAME as five
   mapping_header_N macros followed by a static const struct named u_mapping
   with the arrays level1, level2 and level3.  VERSION is the Unicode version
   named in the banner.
   A file that cannot be opened or written is reported on stderr.  */
8694 output_simple_mapping (const char *filename,
8695 unsigned int (*func) (unsigned int),
8696 const char *version)
8700 struct mapping_table t;
8701 unsigned int level1_offset, level2_offset, level3_offset;
8703 stream = fopen (filename, "w");
8706 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8710 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8711 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
8712 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
/* Build the 3-level table of differences between mapped value and code
   point.  */
8717 mapping_table_init (&t);
8719 for (ch = 0; ch < 0x110000; ch++)
8721 int value = (int) func (ch) - (int) ch;
8723 mapping_table_add (&t, ch, value);
8726 mapping_table_finalize (&t);
/* t.result starts with 5 uint32_t header words, followed by the level 1
   entries, then the level 2 entries, then the level 3 entries; the three sums
   below are the byte offsets of these parts.  */
8728 /* Offsets in t.result, in memory of this process. */
8730 5 * sizeof (uint32_t);
8732 5 * sizeof (uint32_t)
8733 + t.level1_size * sizeof (uint32_t);
8735 5 * sizeof (uint32_t)
8736 + t.level1_size * sizeof (uint32_t)
8737 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become preprocessor macros.
   NOTE(review): %d is used here for i and for a uint32_t value; confirm that
   the conversions match the argument types.  */
8739 for (i = 0; i < 5; i++)
8740 fprintf (stream, "#define mapping_header_%d %d\n", i,
8741 ((uint32_t *) t.result)[i]);
8742 fprintf (stream, "static const\n");
8743 fprintf (stream, "struct\n");
8744 fprintf (stream, " {\n");
8745 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8746 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8747 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
8748 fprintf (stream, " }\n");
8749 fprintf (stream, "u_mapping =\n");
8750 fprintf (stream, "{\n");
/* Level 1 initializer, 8 entries per output line.  An entry is printed either
   as -1 or as an index into level2, obtained from the stored byte offset; the
   condition choosing between the two is on a line not shown here.  */
8751 fprintf (stream, " {");
8752 if (t.level1_size > 8)
8753 fprintf (stream, "\n ");
8754 for (i = 0; i < t.level1_size; i++)
8757 if (i > 0 && (i % 8) == 0)
8758 fprintf (stream, "\n ");
8759 offset = ((uint32_t *) (t.result + level1_offset))[i];
8761 fprintf (stream, " %5d", -1);
8763 fprintf (stream, " %5zu",
8764 (offset - level2_offset) / sizeof (uint32_t));
8765 if (i+1 < t.level1_size)
8766 fprintf (stream, ",");
8768 if (t.level1_size > 8)
8769 fprintf (stream, "\n ");
8770 fprintf (stream, " },\n");
/* Level 2 initializer, same layout; the printed indices refer to level3 and
   are counted in int32_t elements.  */
8771 fprintf (stream, " {");
8772 if (t.level2_size << t.q > 8)
8773 fprintf (stream, "\n ");
8774 for (i = 0; i < t.level2_size << t.q; i++)
8777 if (i > 0 && (i % 8) == 0)
8778 fprintf (stream, "\n ");
8779 offset = ((uint32_t *) (t.result + level2_offset))[i];
8781 fprintf (stream, " %5d", -1);
8783 fprintf (stream, " %5zu",
8784 (offset - level3_offset) / sizeof (int32_t));
8785 if (i+1 < t.level2_size << t.q)
8786 fprintf (stream, ",");
8788 if (t.level2_size << t.q > 8)
8789 fprintf (stream, "\n ");
8790 fprintf (stream, " },\n");
/* Level 3 initializer: the stored int32_t values themselves.  */
8791 fprintf (stream, " {");
8792 if (t.level3_size << t.p > 8)
8793 fprintf (stream, "\n ");
8794 for (i = 0; i < t.level3_size << t.p; i++)
8796 if (i > 0 && (i % 8) == 0)
8797 fprintf (stream, "\n ");
8798 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
8799 if (i+1 < t.level3_size << t.p)
8800 fprintf (stream, ",");
8802 if (t.level3_size << t.p > 8)
8803 fprintf (stream, "\n ");
8804 fprintf (stream, " }\n");
8805 fprintf (stream, "};\n");
8807 if (ferror (stream) || fclose (stream))
8809 fprintf (stderr, "error writing to '%s'\n", filename);
8814 /* ========================================================================= */
8816 /* A special casing context.
8817 A context is negated through x -> -x. */
/* fill_casing_rules stores the negated value for a context word that carries
   the Not_ prefix in the special casing data.  */
8822 SCC_AFTER_SOFT_DOTTED,
8828 /* A special casing rule. */
8829 struct special_casing_rule
/* Each mapping holds up to 3 code points; fill_casing_rules zeroes the
   elements before parsing, so unused trailing elements are 0.  */
8832 unsigned int lower_mapping[3];
8833 unsigned int title_mapping[3];
8834 unsigned int upper_mapping[3];
/* Not set by fill_casing_rules; filled in by
   redistribute_casefolding_rules.  */
8835 unsigned int casefold_mapping[3];
/* NULL, or a language code; output_casing_rules checks that its length
   is 2.  */
8836 const char *language;
/* The special casing rules.
   casing_rules[0..num_casing_rules-1] are in use; the array has room for
   allocated_casing_rules pointers.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules[], growing the array when it is full.
   The pointer itself is stored; the rule is not copied.
   Ends the program with exit status 1 if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      /* Double the capacity, starting with room for 16 rules.  The globals
         are updated only once the allocation has succeeded.  */
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "memory exhausted\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
8860 /* Stores in casing_rules the special casing rules found in
8861 specialcasing_filename. */
/* The rule list is reset first.  Lines that are empty or start with # are
   skipped.  A data line consists of fields that each end with a semicolon:
   a hexadecimal code, then the lower, title and upper mappings (each up to 3
   hexadecimal code points separated by spaces), then optionally a 2-letter
   language and a context word.  The context word may carry the prefix Not_,
   in which case the negated context value is stored.
   Each parsed line becomes one malloc-ed rule that is handed to
   add_casing_rule; its casefold_mapping is not set here.
   A file that cannot be opened or read, a malformed line, or an unknown
   context word is reported on stderr.
   NOTE(review): the results of the malloc calls below are used without a
   NULL check.  */
8863 fill_casing_rules (const char *specialcasing_filename)
8867 stream = fopen (specialcasing_filename, "r");
8870 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
8874 casing_rules = NULL;
8875 num_casing_rules = 0;
8876 allocated_casing_rules = 0;
8886 unsigned int lower_mapping[3];
8887 unsigned int title_mapping[3];
8888 unsigned int upper_mapping[3];
/* Read one line of at most 200 characters; the trailing \n directive also
   consumes the white space that follows it.  */
8892 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8895 if (buf[0] == '\0' || buf[0] == '#')
/* Scan the code; endptr == scanptr means that no hexadecimal digits were
   found.  */
8900 code = strtoul (scanptr, &endptr, 16);
8901 if (endptr == scanptr)
8903 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8907 if (*scanptr != ';')
8909 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8914 /* Scan lower mapping. */
8915 for (i = 0; i < 3; i++)
8916 lower_mapping[i] = 0;
8917 for (i = 0; i < 3; i++)
8919 while (*scanptr == ' ')
8921 if (*scanptr == ';')
8923 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
8924 if (endptr == scanptr)
8926 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8931 if (*scanptr != ';')
8933 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8938 /* Scan title mapping. */
8939 for (i = 0; i < 3; i++)
8940 title_mapping[i] = 0;
8941 for (i = 0; i < 3; i++)
8943 while (*scanptr == ' ')
8945 if (*scanptr == ';')
8947 title_mapping[i] = strtoul (scanptr, &endptr, 16);
8948 if (endptr == scanptr)
8950 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8955 if (*scanptr != ';')
8957 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8962 /* Scan upper mapping. */
8963 for (i = 0; i < 3; i++)
8964 upper_mapping[i] = 0;
8965 for (i = 0; i < 3; i++)
8967 while (*scanptr == ' ')
8969 if (*scanptr == ';')
8971 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
8972 if (endptr == scanptr)
8974 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8979 if (*scanptr != ';')
8981 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8986 /* Scan language and context. */
8988 context = SCC_ALWAYS;
8989 while (*scanptr == ' ')
8991 if (*scanptr != '\0' && *scanptr != '#')
/* The first word ends at a space, a semicolon, a # or the end of the
   line.  */
8993 const char *word_begin = scanptr;
8994 const char *word_end;
8996 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9000 while (*scanptr == ' ')
/* A word of exactly 2 characters is taken as the language; the context, if
   any, is then the word that follows it.  */
9003 if (word_end - word_begin == 2)
9005 language = (char *) malloc ((word_end - word_begin) + 1);
9006 memcpy (language, word_begin, 2);
9007 language[word_end - word_begin] = '\0';
9008 word_begin = word_end = NULL;
9010 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9012 word_begin = scanptr;
9013 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
/* Translate the context word, if there is one, into an SCC_ value.  */
9019 if (word_end > word_begin)
9021 bool negate = false;
9023 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9028 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9029 context = SCC_FINAL_SIGMA;
9030 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9031 context = SCC_AFTER_SOFT_DOTTED;
9032 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9033 context = SCC_MORE_ABOVE;
9034 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9035 context = SCC_BEFORE_DOT;
9036 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9037 context = SCC_AFTER_I;
9040 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9044 context = - context;
/* Only a semicolon, a # comment or the end of the line may follow.  */
9047 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9049 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9054 /* Store the rule. */
9056 struct special_casing_rule *new_rule =
9057 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9058 new_rule->code = code;
9059 new_rule->language = language;
9060 new_rule->context = context;
9061 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9062 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9063 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9065 add_casing_rule (new_rule);
9069 if (ferror (stream) || fclose (stream))
9071 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9076 /* A casefolding rule. */
9077 struct casefold_rule
/* Up to 3 code points.  redistribute_casefolding_rules treats a rule with
   mapping[1] == 0 as mapping to a single character.  */
9080 unsigned int mapping[3];
/* NULL for a rule that applies to all languages; otherwise a language code
   such as tr or az (see fill_casefolding_rules).  */
9081 const char *language;
9084 /* The casefolding rules. */
/* casefolding_rules[0..num_casefolding_rules-1] are in use; the array has
   room for allocated_casefolding_rules pointers.  */
9085 struct casefold_rule **casefolding_rules;
9086 unsigned int num_casefolding_rules;
9087 unsigned int allocated_casefolding_rules;
9089 /* Stores in casefolding_rules the case folding rules found in
9090 casefolding_filename. */
/* The rule list is reset first.  Lines that are empty or start with # are
   skipped.  A data line consists of fields that each end with a semicolon:
   a hexadecimal code, a type letter (C, F, S or T), and a mapping of up to 3
   hexadecimal code points separated by spaces.
   Rules of type S are ignored.  A rule of type T is stored once for each of
   the languages tr and az; any other rule is stored once with language NULL.
   A file that cannot be opened or read, or a malformed line, is reported on
   stderr.
   NOTE(review): the results of malloc and realloc below are used without a
   NULL check.  */
9092 fill_casefolding_rules (const char *casefolding_filename)
9096 stream = fopen (casefolding_filename, "r");
9099 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
9103 casefolding_rules = NULL;
9104 num_casefolding_rules = 0;
9105 allocated_casefolding_rules = 0;
9116 unsigned int mapping[3];
/* Read one line of at most 200 characters; the trailing \n directive also
   consumes the white space that follows it.  */
9118 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9121 if (buf[0] == '\0' || buf[0] == '#')
/* Scan the code; endptr == scanptr means that no hexadecimal digits were
   found.  */
9126 code = strtoul (scanptr, &endptr, 16);
9127 if (endptr == scanptr)
9129 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9133 if (*scanptr != ';')
9135 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
/* Scan the type letter.  */
9141 while (*scanptr == ' ')
9146 case 'C': case 'F': case 'S': case 'T':
9150 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9154 if (*scanptr != ';')
9156 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9161 /* Scan casefold mapping. */
9162 for (i = 0; i < 3; i++)
9164 for (i = 0; i < 3; i++)
9166 while (*scanptr == ' ')
9168 if (*scanptr == ';')
9170 mapping[i] = strtoul (scanptr, &endptr, 16);
9171 if (endptr == scanptr)
9173 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9178 if (*scanptr != ';')
9180 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9185 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
9188 const char * const *languages;
9189 unsigned int languages_count;
9191 /* Type 'T' indicates that the rule is applicable to Turkish
9195 static const char * const turkish_languages[] = { "tr", "az" };
9196 languages = turkish_languages;
9197 languages_count = 2;
9201 static const char * const all_languages[] = { NULL };
9202 languages = all_languages;
9203 languages_count = 1;
9206 for (i = 0; i < languages_count; i++)
9208 /* Store a new rule. */
9209 struct casefold_rule *new_rule =
9210 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
9211 new_rule->code = code;
9212 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
9213 new_rule->language = languages[i];
/* Grow the array when it is full: the capacity doubles, starting with room
   for 16 rules.  */
9215 if (num_casefolding_rules == allocated_casefolding_rules)
9217 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
9218 if (allocated_casefolding_rules < 16)
9219 allocated_casefolding_rules = 16;
9221 (struct casefold_rule **)
9222 realloc (casefolding_rules,
9223 allocated_casefolding_rules * sizeof (struct casefold_rule *));
9225 casefolding_rules[num_casefolding_rules++] = new_rule;
9230 if (ferror (stream) || fclose (stream))
9232 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Returns the single-character casefold mapping of CH, as recorded in
   unicode_casefold[].
   CH must be a code point, i.e. less than 0x110000.  Anything else is a
   caller error and aborts, instead of reading beyond the end of the table;
   rule codes reach this function directly from the parsed data files.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < 0x110000))
    abort ();
  return unicode_casefold[ch];
}
9246 /* Redistribute the casefolding_rules:
9247 - Rules that map to a single character, language independently, are stored
9248 in unicode_casefold.
9249 - Other rules are merged into casing_rules. */
/* Works in three passes: unicode_casefold[] starts as the identity mapping
   and receives the simple rules; then every existing special casing rule gets
   a single-character casefold_mapping from to_casefold; finally each
   remaining casefolding rule overrides the casefold_mapping of the matching
   special casing rules, or is added as a new rule with context SCC_ALWAYS
   when no exactly matching rule exists.
   NOTE(review): the result of the malloc call below is used without a NULL
   check.  */
9251 redistribute_casefolding_rules (void)
9253 unsigned int ch, i, j;
9255 /* Fill unicode_casefold[]. */
9256 for (ch = 0; ch < 0x110000; ch++)
9257 unicode_casefold[ch] = ch;
9258 for (i = 0; i < num_casefolding_rules; i++)
9260 struct casefold_rule *cfrule = casefolding_rules[i];
9262 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
9265 if (!(ch < 0x110000))
9267 unicode_casefold[ch] = cfrule->mapping[0];
9271 /* Extend the special casing rules by filling in their casefold_mapping[]
9273 for (j = 0; j < num_casing_rules; j++)
9275 struct special_casing_rule *rule = casing_rules[j];
9278 rule->casefold_mapping[0] = to_casefold (rule->code);
9279 for (k = 1; k < 3; k++)
9280 rule->casefold_mapping[k] = 0;
9283 /* Now merge the other casefolding rules into casing_rules. */
9284 for (i = 0; i < num_casefolding_rules; i++)
9286 struct casefold_rule *cfrule = casefolding_rules[i];
/* This is the complement of the condition used when filling
   unicode_casefold[] above.  */
9288 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
9290 /* Find a rule that applies to the same code, same language, and it
9291 has context SCC_ALWAYS. At the same time, update all rules that
9292 have the same code and same or more specific language. */
9293 struct special_casing_rule *found_rule = NULL;
9295 for (j = 0; j < num_casing_rules; j++)
9297 struct special_casing_rule *rule = casing_rules[j];
/* A casefolding rule without language applies to every rule for the code;
   one with a language only to rules for that same language.  */
9299 if (rule->code == cfrule->code
9300 && (cfrule->language == NULL
9301 || (rule->language != NULL
9302 && strcmp (rule->language, cfrule->language) == 0)))
9304 memcpy (rule->casefold_mapping, cfrule->mapping,
9305 sizeof (rule->casefold_mapping));
/* Exact match: identical language (both NULL or equal strings) and context
   SCC_ALWAYS.  */
9307 if ((cfrule->language == NULL
9308 ? rule->language == NULL
9309 : rule->language != NULL
9310 && strcmp (rule->language, cfrule->language) == 0)
9311 && rule->context == SCC_ALWAYS)
9319 if (found_rule == NULL)
9321 /* Create a new rule. */
9322 struct special_casing_rule *new_rule =
9323 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9325 /* Try to find a rule that applies to the same code, no language
9326 restriction, and with context SCC_ALWAYS. */
9327 for (j = 0; j < num_casing_rules; j++)
9329 struct special_casing_rule *rule = casing_rules[j];
9331 if (rule->code == cfrule->code
9332 && rule->context == SCC_ALWAYS
9333 && rule->language == NULL)
9341 new_rule->code = cfrule->code;
9342 new_rule->language = cfrule->language;
9343 new_rule->context = SCC_ALWAYS;
/* The lower, title and upper mappings of the new rule are copied from the
   language independent rule when one was found.  */
9344 if (found_rule != NULL)
9346 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
9347 sizeof (new_rule->lower_mapping));
9348 memcpy (new_rule->title_mapping, found_rule->title_mapping,
9349 sizeof (new_rule->title_mapping));
9350 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
9351 sizeof (new_rule->upper_mapping));
/* Otherwise they are the single-character mappings from to_lower, to_title
   and to_upper, padded with zeros.  */
9357 new_rule->lower_mapping[0] = to_lower (cfrule->code);
9358 for (k = 1; k < 3; k++)
9359 new_rule->lower_mapping[k] = 0;
9360 new_rule->title_mapping[0] = to_title (cfrule->code);
9361 for (k = 1; k < 3; k++)
9362 new_rule->title_mapping[k] = 0;
9363 new_rule->upper_mapping[0] = to_upper (cfrule->code);
9364 for (k = 1; k < 3; k++)
9365 new_rule->upper_mapping[k] = 0;
9367 memcpy (new_rule->casefold_mapping, cfrule->mapping,
9368 sizeof (new_rule->casefold_mapping));
9370 add_casing_rule (new_rule);
9377 compare_casing_rules (const void *a, const void *b)
9379 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
9380 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
9381 unsigned int a_code = a_rule->code;
9382 unsigned int b_code = b_rule->code;
9384 if (a_code < b_code)
9386 if (a_code > b_code)
9389 /* Sort the more specific rules before the more general ones. */
9390 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
9391 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
9395 sort_casing_rules (void)
9397 /* Sort the rules 1. by code, 2. by specificity. */
9398 if (num_casing_rules > 1)
9399 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
9400 compare_casing_rules);
9403 /* Output the special casing rules. */
/* Writes casing_rules[] for Unicode VERSION to FILENAME as a gperf input
   file.  After the banner and the gperf declarations, each rule becomes one
   line consisting of
     - a 3-byte keyword: high byte of the code, low byte of the code, and the
       value of minor,
     - 1 if the next rule has the same code, else 0,
     - the context as an SCC_ name, preceded by a minus sign when negated,
     - the language as two characters, or two NUL characters when NULL,
     - the upper, lower, title and casefold mappings, 3 elements each.
   The code and every mapping element must be less than 0x10000; a violation
   is reported on stderr.
   A file that cannot be opened or written is reported on stderr.  */
9405 output_casing_rules (const char *filename, const char *version)
9411 stream = fopen (filename, "w");
9414 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9418 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9419 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
9420 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Declarations section of the gperf input: the keyword slot is the 3-byte
   code member, and the generated functions get gl_unicase_special_ names.  */
9422 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
9423 fprintf (stream, "%%struct-type\n");
9424 fprintf (stream, "%%language=ANSI-C\n");
9425 fprintf (stream, "%%define slot-name code\n");
9426 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
9427 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
9428 fprintf (stream, "%%compare-lengths\n");
9429 fprintf (stream, "%%compare-strncmp\n");
9430 fprintf (stream, "%%readonly-tables\n");
9431 fprintf (stream, "%%omit-struct-type\n");
9432 fprintf (stream, "%%%%\n");
9435 for (i = 0; i < num_casing_rules; i++)
9437 struct special_casing_rule *rule = casing_rules[i];
/* Rules with the same code are adjacent; presumably minor counts them so
   that their keywords differ - the update of minor is on lines not shown
   here.  */
9440 if (i > 0 && rule->code == casing_rules[i - 1]->code)
/* The keyword has room for only two bytes of the code.  */
9445 if (!(rule->code < 0x10000))
9447 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
9451 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
9452 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
/* Flag telling whether the next rule has the same code.  */
9454 fprintf (stream, "%d, ",
9455 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
/* Context: a sign, then the SCC_ name padded to a common width.  */
9457 context = rule->context;
9460 fprintf (stream, "-");
9461 context = - context;
9464 fprintf (stream, " ");
9468 fprintf (stream, "SCC_ALWAYS ");
9470 case SCC_FINAL_SIGMA:
9471 fprintf (stream, "SCC_FINAL_SIGMA ");
9473 case SCC_AFTER_SOFT_DOTTED:
9474 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
9476 case SCC_MORE_ABOVE:
9477 fprintf (stream, "SCC_MORE_ABOVE ");
9479 case SCC_BEFORE_DOT:
9480 fprintf (stream, "SCC_BEFORE_DOT ");
9483 fprintf (stream, "SCC_AFTER_I ");
9488 fprintf (stream, ", ");
/* Language: exactly two characters, or two NUL characters.  */
9490 if (rule->language != NULL)
9492 if (strlen (rule->language) != 2)
9494 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
9497 fprintf (stream, "{ '\\0', '\\0' }, ");
/* The four mappings follow in the order upper, lower, title, casefold.  A
   nonzero element is printed as 0x followed by four hexadecimal digits, a
   zero element as a plain 0.  */
9499 fprintf (stream, "{ ");
9500 for (j = 0; j < 3; j++)
9503 fprintf (stream, ", ");
9504 if (!(rule->upper_mapping[j] < 0x10000))
9506 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
9509 if (rule->upper_mapping[j] != 0)
9510 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
9512 fprintf (stream, " 0");
9514 fprintf (stream, " }, { ");
9515 for (j = 0; j < 3; j++)
9518 fprintf (stream, ", ");
9519 if (!(rule->lower_mapping[j] < 0x10000))
9521 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
9524 if (rule->lower_mapping[j] != 0)
9525 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
9527 fprintf (stream, " 0");
9529 fprintf (stream, " }, { ");
9530 for (j = 0; j < 3; j++)
9533 fprintf (stream, ", ");
9534 if (!(rule->title_mapping[j] < 0x10000))
9536 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
9539 if (rule->title_mapping[j] != 0)
9540 fprintf (stream, "0x%04X", rule->title_mapping[j]);
9542 fprintf (stream, " 0");
9544 fprintf (stream, " }, { ");
9545 for (j = 0; j < 3; j++)
9548 fprintf (stream, ", ");
9549 if (!(rule->casefold_mapping[j] < 0x10000))
9551 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
9554 if (rule->casefold_mapping[j] != 0)
9555 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
9557 fprintf (stream, " 0");
9559 fprintf (stream, " }\n");
9562 if (ferror (stream) || fclose (stream))
9564 fprintf (stderr, "error writing to '%s'\n", filename);
9569 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
/* Returns true if CH is "cased" in the sense quoted above.  The three tests
   are tried in the order lowercase, uppercase, titlecase, and the first one
   that succeeds decides.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
9583 /* Quoting the Unicode standard:
9584 Definition: A character is defined to be "case-ignorable" if it has the
9585 value MidLetter {or the value MidNumLet} for the Word_Break property or
9586 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
9587 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
9588 The text marked in braces was added in Unicode 5.1.0, see
9589 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
9590 Definition of case-ignorable". */
9591 /* Since this predicate is only used for the "Before C" and "After C"
9592 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
9593 This simplifies the evaluation of the regular expressions
9594 \p{cased} (\p{case-ignorable})* C
9596 C (\p{case-ignorable})* \p{cased}
9599 is_case_ignorable (unsigned int ch)
9601 return (unicode_org_wbp[ch] == WBP_MIDLETTER
9602 || unicode_org_wbp[ch] == WBP_MIDNUMLET
9603 || is_category_Mn (ch)
9604 || is_category_Me (ch)
9605 || is_category_Cf (ch)
9606 || is_category_Lm (ch)
9607 || is_category_Sk (ch))
9611 /* ------------------------------------------------------------------------- */
9613 /* Output all case related properties. */
9615 output_casing_properties (const char *version)
9617 #define PROPERTY(FN,P) \
9618 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
9619 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
9620 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
9621 PROPERTY(cased, cased)
9622 PROPERTY(ignorable, case_ignorable)
9626 /* ========================================================================= */
9629 main (int argc, char * argv[])
9631 const char *unicodedata_filename;
9632 const char *proplist_filename;
9633 const char *derivedproplist_filename;
9634 const char *arabicshaping_filename;
9635 const char *scripts_filename;
9636 const char *blocks_filename;
9637 const char *proplist30_filename;
9638 const char *eastasianwidth_filename;
9639 const char *linebreak_filename;
9640 const char *wordbreakproperty_filename;
9641 const char *graphemebreakproperty_filename;
9642 const char *compositionexclusions_filename;
9643 const char *specialcasing_filename;
9644 const char *casefolding_filename;
9645 const char *version;
9649 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
9654 unicodedata_filename = argv[1];
9655 proplist_filename = argv[2];
9656 derivedproplist_filename = argv[3];
9657 arabicshaping_filename = argv[4];
9658 scripts_filename = argv[5];
9659 blocks_filename = argv[6];
9660 proplist30_filename = argv[7];
9661 eastasianwidth_filename = argv[8];
9662 linebreak_filename = argv[9];
9663 wordbreakproperty_filename = argv[10];
9664 graphemebreakproperty_filename = argv[11];
9665 compositionexclusions_filename = argv[12];
9666 specialcasing_filename = argv[13];
9667 casefolding_filename = argv[14];
9670 fill_attributes (unicodedata_filename);
9671 clear_properties ();
9672 fill_properties (proplist_filename);
9673 fill_properties (derivedproplist_filename);
9674 fill_properties30 (proplist30_filename);
9675 fill_arabicshaping (arabicshaping_filename);
9676 fill_scripts (scripts_filename);
9677 fill_blocks (blocks_filename);
9678 fill_width (eastasianwidth_filename);
9679 fill_org_lbp (linebreak_filename);
9680 fill_org_wbp (wordbreakproperty_filename);
9681 fill_org_gbp (graphemebreakproperty_filename);
9682 fill_composition_exclusions (compositionexclusions_filename);
9683 fill_casing_rules (specialcasing_filename);
9684 fill_casefolding_rules (casefolding_filename);
9685 redistribute_casefolding_rules ();
9686 sort_casing_rules ();
9688 output_categories (version);
9689 output_category ("unictype/categ_of.h", version);
9690 output_combclass ("unictype/combining.h", version);
9691 output_bidi_category ("unictype/bidi_of.h", version);
9692 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
9693 output_decimal_digit ("unictype/decdigit.h", version);
9694 output_digit_test ("../tests/unictype/test-digit.h", version);
9695 output_digit ("unictype/digit.h", version);
9696 output_numeric_test ("../tests/unictype/test-numeric.h", version);
9697 output_numeric ("unictype/numeric.h", version);
9698 output_mirror ("unictype/mirror.h", version);
9699 output_properties (version);
9700 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
9701 output_joining_type ("unictype/joiningtype_of.h", version);
9702 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
9703 output_joining_group ("unictype/joininggroup_of.h", version);
9705 output_scripts (version);
9706 output_scripts_byname (version);
9707 output_blocks (version);
9708 output_ident_properties (version);
9709 output_nonspacing_property ("uniwidth/width.c.part");
9710 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
9711 output_old_ctype (version);
9713 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
9714 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
9715 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
9717 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
9718 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
9719 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
9721 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
9722 output_gbp_table ("unigbrk/gbrkprop.h", version);
9724 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
9725 debug_output_composition_tables ("uninorm/composition.txt");
9726 output_composition_tables ("uninorm/composition-table.gperf", version);
9728 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
9729 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
9730 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
9731 output_simple_mapping ("unicase/toupper.h", to_upper, version);
9732 output_simple_mapping ("unicase/tolower.h", to_lower, version);
9733 output_simple_mapping ("unicase/totitle.h", to_title, version);
9734 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
9735 output_casing_rules ("unicase/special-casing-table.gperf", version);
9736 output_casing_properties (version);
/*
 * For Emacs M-x compile
 * Local Variables:
 * compile-command: "
   gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
   ./gen-uni-tables \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
        6.0.0 \
   && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
   && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
   "
 * End:
 */