1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2013 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/ArabicShaping.txt \
25 /usr/local/share/Unidata/Scripts.txt \
26 /usr/local/share/Unidata/Blocks.txt \
27 /usr/local/share/Unidata/PropList-3.0.1.txt \
28 /usr/local/share/Unidata/EastAsianWidth.txt \
29 /usr/local/share/Unidata/LineBreak.txt \
30 /usr/local/share/Unidata/WordBreakProperty.txt \
31 /usr/local/share/Unidata/GraphemeBreakProperty.txt \
32 /usr/local/share/Unidata/CompositionExclusions.txt \
33 /usr/local/share/Unidata/SpecialCasing.txt \
34 /usr/local/share/Unidata/CaseFolding.txt \
45 /* ========================================================================= */
47 /* Reading UnicodeData.txt. */
50 /* This structure represents one line in the UnicodeData.txt file. */
/* The string members hold the textual fields of the line; the three case
   mappings are stored as numbers. fill_attribute is the function that
   populates these members. A NULL name marks a code point for which no
   line was stored.
   NOTE(review): the braces of this declaration are not visible in this
   excerpt. */
51 struct unicode_attribute
53 const char *name; /* Character name */
54 const char *category; /* General category */
55 const char *combining; /* Canonical combining class */
56 const char *bidi; /* Bidirectional category */
57 const char *decomposition; /* Character decomposition mapping */
58 const char *decdigit; /* Decimal digit value */
59 const char *digit; /* Digit value */
60 const char *numeric; /* Numeric value */
/* Set by fill_attribute to true when the corresponding field starts with 'Y'. */
61 bool mirrored; /* mirrored */
62 const char *oldname; /* Old Unicode 1.0 name */
63 const char *comment; /* Comment */
/* The case mappings are parsed from hexadecimal text by fill_attribute;
   NONE stands for an empty field. */
64 unsigned int upper; /* Uppercase mapping */
65 unsigned int lower; /* Lowercase mapping */
66 unsigned int title; /* Titlecase mapping */
69 /* Missing fields are represented with "" for strings, and NONE for integers. */
/* Marker for an absent integer field: an unsigned int with all bits set. */
71 #define NONE (~(unsigned int)0)
73 /* The entire contents of the UnicodeData.txt file. */
/* One slot per code point from 0 to 0x10FFFF; fill_attribute stores into
   the slot selected by its index argument. */
74 struct unicode_attribute unicode_attributes [0x110000];
76 /* Stores in unicode_attributes[i] the values from the given fields. */
/* FIELD1 to FIELD14 are the textual fields of one UnicodeData.txt line.
   String fields are duplicated with strdup, except that an empty field is
   stored as the literal "" without allocation. FIELD9 becomes the mirrored
   flag and FIELD12 to FIELD14 are parsed as hexadecimal numbers, with NONE
   standing for an empty field.
   NOTE(review): the return type, the braces, the test that guards the
   "index too large" message and the statement controlled by the "Cs" test
   are not visible in this excerpt. */
78 fill_attribute (unsigned int i,
79 const char *field1, const char *field2,
80 const char *field3, const char *field4,
81 const char *field5, const char *field6,
82 const char *field7, const char *field8,
83 const char *field9, const char *field10,
84 const char *field11, const char *field12,
85 const char *field13, const char *field14)
87 struct unicode_attribute * uni;
91 fprintf (stderr, "index too large\n");
94 if (strcmp (field2, "Cs") == 0)
95 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
97 uni = &unicode_attributes[i];
98 /* Copy the strings. */
/* The name is always duplicated, even when empty, unlike the fields below.
   NOTE(review): none of the strdup results is checked for NULL. */
99 uni->name = strdup (field1);
100 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
101 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
102 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
103 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
104 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
105 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
106 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* Only the first character of the field is inspected. */
107 uni->mirrored = (field9[0] == 'Y');
108 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
109 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* Case mappings: hexadecimal text converted with strtoul, NONE when empty. */
110 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
111 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
112 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
115 /* Maximum length of a field in the UnicodeData.txt file. */
118 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
119 Reads up to (but excluding) DELIM.
120 Returns 1 when a field was successfully read, otherwise 0. */
/* NOTE(review): the return type, the local declarations, the statements
   that skip carriage returns and store each character, and the code after
   the loop are not visible in this excerpt. The size of BUFFER is not a
   parameter; the length check relies on the FIELDLEN macro. */
122 getfield (FILE *stream, char *buffer, int delim)
/* Consume characters until end of file or until DELIM has been read. */
127 for (; (c = getc (stream)), (c != EOF && c != delim); )
129 /* The original unicode.org UnicodeData.txt file happens to have
130 CR/LF line terminators. Silently convert to LF. */
134 /* Put c into the buffer. */
/* The count is incremented before the comparison, so the message below is
   reached as soon as the field would need FIELDLEN - 1 characters. */
135 if (++count >= FIELDLEN - 1)
137 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
150 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
/* Reads the file UNICODEDATA_FILENAME and stores its lines in
   unicode_attributes[] through fill_attribute. Every line is split by
   getfield into fifteen fields: fourteen ending at ';' and a last one
   ending at the newline. A line whose name ends in ", First>" is followed
   by a second line whose name must end in ", Last>".
   NOTE(review): the return type, several declarations, the braces, the
   loop over the lines and a number of conditions and exit paths are not
   visible in this excerpt. */
153 fill_attributes (const char *unicodedata_filename)
157 char field0[FIELDLEN];
158 char field1[FIELDLEN];
159 char field2[FIELDLEN];
160 char field3[FIELDLEN];
161 char field4[FIELDLEN];
162 char field5[FIELDLEN];
163 char field6[FIELDLEN];
164 char field7[FIELDLEN];
165 char field8[FIELDLEN];
166 char field9[FIELDLEN];
167 char field10[FIELDLEN];
168 char field11[FIELDLEN];
169 char field12[FIELDLEN];
170 char field13[FIELDLEN];
171 char field14[FIELDLEN];
/* Mark every code point as absent; fill_attribute later sets a non-NULL
   name for each entry it stores. */
174 for (i = 0; i < 0x110000; i++)
175 unicode_attributes[i].name = NULL;
177 stream = fopen (unicodedata_filename, "r");
180 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* Read the fifteen fields of one line; n adds up the getfield results. */
189 n = getfield (stream, field0, ';');
190 n += getfield (stream, field1, ';');
191 n += getfield (stream, field2, ';');
192 n += getfield (stream, field3, ';');
193 n += getfield (stream, field4, ';');
194 n += getfield (stream, field5, ';');
195 n += getfield (stream, field6, ';');
196 n += getfield (stream, field7, ';');
197 n += getfield (stream, field8, ';');
198 n += getfield (stream, field9, ';');
199 n += getfield (stream, field10, ';');
200 n += getfield (stream, field11, ';');
201 n += getfield (stream, field12, ';');
202 n += getfield (stream, field13, ';');
203 n += getfield (stream, field14, '\n');
/* NOTE(review): the test on n that leads to this message is not visible here. */
208 fprintf (stderr, "short line in '%s':%d\n",
209 unicodedata_filename, lineno);
/* field0 is the code point, written in hexadecimal. */
212 i = strtoul (field0, NULL, 16);
/* NOTE(review): the first line of this condition is not visible. The
   visible part requires a name of at least 9 characters that ends in
   ", First>". */
214 && strlen (field1) >= 9
215 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
217 /* Deal with a range. */
/* The second line of the pair is read into the same buffers, so the
   fields of the first line are overwritten and only i is kept from it. */
219 n = getfield (stream, field0, ';');
220 n += getfield (stream, field1, ';');
221 n += getfield (stream, field2, ';');
222 n += getfield (stream, field3, ';');
223 n += getfield (stream, field4, ';');
224 n += getfield (stream, field5, ';');
225 n += getfield (stream, field6, ';');
226 n += getfield (stream, field7, ';');
227 n += getfield (stream, field8, ';');
228 n += getfield (stream, field9, ';');
229 n += getfield (stream, field10, ';');
230 n += getfield (stream, field11, ';');
231 n += getfield (stream, field12, ';');
232 n += getfield (stream, field13, ';');
233 n += getfield (stream, field14, '\n');
236 fprintf (stderr, "missing end range in '%s':%d\n",
237 unicodedata_filename, lineno);
/* The second line must carry a name that starts with '<' and ends in
   ", Last>". */
240 if (!(field1[0] == '<'
241 && strlen (field1) >= 8
242 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
244 fprintf (stderr, "missing end range in '%s':%d\n",
245 unicodedata_filename, lineno);
/* Cut the ", Last>" suffix off the name. */
248 field1[strlen (field1) - 7] = '\0';
/* Code point of the second line. NOTE(review): the loop that presumably
   runs i up to j around the call below is not visible in this excerpt. */
249 j = strtoul (field0, NULL, 16);
/* field1+1 skips the leading '<' that was checked above. */
251 fill_attribute (i, field1+1, field2, field3, field4, field5,
252 field6, field7, field8, field9, field10,
253 field11, field12, field13, field14);
257 /* Single character line */
258 fill_attribute (i, field1, field2, field3, field4, field5,
259 field6, field7, field8, field9, field10,
260 field11, field12, field13, field14);
/* fclose is only called when ferror reports no error, because || stops at
   the first true operand. */
264 if (ferror (stream) || fclose (stream))
266 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
271 /* ========================================================================= */
273 /* General category. */
274 /* See Unicode 3.0 book, section 4.5. */
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'L'. */
278 is_category_L (unsigned int ch)
280 return (unicode_attributes[ch].name != NULL
281 && unicode_attributes[ch].category[0] == 'L');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Lu", "Ll" or "Lt". */
285 is_category_LC (unsigned int ch)
287 /* See PropertyValueAliases.txt. */
288 return (unicode_attributes[ch].name != NULL
289 && unicode_attributes[ch].category[0] == 'L'
290 && (unicode_attributes[ch].category[1] == 'u'
291 || unicode_attributes[ch].category[1] == 'l'
292 || unicode_attributes[ch].category[1] == 't'));
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Lu". */
296 is_category_Lu (unsigned int ch)
298 return (unicode_attributes[ch].name != NULL
299 && unicode_attributes[ch].category[0] == 'L'
300 && unicode_attributes[ch].category[1] == 'u');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Ll". */
304 is_category_Ll (unsigned int ch)
306 return (unicode_attributes[ch].name != NULL
307 && unicode_attributes[ch].category[0] == 'L'
308 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Lt". */
312 is_category_Lt (unsigned int ch)
314 return (unicode_attributes[ch].name != NULL
315 && unicode_attributes[ch].category[0] == 'L'
316 && unicode_attributes[ch].category[1] == 't');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Lm". */
320 is_category_Lm (unsigned int ch)
322 return (unicode_attributes[ch].name != NULL
323 && unicode_attributes[ch].category[0] == 'L'
324 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Lo". */
328 is_category_Lo (unsigned int ch)
330 return (unicode_attributes[ch].name != NULL
331 && unicode_attributes[ch].category[0] == 'L'
332 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'M'. */
336 is_category_M (unsigned int ch)
338 return (unicode_attributes[ch].name != NULL
339 && unicode_attributes[ch].category[0] == 'M');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Mn". */
343 is_category_Mn (unsigned int ch)
345 return (unicode_attributes[ch].name != NULL
346 && unicode_attributes[ch].category[0] == 'M'
347 && unicode_attributes[ch].category[1] == 'n');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Mc". */
351 is_category_Mc (unsigned int ch)
353 return (unicode_attributes[ch].name != NULL
354 && unicode_attributes[ch].category[0] == 'M'
355 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Me". */
359 is_category_Me (unsigned int ch)
361 return (unicode_attributes[ch].name != NULL
362 && unicode_attributes[ch].category[0] == 'M'
363 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'N'. */
367 is_category_N (unsigned int ch)
369 return (unicode_attributes[ch].name != NULL
370 && unicode_attributes[ch].category[0] == 'N');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Nd". */
374 is_category_Nd (unsigned int ch)
376 return (unicode_attributes[ch].name != NULL
377 && unicode_attributes[ch].category[0] == 'N'
378 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Nl". */
382 is_category_Nl (unsigned int ch)
384 return (unicode_attributes[ch].name != NULL
385 && unicode_attributes[ch].category[0] == 'N'
386 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "No". */
390 is_category_No (unsigned int ch)
392 return (unicode_attributes[ch].name != NULL
393 && unicode_attributes[ch].category[0] == 'N'
394 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'P'. */
398 is_category_P (unsigned int ch)
400 return (unicode_attributes[ch].name != NULL
401 && unicode_attributes[ch].category[0] == 'P');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Pc". */
405 is_category_Pc (unsigned int ch)
407 return (unicode_attributes[ch].name != NULL
408 && unicode_attributes[ch].category[0] == 'P'
409 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Pd". */
413 is_category_Pd (unsigned int ch)
415 return (unicode_attributes[ch].name != NULL
416 && unicode_attributes[ch].category[0] == 'P'
417 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Ps". */
421 is_category_Ps (unsigned int ch)
423 return (unicode_attributes[ch].name != NULL
424 && unicode_attributes[ch].category[0] == 'P'
425 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Pe". */
429 is_category_Pe (unsigned int ch)
431 return (unicode_attributes[ch].name != NULL
432 && unicode_attributes[ch].category[0] == 'P'
433 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Pi". */
437 is_category_Pi (unsigned int ch)
439 return (unicode_attributes[ch].name != NULL
440 && unicode_attributes[ch].category[0] == 'P'
441 && unicode_attributes[ch].category[1] == 'i');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Pf". */
445 is_category_Pf (unsigned int ch)
447 return (unicode_attributes[ch].name != NULL
448 && unicode_attributes[ch].category[0] == 'P'
449 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Po". */
453 is_category_Po (unsigned int ch)
455 return (unicode_attributes[ch].name != NULL
456 && unicode_attributes[ch].category[0] == 'P'
457 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'S'. */
461 is_category_S (unsigned int ch)
463 return (unicode_attributes[ch].name != NULL
464 && unicode_attributes[ch].category[0] == 'S');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Sm". */
468 is_category_Sm (unsigned int ch)
470 return (unicode_attributes[ch].name != NULL
471 && unicode_attributes[ch].category[0] == 'S'
472 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Sc". */
476 is_category_Sc (unsigned int ch)
478 return (unicode_attributes[ch].name != NULL
479 && unicode_attributes[ch].category[0] == 'S'
480 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Sk". */
484 is_category_Sk (unsigned int ch)
486 return (unicode_attributes[ch].name != NULL
487 && unicode_attributes[ch].category[0] == 'S'
488 && unicode_attributes[ch].category[1] == 'k');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "So". */
492 is_category_So (unsigned int ch)
494 return (unicode_attributes[ch].name != NULL
495 && unicode_attributes[ch].category[0] == 'S'
496 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with 'Z'. */
500 is_category_Z (unsigned int ch)
502 return (unicode_attributes[ch].name != NULL
503 && unicode_attributes[ch].category[0] == 'Z');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Zs". */
507 is_category_Zs (unsigned int ch)
509 return (unicode_attributes[ch].name != NULL
510 && unicode_attributes[ch].category[0] == 'Z'
511 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Zl". */
515 is_category_Zl (unsigned int ch)
517 return (unicode_attributes[ch].name != NULL
518 && unicode_attributes[ch].category[0] == 'Z'
519 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Zp". */
523 is_category_Zp (unsigned int ch)
525 return (unicode_attributes[ch].name != NULL
526 && unicode_attributes[ch].category[0] == 'Z'
527 && unicode_attributes[ch].category[1] == 'p');
/* Tests whether CH either has no entry in unicode_attributes (NULL name)
   or has a category string that begins with 'C'. Unlike the other
   one-letter tests, a missing entry counts as a match here. */
531 is_category_C (unsigned int ch)
533 return (unicode_attributes[ch].name == NULL
534 || unicode_attributes[ch].category[0] == 'C');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Cc". */
538 is_category_Cc (unsigned int ch)
540 return (unicode_attributes[ch].name != NULL
541 && unicode_attributes[ch].category[0] == 'C'
542 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Cf". */
546 is_category_Cf (unsigned int ch)
548 return (unicode_attributes[ch].name != NULL
549 && unicode_attributes[ch].category[0] == 'C'
550 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH lies in the range 0xd800 to 0xdfff. The table
   unicode_attributes is not consulted; fill_attribute has a special case
   for lines of category "Cs", presumably leaving them out of the table -
   confirm against the full source. */
554 is_category_Cs (unsigned int ch)
556 return (ch >= 0xd800 && ch < 0xe000);
/* Tests whether CH has an entry in unicode_attributes (non-NULL name)
   whose category string begins with "Co". */
560 is_category_Co (unsigned int ch)
562 return (unicode_attributes[ch].name != NULL
563 && unicode_attributes[ch].category[0] == 'C'
564 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has no entry in unicode_attributes (NULL name) and lies
   outside the range 0xd800 to 0xdfff that is_category_Cs matches. */
568 is_category_Cn (unsigned int ch)
570 return (unicode_attributes[ch].name == NULL
571 && !(ch >= 0xd800 && ch < 0xe000));
574 /* Output a boolean property in a human readable format. */
/* Opens FILENAME for writing and prints code points as 0x%04X lines, using
   PREDICATE to decide membership. The code after the disabled "#if 0"
   variant collects runs of consecutive code points and prints a run as
   first..last.
   NOTE(review): the return type, declarations, braces, the preprocessor
   lines that end the "#if 0" part, the initial predicate test of each loop
   and the test that chooses between the range and the single form are not
   visible in this excerpt. */
576 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
581 stream = fopen (filename, "w");
584 fprintf (stderr, "cannot open '%s' for writing\n", filename);
588 #if 0 /* This yields huge text output. */
589 for (ch = 0; ch < 0x110000; ch++)
592 fprintf (stream, "0x%04X\n", ch);
595 for (ch = 0; ch < 0x110000; ch++)
/* Remember where the run starts, then extend it while the next code point
   still satisfies the predicate. */
598 unsigned int first = ch;
601 while (ch + 1 < 0x110000 && predicate (ch + 1))
605 fprintf (stream, "0x%04X..0x%04X\n", first, last);
607 fprintf (stream, "0x%04X\n", ch);
/* fclose is only called when ferror reports no error. */
611 if (ferror (stream) || fclose (stream))
613 fprintf (stderr, "error writing to '%s'\n", filename);
618 /* Output the unit test for a boolean property. */
/* Writes to FILENAME a generated C test: a fixed licence header, an
   include of test-predicate-part1.h, a list of { first, last } pairs, a
   definition of PREDICATE(c) as EXPRESSION, and an include of
   test-predicate-part2.h. PREDICATE is used to extend each run of
   consecutive code points.
   NOTE(review): the generated header always carries the copyright year
   2007. The return type, declarations, braces, the initial predicate test
   of the loop and the test that guards the separating comma are not
   visible in this excerpt. */
620 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
626 stream = fopen (filename, "w");
629 fprintf (stderr, "cannot open '%s' for writing\n", filename);
633 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
634 fprintf (stream, "/* Test the Unicode character type functions.\n");
635 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
636 fprintf (stream, "\n");
637 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
638 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
639 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
640 fprintf (stream, " (at your option) any later version.\n");
641 fprintf (stream, "\n");
642 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
643 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
644 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
645 fprintf (stream, " GNU General Public License for more details.\n");
646 fprintf (stream, "\n");
647 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
648 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
649 fprintf (stream, "\n");
650 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
651 fprintf (stream, "\n");
/* Emit one pair per run of consecutive code points. */
654 for (ch = 0; ch < 0x110000; ch++)
657 unsigned int first = ch;
660 while (ch + 1 < 0x110000 && predicate (ch + 1))
664 fprintf (stream, ",\n");
665 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
669 fprintf (stream, "\n");
671 fprintf (stream, "\n");
/* EXPRESSION is copied verbatim as the replacement text of PREDICATE(c). */
672 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
673 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
/* fclose is only called when ferror reports no error. */
675 if (ferror (stream) || fclose (stream))
677 fprintf (stderr, "error writing to '%s'\n", filename);
682 /* Construction of sparse 3-level tables. */
/* Parameters set before including 3levelbit.h. TABLE is predicate_table,
   which matches the struct predicate_table and the predicate_table_init,
   predicate_table_add and predicate_table_finalize names used by
   output_predicate; presumably the header derives them from TABLE - the
   header is not part of this excerpt. xmalloc and xrealloc are mapped to
   plain malloc and realloc. */
683 #define TABLE predicate_table
684 #define xmalloc malloc
685 #define xrealloc realloc
686 #include "3levelbit.h"
688 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C fragment consisting of header_N macros and a
   static const struct called NAME with the members header, level1, level2
   and level3. COMMENT is printed in the second line of the generated file.
   VERSION is presumably the Unicode version printed in the third line; the
   argument line of that fprintf is not visible in this excerpt.
   NOTE(review): the return type, several declarations, braces, the
   predicate test inside the fill loop, the targets of the three offset
   assignments and the tests that choose between -1 and a computed offset
   are not visible in this excerpt. */
690 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
694 struct predicate_table t;
695 unsigned int level1_offset, level2_offset, level3_offset;
697 stream = fopen (filename, "w");
700 fprintf (stderr, "cannot open '%s' for writing\n", filename);
704 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
705 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
706 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Build the table from the code points passed to predicate_table_add. */
711 predicate_table_init (&t);
713 for (ch = 0; ch < 0x110000; ch++)
715 predicate_table_add (&t, ch);
717 predicate_table_finalize (&t);
719 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past a header of five
   uint32_t, then past level1, then past level2. They are presumably
   assigned to level1_offset, level2_offset and level3_offset, in that
   order. */
721 5 * sizeof (uint32_t);
723 5 * sizeof (uint32_t)
724 + t.level1_size * sizeof (uint32_t);
726 5 * sizeof (uint32_t)
727 + t.level1_size * sizeof (uint32_t)
728 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as macros header_0 to header_4.
   NOTE(review): the uint32_t value is printed with the signed conversion
   %d. */
730 for (i = 0; i < 5; i++)
732 fprintf (stream, "#define header_%d %d\n", i,
733 ((uint32_t *) t.result)[i]);
735 fprintf (stream, "static const\n");
736 fprintf (stream, "struct\n");
737 fprintf (stream, " {\n");
738 fprintf (stream, " int header[1];\n");
739 fprintf (stream, " int level1[%zu];\n", t.level1_size);
740 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
741 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
742 fprintf (stream, " }\n");
743 fprintf (stream, "%s =\n", name);
744 fprintf (stream, "{\n");
/* Only header word 1 is stored in the generated struct. */
745 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
746 fprintf (stream, " {");
747 if (t.level1_size > 1)
748 fprintf (stream, "\n ");
749 for (i = 0; i < t.level1_size; i++)
/* i % 1 is always 0, so every entry after the first starts a new line. */
752 if (i > 0 && (i % 1) == 0)
753 fprintf (stream, "\n ");
754 offset = ((uint32_t *) (t.result + level1_offset))[i];
756 fprintf (stream, " %5d", -1);
/* The entry is written as an expression: the size of header plus level1
   (1 + level1_size ints) converted to units of short, plus the position of
   the referenced block inside level2. */
758 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
759 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
760 if (i+1 < t.level1_size)
761 fprintf (stream, ",");
763 if (t.level1_size > 1)
764 fprintf (stream, "\n ");
765 fprintf (stream, " },\n");
766 fprintf (stream, " {");
767 if (t.level2_size << t.q > 1)
768 fprintf (stream, "\n ");
769 for (i = 0; i < t.level2_size << t.q; i++)
/* As for level1, i % 1 is always 0: one entry per line. */
772 if (i > 0 && (i % 1) == 0)
773 fprintf (stream, "\n ");
774 offset = ((uint32_t *) (t.result + level2_offset))[i];
776 fprintf (stream, " %5d", -1);
/* The entry is written as an expression in units of int: header plus
   level1, plus level2 converted from short to int, plus the position of
   the referenced block inside level3. */
778 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
779 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
780 if (i+1 < t.level2_size << t.q)
781 fprintf (stream, ",");
783 if (t.level2_size << t.q > 1)
784 fprintf (stream, "\n ");
785 fprintf (stream, " },\n");
786 fprintf (stream, " {");
787 if (t.level3_size << t.p > 4)
788 fprintf (stream, "\n ");
/* level3 words are printed in hexadecimal, four per output line. */
789 for (i = 0; i < t.level3_size << t.p; i++)
791 if (i > 0 && (i % 4) == 0)
792 fprintf (stream, "\n ");
793 fprintf (stream, " 0x%08X",
794 ((uint32_t *) (t.result + level3_offset))[i]);
795 if (i+1 < t.level3_size << t.p)
796 fprintf (stream, ",");
798 if (t.level3_size << t.p > 4)
799 fprintf (stream, "\n ");
800 fprintf (stream, " }\n");
801 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error. */
803 if (ferror (stream) || fclose (stream))
805 fprintf (stderr, "error writing to '%s'\n", filename);
810 /* Output all categories. */
812 output_categories (const char *version)
/* CATEGORY(C) expands to three calls for the category named C, all driven
   by the function is_category_C: a readable listing in
   unictype/categ_C.txt, a unit test in ../tests/unictype/test-categ_C.c
   and the bitmap table u_categ_C in unictype/categ_C.h. VERSION is passed
   on to output_predicate.
   NOTE(review): the return type, the braces and the invocations of
   CATEGORY are not visible in this excerpt. */
814 #define CATEGORY(C) \
815 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
816 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
817 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories. Every two-letter category has a
   bit of its own, from Lu = bit 0 up to Cn = bit 29. The one-letter masks
   and LC are the bitwise OR of their members, for example
   L = Lu | Ll | Lt | Lm | Lo = 0x1f and LC = Lu | Ll | Lt = 0x07.
   NOTE(review): the enclosing declaration, presumably an enum, is not
   visible in this excerpt. */
861 UC_CATEGORY_MASK_L = 0x0000001f,
862 UC_CATEGORY_MASK_LC = 0x00000007,
863 UC_CATEGORY_MASK_Lu = 0x00000001,
864 UC_CATEGORY_MASK_Ll = 0x00000002,
865 UC_CATEGORY_MASK_Lt = 0x00000004,
866 UC_CATEGORY_MASK_Lm = 0x00000008,
867 UC_CATEGORY_MASK_Lo = 0x00000010,
868 UC_CATEGORY_MASK_M = 0x000000e0,
869 UC_CATEGORY_MASK_Mn = 0x00000020,
870 UC_CATEGORY_MASK_Mc = 0x00000040,
871 UC_CATEGORY_MASK_Me = 0x00000080,
872 UC_CATEGORY_MASK_N = 0x00000700,
873 UC_CATEGORY_MASK_Nd = 0x00000100,
874 UC_CATEGORY_MASK_Nl = 0x00000200,
875 UC_CATEGORY_MASK_No = 0x00000400,
876 UC_CATEGORY_MASK_P = 0x0003f800,
877 UC_CATEGORY_MASK_Pc = 0x00000800,
878 UC_CATEGORY_MASK_Pd = 0x00001000,
879 UC_CATEGORY_MASK_Ps = 0x00002000,
880 UC_CATEGORY_MASK_Pe = 0x00004000,
881 UC_CATEGORY_MASK_Pi = 0x00008000,
882 UC_CATEGORY_MASK_Pf = 0x00010000,
883 UC_CATEGORY_MASK_Po = 0x00020000,
884 UC_CATEGORY_MASK_S = 0x003c0000,
885 UC_CATEGORY_MASK_Sm = 0x00040000,
886 UC_CATEGORY_MASK_Sc = 0x00080000,
887 UC_CATEGORY_MASK_Sk = 0x00100000,
888 UC_CATEGORY_MASK_So = 0x00200000,
889 UC_CATEGORY_MASK_Z = 0x01c00000,
890 UC_CATEGORY_MASK_Zs = 0x00400000,
891 UC_CATEGORY_MASK_Zl = 0x00800000,
892 UC_CATEGORY_MASK_Zp = 0x01000000,
893 UC_CATEGORY_MASK_C = 0x3e000000,
894 UC_CATEGORY_MASK_Cc = 0x02000000,
895 UC_CATEGORY_MASK_Cf = 0x04000000,
896 UC_CATEGORY_MASK_Cs = 0x08000000,
897 UC_CATEGORY_MASK_Co = 0x10000000,
898 UC_CATEGORY_MASK_Cn = 0x20000000
/* Maps a general category name to its UC_CATEGORY_MASK_* value. The outer
   switch looks at the first character of CATEGORY_NAME and each inner
   switch at the second; an absent second character selects the mask of the
   whole one-letter category.
   NOTE(review): the return type, the braces, the case labels of the outer
   switch, the statements between the inner switches and whatever follows
   the final comment are not visible in this excerpt. */
902 general_category_byname (const char *category_name)
/* Only names of one or two characters are looked up. */
904 if (category_name[0] != '\0'
905 && (category_name[1] == '\0' || category_name[2] == '\0'))
906 switch (category_name[0])
/* All results of this inner switch are L masks. */
909 switch (category_name[1])
911 case '\0': return UC_CATEGORY_MASK_L;
912 case 'C': return UC_CATEGORY_MASK_LC;
913 case 'u': return UC_CATEGORY_MASK_Lu;
914 case 'l': return UC_CATEGORY_MASK_Ll;
915 case 't': return UC_CATEGORY_MASK_Lt;
916 case 'm': return UC_CATEGORY_MASK_Lm;
917 case 'o': return UC_CATEGORY_MASK_Lo;
/* All results of this inner switch are M masks. */
921 switch (category_name[1])
923 case '\0': return UC_CATEGORY_MASK_M;
924 case 'n': return UC_CATEGORY_MASK_Mn;
925 case 'c': return UC_CATEGORY_MASK_Mc;
926 case 'e': return UC_CATEGORY_MASK_Me;
/* All results of this inner switch are N masks. */
930 switch (category_name[1])
932 case '\0': return UC_CATEGORY_MASK_N;
933 case 'd': return UC_CATEGORY_MASK_Nd;
934 case 'l': return UC_CATEGORY_MASK_Nl;
935 case 'o': return UC_CATEGORY_MASK_No;
/* All results of this inner switch are P masks. */
939 switch (category_name[1])
941 case '\0': return UC_CATEGORY_MASK_P;
942 case 'c': return UC_CATEGORY_MASK_Pc;
943 case 'd': return UC_CATEGORY_MASK_Pd;
944 case 's': return UC_CATEGORY_MASK_Ps;
945 case 'e': return UC_CATEGORY_MASK_Pe;
946 case 'i': return UC_CATEGORY_MASK_Pi;
947 case 'f': return UC_CATEGORY_MASK_Pf;
948 case 'o': return UC_CATEGORY_MASK_Po;
/* All results of this inner switch are S masks. */
952 switch (category_name[1])
954 case '\0': return UC_CATEGORY_MASK_S;
955 case 'm': return UC_CATEGORY_MASK_Sm;
956 case 'c': return UC_CATEGORY_MASK_Sc;
957 case 'k': return UC_CATEGORY_MASK_Sk;
958 case 'o': return UC_CATEGORY_MASK_So;
/* All results of this inner switch are Z masks. */
962 switch (category_name[1])
964 case '\0': return UC_CATEGORY_MASK_Z;
965 case 's': return UC_CATEGORY_MASK_Zs;
966 case 'l': return UC_CATEGORY_MASK_Zl;
967 case 'p': return UC_CATEGORY_MASK_Zp;
/* All results of this inner switch are C masks. */
971 switch (category_name[1])
973 case '\0': return UC_CATEGORY_MASK_C;
974 case 'c': return UC_CATEGORY_MASK_Cc;
975 case 'f': return UC_CATEGORY_MASK_Cf;
976 case 's': return UC_CATEGORY_MASK_Cs;
977 case 'o': return UC_CATEGORY_MASK_Co;
978 case 'n': return UC_CATEGORY_MASK_Cn;
982 /* Invalid category name. */
986 /* Construction of sparse 3-level tables. */
/* Parameters for the category_table instantiation used by output_category:
   elements are uint8_t, and the default element 29 is the bit position of
   UC_CATEGORY_MASK_Cn (0x20000000). xmalloc and xrealloc are mapped to
   plain malloc and realloc.
   NOTE(review): the #include that consumes these macros is not visible in
   this excerpt. */
987 #define TABLE category_table
988 #define ELEMENT uint8_t
989 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
990 #define xmalloc malloc
991 #define xrealloc realloc
994 /* Output the per-character category table. */
/* Writes to FILENAME a C fragment with category_header_N macros and a
   static const struct called u_category whose members are level1, level2
   and level3. level3 stores one 5-bit value per entry, packed into 16-bit
   units. VERSION is presumably the Unicode version printed in the third
   line of the file; the argument line of that fprintf is not visible in
   this excerpt.
   NOTE(review): the return type, several declarations, braces, the branch
   for code points without an entry, the statement executed when the
   one-bit check fails, the targets of the offset assignments and of the
   calloc result, and the tests that choose between -1 and a computed
   offset are not visible in this excerpt. */
996 output_category (const char *filename, const char *version)
1000 struct category_table t;
1001 unsigned int level1_offset, level2_offset, level3_offset;
1002 uint16_t *level3_packed;
1004 stream = fopen (filename, "w");
1007 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1011 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1012 fprintf (stream, "/* Categories of Unicode characters. */\n");
1013 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1018 category_table_init (&t);
/* For every code point, find its category mask and store the index of the
   single bit of that mask. */
1020 for (ch = 0; ch < 0x110000; ch++)
1023 unsigned int log2_value;
/* Surrogates are recognised by range; other code points with an entry are
   looked up by their category string. */
1025 if (is_category_Cs (ch))
1026 value = UC_CATEGORY_MASK_Cs;
1027 else if (unicode_attributes[ch].name != NULL)
1028 value = general_category_byname (unicode_attributes[ch].category);
1032 /* Now value should contain exactly one bit. */
/* value & (value - 1) clears the lowest set bit, so it is nonzero exactly
   when more than one bit is set. */
1033 if (value == 0 || ((value & (value - 1)) != 0))
/* Count how often value can be halved: the index of its single bit. */
1036 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1038 category_table_add (&t, ch, log2_value);
1041 category_table_finalize (&t);
1043 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past a header of five
   uint32_t, then past level1, then past level2. They are presumably
   assigned to level1_offset, level2_offset and level3_offset, in that
   order. */
1045 5 * sizeof (uint32_t);
1047 5 * sizeof (uint32_t)
1048 + t.level1_size * sizeof (uint32_t);
1050 5 * sizeof (uint32_t)
1051 + t.level1_size * sizeof (uint32_t)
1052 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header words are printed with %d. */
1054 for (i = 0; i < 5; i++)
1055 fprintf (stream, "#define category_header_%d %d\n", i,
1056 ((uint32_t *) t.result)[i]);
1057 fprintf (stream, "static const\n");
1058 fprintf (stream, "struct\n");
1059 fprintf (stream, " {\n");
1060 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1061 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* A level3 block of (1 << p) entries at 5 bits each takes
   (1 << p) * 5 / 16 units of 16 bits; one extra unit is declared at the
   end. */
1062 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1063 (1 << t.p) * 5 / 16);
1064 fprintf (stream, " }\n");
1065 fprintf (stream, "u_category =\n");
1066 fprintf (stream, "{\n");
1067 fprintf (stream, " {");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
/* level1 entries, eight per output line. */
1070 for (i = 0; i < t.level1_size; i++)
1073 if (i > 0 && (i % 8) == 0)
1074 fprintf (stream, "\n ");
1075 offset = ((uint32_t *) (t.result + level1_offset))[i];
1077 fprintf (stream, " %5d", -1);
1079 fprintf (stream, " %5zu",
1080 (offset - level2_offset) / sizeof (uint32_t));
1081 if (i+1 < t.level1_size)
1082 fprintf (stream, ",");
1084 if (t.level1_size > 8)
1085 fprintf (stream, "\n ");
1086 fprintf (stream, " },\n");
1087 fprintf (stream, " {");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
/* level2 entries, eight per output line; the printed value is the distance
   from the start of level3 in one-byte elements. */
1090 for (i = 0; i < t.level2_size << t.q; i++)
1093 if (i > 0 && (i % 8) == 0)
1094 fprintf (stream, "\n ");
1095 offset = ((uint32_t *) (t.result + level2_offset))[i];
1097 fprintf (stream, " %5d", -1);
1099 fprintf (stream, " %5zu",
1100 (offset - level3_offset) / sizeof (uint8_t));
1101 if (i+1 < t.level2_size << t.q)
1102 fprintf (stream, ",");
1104 if (t.level2_size << t.q > 8)
1105 fprintf (stream, "\n ");
1106 fprintf (stream, " },\n");
1107 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1108 not 32-bit units, in order to make the lookup function easier. */
/* calloc returns zeroed memory, which the OR-based packing below relies
   on. NOTE(review): no check of the calloc result is visible. */
1111 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1112 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit k of unit j and may reach into unit j+1, so both
   units are combined into one 32-bit value, updated and split again. */
1114 unsigned int j = (i * 5) / 16;
1115 unsigned int k = (i * 5) % 16;
1116 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1117 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1118 level3_packed[j] = value & 0xffff;
1119 level3_packed[j+1] = value >> 16;
1121 fprintf (stream, " {");
1122 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1123 fprintf (stream, "\n ");
/* Packed units are printed in hexadecimal, eight per output line. */
1124 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1126 if (i > 0 && (i % 8) == 0)
1127 fprintf (stream, "\n ");
1128 fprintf (stream, " 0x%04x", level3_packed[i]);
1129 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1130 fprintf (stream, ",");
1132 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1133 fprintf (stream, "\n ");
1134 fprintf (stream, " }\n");
1135 free (level3_packed);
1136 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error. */
1138 if (ferror (stream) || fclose (stream))
1140 fprintf (stderr, "error writing to '%s'\n", filename);
1145 /* ========================================================================= */
1147 /* Canonical combining class. */
1148 /* See Unicode 3.0 book, section 4.2. */
1151 /* Construction of sparse 3-level tables. */
/* Parameters for the combclass_table instantiation used by
   output_combclass: elements are uint8_t, and xmalloc and xrealloc are
   mapped to plain malloc and realloc.
   NOTE(review): one original line between ELEMENT and xmalloc, presumably
   a DEFAULT definition, and the #include that consumes these macros are
   not visible in this excerpt. */
1152 #define TABLE combclass_table
1153 #define ELEMENT uint8_t
1155 #define xmalloc malloc
1156 #define xrealloc realloc
1159 /* Output the per-character combining class table. */
/* Writes to FILENAME a C fragment with combclass_header_N macros and a
   static const struct called u_combclass whose members are level1, level2
   and level3, the last one holding one unsigned char per entry. VERSION
   is presumably the Unicode version printed in the third line of the file;
   the argument line of that fprintf is not visible in this excerpt.
   NOTE(review): the return type, several declarations, braces, the
   statement executed when the range check fails, the targets of the offset
   assignments and the tests that choose between -1 and a computed offset
   are not visible in this excerpt. */
1161 output_combclass (const char *filename, const char *version)
1165 struct combclass_table t;
1166 unsigned int level1_offset, level2_offset, level3_offset;
1168 stream = fopen (filename, "w");
1171 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1175 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1176 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1177 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1182 combclass_table_init (&t);
/* Only code points with an entry are added to the table. */
1184 for (ch = 0; ch < 0x110000; ch++)
1185 if (unicode_attributes[ch].name != NULL)
/* The combining class is stored as decimal text. NOTE(review): atoi gives
   no indication of a failed conversion; only the range is checked. */
1187 int value = atoi (unicode_attributes[ch].combining);
1188 if (!(value >= 0 && value <= 255))
1190 combclass_table_add (&t, ch, value);
1193 combclass_table_finalize (&t);
1195 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past a header of five
   uint32_t, then past level1, then past level2. They are presumably
   assigned to level1_offset, level2_offset and level3_offset, in that
   order. */
1197 5 * sizeof (uint32_t);
1199 5 * sizeof (uint32_t)
1200 + t.level1_size * sizeof (uint32_t);
1202 5 * sizeof (uint32_t)
1203 + t.level1_size * sizeof (uint32_t)
1204 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header words are printed with %d. */
1206 for (i = 0; i < 5; i++)
1207 fprintf (stream, "#define combclass_header_%d %d\n", i,
1208 ((uint32_t *) t.result)[i]);
1209 fprintf (stream, "static const\n");
1210 fprintf (stream, "struct\n");
1211 fprintf (stream, " {\n");
1212 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1213 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1214 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1215 fprintf (stream, " }\n");
1216 fprintf (stream, "u_combclass =\n");
1217 fprintf (stream, "{\n");
1218 fprintf (stream, " {");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
/* level1 entries, eight per output line. */
1221 for (i = 0; i < t.level1_size; i++)
1224 if (i > 0 && (i % 8) == 0)
1225 fprintf (stream, "\n ");
1226 offset = ((uint32_t *) (t.result + level1_offset))[i];
1228 fprintf (stream, " %5d", -1);
1230 fprintf (stream, " %5zu",
1231 (offset - level2_offset) / sizeof (uint32_t));
1232 if (i+1 < t.level1_size)
1233 fprintf (stream, ",");
1235 if (t.level1_size > 8)
1236 fprintf (stream, "\n ");
1237 fprintf (stream, " },\n");
1238 fprintf (stream, " {");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
/* level2 entries, eight per output line; the printed value is the distance
   from the start of level3 in one-byte elements. */
1241 for (i = 0; i < t.level2_size << t.q; i++)
1244 if (i > 0 && (i % 8) == 0)
1245 fprintf (stream, "\n ");
1246 offset = ((uint32_t *) (t.result + level2_offset))[i];
1248 fprintf (stream, " %5d", -1);
1250 fprintf (stream, " %5zu",
1251 (offset - level3_offset) / sizeof (uint8_t));
1252 if (i+1 < t.level2_size << t.q)
1253 fprintf (stream, ",");
1255 if (t.level2_size << t.q > 8)
1256 fprintf (stream, "\n ");
1257 fprintf (stream, " },\n");
1258 fprintf (stream, " {");
1259 if (t.level3_size << t.p > 8)
1260 fprintf (stream, "\n ");
/* level3 bytes are printed in decimal, eight per output line. */
1261 for (i = 0; i < t.level3_size << t.p; i++)
1263 if (i > 0 && (i % 8) == 0)
1264 fprintf (stream, "\n ");
1265 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1266 if (i+1 < t.level3_size << t.p)
1267 fprintf (stream, ",");
1269 if (t.level3_size << t.p > 8)
1270 fprintf (stream, "\n ");
1271 fprintf (stream, " }\n");
1272 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error. */
1274 if (ferror (stream) || fclose (stream))
1276 fprintf (stderr, "error writing to '%s'\n", filename);
1281 /* ========================================================================= */
1283 /* Bidirectional category. */
1284 /* See Unicode 3.0 book, section 4.3,
1289 UC_BIDI_L, /* Left-to-Right */
/* NOTE(review): the line that opens this enumerator list is not visible in
   this excerpt.  Nineteen names are listed.  output_bidi_category packs each
   table entry into 5 bits, which is enough for 19 distinct values if they
   are numbered consecutively from 0 - TODO confirm against the declaration
   header.  UC_BIDI_L is also the DEFAULT chosen for bidi_category_table. */
1290 UC_BIDI_LRE, /* Left-to-Right Embedding */
1291 UC_BIDI_LRO, /* Left-to-Right Override */
1292 UC_BIDI_R, /* Right-to-Left */
1293 UC_BIDI_AL, /* Right-to-Left Arabic */
1294 UC_BIDI_RLE, /* Right-to-Left Embedding */
1295 UC_BIDI_RLO, /* Right-to-Left Override */
1296 UC_BIDI_PDF, /* Pop Directional Format */
1297 UC_BIDI_EN, /* European Number */
1298 UC_BIDI_ES, /* European Number Separator */
1299 UC_BIDI_ET, /* European Number Terminator */
1300 UC_BIDI_AN, /* Arabic Number */
1301 UC_BIDI_CS, /* Common Number Separator */
1302 UC_BIDI_NSM, /* Non-Spacing Mark */
1303 UC_BIDI_BN, /* Boundary Neutral */
1304 UC_BIDI_B, /* Paragraph Separator */
1305 UC_BIDI_S, /* Segment Separator */
1306 UC_BIDI_WS, /* Whitespace */
1307 UC_BIDI_ON /* Other Neutral */
/* Maps the textual bidi category CATEGORY_NAME to a category value.
   The name is decoded one character at a time: the outer switch looks at
   category_name[0], nested switches look at category_name[1] and
   category_name[2], and each leaf tests that the following character is the
   NUL terminator, so only names of exactly the expected length match.
   get_bidi_category passes the bidi field of a unicode_attributes entry.
   NOTE(review): the case labels and the return statements are not visible
   in this excerpt; presumably every complete match returns one of the
   UC_BIDI_ enumerators - confirm against the full file. */
1311 bidi_category_byname (const char *category_name)
1313 switch (category_name[0])
1316 switch (category_name[1])
1319 if (category_name[2] == '\0')
1323 if (category_name[2] == '\0')
1329 switch (category_name[1])
1334 if (category_name[2] == '\0')
1340 switch (category_name[1])
1343 if (category_name[2] == '\0')
1349 switch (category_name[1])
1352 if (category_name[2] == '\0')
1356 if (category_name[2] == '\0')
1360 if (category_name[2] == '\0')
1366 switch (category_name[1])
1371 switch (category_name[2])
1374 if (category_name[3] == '\0')
1378 if (category_name[3] == '\0')
1386 switch (category_name[1])
1389 switch (category_name[2])
1392 if (category_name[3] == '\0')
1400 switch (category_name[1])
1403 if (category_name[2] == '\0')
1409 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1423 switch (category_name[1])
1428 switch (category_name[2])
1431 if (category_name[3] == '\0')
1435 if (category_name[3] == '\0')
1443 if (category_name[1] == '\0')
1447 switch (category_name[1])
1450 if (category_name[2] == '\0')
/* Reached when no branch above matched.  NOTE(review): the action taken
   for an unknown name is on a line not visible here. */
1456 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   If unicode_attributes[ch] has a name, the category is obtained by passing
   its bidi field to bidi_category_byname.  Otherwise three groups of code
   point ranges are tested in order.
   NOTE(review): the value returned for each group, and for code points in
   none of them, is on lines not visible in this excerpt. */
1461 get_bidi_category (unsigned int ch)
1463 if (unicode_attributes[ch].name != NULL)
1464 return bidi_category_byname (unicode_attributes[ch].bidi);
1467 /* The bidi category of unassigned characters depends on the range.
1468 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges. */
1469 if ((ch >= 0x0590 && ch <= 0x05FF)
1470 || (ch >= 0x07FB && ch <= 0x08FF)
1471 || (ch >= 0xFB37 && ch <= 0xFB45)
1472 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges, tested only when the first did not match. */
1474 else if ((ch >= 0x0600 && ch <= 0x07BF)
1475 || (ch >= 0x2064 && ch <= 0x2069)
1476 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1477 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group: besides the explicit ranges it also matches every code
   point whose low 16 bits are FFFE or FFFF, in any plane. */
1479 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1480 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1481 || (ch & 0xFFFF) == 0xFFFE
1482 || (ch & 0xFFFF) == 0xFFFF
1483 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1490 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_bidi_category, which
   declares a struct bidi_category_table and calls bidi_category_table_init,
   bidi_category_table_add and bidi_category_table_finalize.
   NOTE(review): the line that consumes these macros (presumably an include
   of a shared template) is not visible in this excerpt. */
1491 #define TABLE bidi_category_table
/* Entries are single bytes; output_bidi_category reads level3 bytewise. */
1492 #define ELEMENT uint8_t
/* Presumably the value reported for code points never added - TODO confirm. */
1493 #define DEFAULT UC_BIDI_L
/* The x-prefixed allocator names are mapped to plain malloc and realloc. */
1494 #define xmalloc malloc
1495 #define xrealloc realloc
1498 /* Output the per-character bidi category table. */
/* Writes FILENAME as generated C source: five bidi_category_header_N macros
   followed by a static const struct named u_bidi_category with the arrays
   level1, level2 and level3.  The level3 entries are bit-packed, 5 bits per
   code point, into 16-bit units.
   FILENAME is opened for writing; failures to open or to write are reported
   on stderr.
   NOTE(review): several lines of this function are not visible in this
   excerpt (the tests guarding the error messages, what happens after them,
   the last argument of the third header fprintf - presumably VERSION - and
   the assignment targets of the offsets). */
1500 output_bidi_category (const char *filename, const char *version)
1504 struct bidi_category_table t;
1505 unsigned int level1_offset, level2_offset, level3_offset;
1506 uint16_t *level3_packed;
1508 stream = fopen (filename, "w");
1511 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1515 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1516 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1517 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1522 bidi_category_table_init (&t);
/* Every code point from 0 to 0x10FFFF is entered, assigned or not. */
1524 for (ch = 0; ch < 0x110000; ch++)
1526 int value = get_bidi_category (ch);
1528 bidi_category_table_add (&t, ch, value);
1531 bidi_category_table_finalize (&t);
1533 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip, in turn, a header of five uint32_t
   words, the level1 array, and the level2 array.  NOTE(review): the
   assignment targets are not visible; presumably level1_offset,
   level2_offset and level3_offset in that order. */
1535 5 * sizeof (uint32_t);
1537 5 * sizeof (uint32_t)
1538 + t.level1_size * sizeof (uint32_t);
1540 5 * sizeof (uint32_t)
1541 + t.level1_size * sizeof (uint32_t)
1542 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros. */
1544 for (i = 0; i < 5; i++)
1545 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1546 ((uint32_t *) t.result)[i]);
1547 fprintf (stream, "static const\n");
1548 fprintf (stream, "struct\n");
1549 fprintf (stream, " {\n");
1550 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1551 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Declared size of the packed level3 array: 5 bits per entry in 16-bit
   units, plus one extra unit. */
1552 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1553 (1 << t.p) * 5 / 16);
1554 fprintf (stream, " }\n");
1555 fprintf (stream, "u_bidi_category =\n");
1556 fprintf (stream, "{\n");
1557 fprintf (stream, " {");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
/* level1: eight values per output line.  Each entry is printed either as
   -1 or as its offset relative to level2_offset in uint32_t units.
   NOTE(review): the test choosing between the two forms is not visible. */
1560 for (i = 0; i < t.level1_size; i++)
1563 if (i > 0 && (i % 8) == 0)
1564 fprintf (stream, "\n ");
1565 offset = ((uint32_t *) (t.result + level1_offset))[i];
1567 fprintf (stream, " %5d", -1);
1569 fprintf (stream, " %5zu",
1570 (offset - level2_offset) / sizeof (uint32_t));
1571 if (i+1 < t.level1_size)
1572 fprintf (stream, ",");
1574 if (t.level1_size > 8)
1575 fprintf (stream, "\n ");
1576 fprintf (stream, " },\n");
1577 fprintf (stream, " {");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
/* level2: same layout; offsets are relative to level3_offset and counted
   in unpacked one-byte entries. */
1580 for (i = 0; i < t.level2_size << t.q; i++)
1583 if (i > 0 && (i % 8) == 0)
1584 fprintf (stream, "\n ");
1585 offset = ((uint32_t *) (t.result + level2_offset))[i];
1587 fprintf (stream, " %5d", -1);
1589 fprintf (stream, " %5zu",
1590 (offset - level3_offset) / sizeof (uint8_t));
1591 if (i+1 < t.level2_size << t.q)
1592 fprintf (stream, ",");
1594 if (t.level2_size << t.q > 8)
1595 fprintf (stream, "\n ");
1596 fprintf (stream, " },\n");
1597 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1598 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-filled by calloc so that values can be ORed in.
   NOTE(review): no check of the calloc result is visible in this excerpt. */
1601 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed stream: j is the 16-bit
   unit holding the first bit, k the bit position inside that unit.  Units j
   and j+1 are combined into one 32-bit window so that an entry straddling a
   unit boundary is written in one step. */
1602 for (i = 0; i < t.level3_size << t.p; i++)
1604 unsigned int j = (i * 5) / 16;
1605 unsigned int k = (i * 5) % 16;
1606 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1607 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1608 level3_packed[j] = value & 0xffff;
1609 level3_packed[j+1] = value >> 16;
1611 fprintf (stream, " {");
1612 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1613 fprintf (stream, "\n ");
/* Packed units are printed as four-digit hex, eight per output line. */
1614 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1616 if (i > 0 && (i % 8) == 0)
1617 fprintf (stream, "\n ");
1618 fprintf (stream, " 0x%04x", level3_packed[i]);
1619 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1620 fprintf (stream, ",");
1622 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1623 fprintf (stream, "\n ");
1624 fprintf (stream, " }\n");
1625 free (level3_packed);
1626 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose count as write errors. */
1628 if (ferror (stream) || fclose (stream))
1630 fprintf (stderr, "error writing to '%s'\n", filename);
1635 /* ========================================================================= */
1637 /* Decimal digit value. */
1638 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH.
   When unicode_attributes[ch] has a name and a non-empty decdigit field,
   the field is converted with atoi.
   NOTE(review): the value returned otherwise is on a line not visible here;
   the callers accept -1 through 9, so presumably -1 - confirm.  atoi does
   not report conversion errors. */
1641 get_decdigit_value (unsigned int ch)
1643 if (unicode_attributes[ch].name != NULL
1644 && unicode_attributes[ch].decdigit[0] != '\0')
1645 return atoi (unicode_attributes[ch].decdigit);
1649 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_decimal_digit and
   output_digit, both of which declare a struct decdigit_table and call
   decdigit_table_init, decdigit_table_add and decdigit_table_finalize.
   NOTE(review): other lines of this macro group and the line that consumes
   the macros are not visible in this excerpt. */
1650 #define TABLE decdigit_table
/* Entries are single bytes; the writers later pack two of them per byte. */
1651 #define ELEMENT uint8_t
/* The x-prefixed allocator names are mapped to plain malloc and realloc. */
1653 #define xmalloc malloc
1654 #define xrealloc realloc
1657 /* Output the unit test for the per-character decimal digit value table. */
/* Writes FILENAME as a list of initializer entries of the form
   { code point in hex, value }, separated by a comma and a newline.
   Values come from get_decdigit_value and are checked to lie in -1 .. 9.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed, the action on a failed range check, and
   the handling after the error messages are not visible in this excerpt. */
1659 output_decimal_digit_test (const char *filename, const char *version)
1665 stream = fopen (filename, "w");
1668 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1672 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1673 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1674 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1678 for (ch = 0; ch < 0x110000; ch++)
1680 int value = get_decdigit_value (ch);
/* Sanity check: anything outside -1 .. 9 is not a decimal digit value. */
1682 if (!(value >= -1 && value < 10))
1688 fprintf (stream, ",\n");
1689 fprintf (stream, " { 0x%04X, %d }", ch, value);
1694 fprintf (stream, "\n");
/* Both a pending stream error and a failing fclose count as write errors. */
1696 if (ferror (stream) || fclose (stream))
1698 fprintf (stderr, "error writing to '%s'\n", filename);
1703 /* Output the per-character decimal digit value table. */
/* Writes FILENAME as generated C source: five decdigit_header_N macros
   followed by a static const struct named u_decdigit with the arrays level1,
   level2 and level3.  Each stored entry is 1 + get_decdigit_value (ch), so
   the stored range is 0 .. 10, and two entries are packed into each level3
   byte.
   NOTE(review): several lines of this function are not visible in this
   excerpt (error-path handling, the last argument of some fprintf calls,
   the assignment targets of the offsets, the action on a failed range
   check). */
1705 output_decimal_digit (const char *filename, const char *version)
1709 struct decdigit_table t;
1710 unsigned int level1_offset, level2_offset, level3_offset;
1712 stream = fopen (filename, "w");
1715 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1719 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1720 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1721 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1726 decdigit_table_init (&t);
1728 for (ch = 0; ch < 0x110000; ch++)
/* The value is biased by one so that every stored entry is non-negative. */
1730 int value = 1 + get_decdigit_value (ch);
1732 if (!(value >= 0 && value <= 10))
1735 decdigit_table_add (&t, ch, value);
1738 decdigit_table_finalize (&t);
1740 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip, in turn, a header of five uint32_t
   words, the level1 array, and the level2 array.  NOTE(review): the
   assignment targets are not visible; presumably level1_offset,
   level2_offset and level3_offset in that order. */
1742 5 * sizeof (uint32_t);
1744 5 * sizeof (uint32_t)
1745 + t.level1_size * sizeof (uint32_t);
1747 5 * sizeof (uint32_t)
1748 + t.level1_size * sizeof (uint32_t)
1749 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros. */
1751 for (i = 0; i < 5; i++)
1752 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1753 ((uint32_t *) t.result)[i]);
1754 fprintf (stream, "static const\n");
1755 fprintf (stream, "struct\n");
1756 fprintf (stream, " {\n");
1757 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1758 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1759 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1761 fprintf (stream, " }\n");
1762 fprintf (stream, "u_decdigit =\n");
1763 fprintf (stream, "{\n");
1764 fprintf (stream, " {");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
/* level1: eight values per output line.  Each entry is printed either as
   -1 or as its offset relative to level2_offset in uint32_t units.
   NOTE(review): the test choosing between the two forms is not visible. */
1767 for (i = 0; i < t.level1_size; i++)
1770 if (i > 0 && (i % 8) == 0)
1771 fprintf (stream, "\n ");
1772 offset = ((uint32_t *) (t.result + level1_offset))[i];
1774 fprintf (stream, " %5d", -1);
1776 fprintf (stream, " %5zu",
1777 (offset - level2_offset) / sizeof (uint32_t));
1778 if (i+1 < t.level1_size)
1779 fprintf (stream, ",");
1781 if (t.level1_size > 8)
1782 fprintf (stream, "\n ");
1783 fprintf (stream, " },\n");
1784 fprintf (stream, " {");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
/* level2: same layout; offsets are relative to level3_offset and counted
   in unpacked one-byte entries. */
1787 for (i = 0; i < t.level2_size << t.q; i++)
1790 if (i > 0 && (i % 8) == 0)
1791 fprintf (stream, "\n ");
1792 offset = ((uint32_t *) (t.result + level2_offset))[i];
1794 fprintf (stream, " %5d", -1);
1796 fprintf (stream, " %5zu",
1797 (offset - level3_offset) / sizeof (uint8_t));
1798 if (i+1 < t.level2_size << t.q)
1799 fprintf (stream, ",");
1801 if (t.level2_size << t.q > 8)
1802 fprintf (stream, "\n ");
1803 fprintf (stream, " },\n");
1804 /* Pack the level3 array. Each entry needs 4 bits only. */
1805 fprintf (stream, " {");
1806 if (t.level3_size << (t.p - 1) > 8)
1807 fprintf (stream, "\n ");
/* Output byte i holds entry 2*i in its low nibble and entry 2*i+1 in its
   high nibble, so the loop runs over half the number of entries. */
1808 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1810 if (i > 0 && (i % 8) == 0)
1811 fprintf (stream, "\n ");
1812 fprintf (stream, " 0x%02x",
1813 ((uint8_t *) (t.result + level3_offset))[2*i]
1814 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1815 if (i+1 < t.level3_size << (t.p - 1))
1816 fprintf (stream, ",");
1818 if (t.level3_size << (t.p - 1) > 8)
1819 fprintf (stream, "\n ");
1820 fprintf (stream, " }\n");
1821 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose count as write errors. */
1823 if (ferror (stream) || fclose (stream))
1825 fprintf (stderr, "error writing to '%s'\n", filename);
1830 /* ========================================================================= */
1833 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH.
   When unicode_attributes[ch] has a name and a non-empty digit field, the
   field is converted with atoi.
   NOTE(review): the value returned otherwise is on a line not visible here;
   the callers accept -1 through 9, so presumably -1 - confirm.  atoi does
   not report conversion errors. */
1836 get_digit_value (unsigned int ch)
1838 if (unicode_attributes[ch].name != NULL
1839 && unicode_attributes[ch].digit[0] != '\0')
1840 return atoi (unicode_attributes[ch].digit);
1844 /* Output the unit test for the per-character digit value table. */
/* Writes FILENAME as a list of initializer entries of the form
   { code point in hex, value }, separated by a comma and a newline.
   Values come from get_digit_value and are checked to lie in -1 .. 9.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed, the action on a failed range check, and
   the handling after the error messages are not visible in this excerpt. */
1846 output_digit_test (const char *filename, const char *version)
1852 stream = fopen (filename, "w");
1855 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1859 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1860 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1861 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1865 for (ch = 0; ch < 0x110000; ch++)
1867 int value = get_digit_value (ch);
/* Sanity check: anything outside -1 .. 9 is not a digit value. */
1869 if (!(value >= -1 && value < 10))
1875 fprintf (stream, ",\n");
1876 fprintf (stream, " { 0x%04X, %d }", ch, value);
1881 fprintf (stream, "\n");
/* Both a pending stream error and a failing fclose count as write errors. */
1883 if (ferror (stream) || fclose (stream))
1885 fprintf (stderr, "error writing to '%s'\n", filename);
1890 /* Output the per-character digit value table. */
/* Writes FILENAME as generated C source: five digit_header_N macros
   followed by a static const struct named u_digit with the arrays level1,
   level2 and level3.  Each stored entry is 1 + get_digit_value (ch), so the
   stored range is 0 .. 10, and two entries are packed into each level3 byte.
   The table type is struct decdigit_table, shared with
   output_decimal_digit.
   NOTE(review): several lines of this function are not visible in this
   excerpt (error-path handling, the last argument of some fprintf calls,
   the assignment targets of the offsets, the action on a failed range
   check). */
1892 output_digit (const char *filename, const char *version)
1896 struct decdigit_table t;
1897 unsigned int level1_offset, level2_offset, level3_offset;
1899 stream = fopen (filename, "w");
1902 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1906 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1907 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1908 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1913 decdigit_table_init (&t);
1915 for (ch = 0; ch < 0x110000; ch++)
/* The value is biased by one so that every stored entry is non-negative. */
1917 int value = 1 + get_digit_value (ch);
1919 if (!(value >= 0 && value <= 10))
1922 decdigit_table_add (&t, ch, value);
1925 decdigit_table_finalize (&t);
1927 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip, in turn, a header of five uint32_t
   words, the level1 array, and the level2 array.  NOTE(review): the
   assignment targets are not visible; presumably level1_offset,
   level2_offset and level3_offset in that order. */
1929 5 * sizeof (uint32_t);
1931 5 * sizeof (uint32_t)
1932 + t.level1_size * sizeof (uint32_t);
1934 5 * sizeof (uint32_t)
1935 + t.level1_size * sizeof (uint32_t)
1936 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros. */
1938 for (i = 0; i < 5; i++)
1939 fprintf (stream, "#define digit_header_%d %d\n", i,
1940 ((uint32_t *) t.result)[i]);
1941 fprintf (stream, "static const\n");
1942 fprintf (stream, "struct\n");
1943 fprintf (stream, " {\n");
1944 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1945 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1946 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1948 fprintf (stream, " }\n");
1949 fprintf (stream, "u_digit =\n");
1950 fprintf (stream, "{\n");
1951 fprintf (stream, " {");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
/* level1: eight values per output line.  Each entry is printed either as
   -1 or as its offset relative to level2_offset in uint32_t units.
   NOTE(review): the test choosing between the two forms is not visible. */
1954 for (i = 0; i < t.level1_size; i++)
1957 if (i > 0 && (i % 8) == 0)
1958 fprintf (stream, "\n ");
1959 offset = ((uint32_t *) (t.result + level1_offset))[i];
1961 fprintf (stream, " %5d", -1);
1963 fprintf (stream, " %5zu",
1964 (offset - level2_offset) / sizeof (uint32_t));
1965 if (i+1 < t.level1_size)
1966 fprintf (stream, ",");
1968 if (t.level1_size > 8)
1969 fprintf (stream, "\n ");
1970 fprintf (stream, " },\n");
1971 fprintf (stream, " {");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
/* level2: same layout; offsets are relative to level3_offset and counted
   in unpacked one-byte entries. */
1974 for (i = 0; i < t.level2_size << t.q; i++)
1977 if (i > 0 && (i % 8) == 0)
1978 fprintf (stream, "\n ");
1979 offset = ((uint32_t *) (t.result + level2_offset))[i];
1981 fprintf (stream, " %5d", -1);
1983 fprintf (stream, " %5zu",
1984 (offset - level3_offset) / sizeof (uint8_t));
1985 if (i+1 < t.level2_size << t.q)
1986 fprintf (stream, ",");
1988 if (t.level2_size << t.q > 8)
1989 fprintf (stream, "\n ");
1990 fprintf (stream, " },\n");
1991 /* Pack the level3 array. Each entry needs 4 bits only. */
1992 fprintf (stream, " {");
1993 if (t.level3_size << (t.p - 1) > 8)
1994 fprintf (stream, "\n ");
/* Output byte i holds entry 2*i in its low nibble and entry 2*i+1 in its
   high nibble, so the loop runs over half the number of entries. */
1995 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1997 if (i > 0 && (i % 8) == 0)
1998 fprintf (stream, "\n ");
1999 fprintf (stream, " 0x%02x",
2000 ((uint8_t *) (t.result + level3_offset))[2*i]
2001 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2002 if (i+1 < t.level3_size << (t.p - 1))
2003 fprintf (stream, ",");
2005 if (t.level3_size << (t.p - 1) > 8)
2006 fprintf (stream, "\n ");
2007 fprintf (stream, " }\n");
2008 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose count as write errors. */
2010 if (ferror (stream) || fclose (stream))
2012 fprintf (stderr, "error writing to '%s'\n", filename);
2017 /* ========================================================================= */
2019 /* Numeric value. */
2020 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   uses the pair 0, 0 for code points without a numeric value, and
   denominator 1 for plain integers. */
2022 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the code point CH as a fraction.
   When unicode_attributes[ch] has a name and a non-empty numeric field, the
   numerator is the leading integer of the field and the denominator is the
   integer after the slash, or 1 when the field contains no slash.
   Both members are also set to 0 further down; NOTE(review): the branch
   structure around those two assignments and the return statement are not
   visible in this excerpt - presumably the 0, 0 pair is the result for code
   points without a numeric value.  atoi does not report conversion
   errors. */
2024 static uc_fraction_t
2025 get_numeric_value (unsigned int ch)
2027 uc_fraction_t value;
2029 if (unicode_attributes[ch].name != NULL
2030 && unicode_attributes[ch].numeric[0] != '\0')
2032 const char *str = unicode_attributes[ch].numeric;
2033 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first non-digit, so the slash ends the numerator. */
2034 value.numerator = atoi (str);
2035 if (strchr (str, '/') != NULL)
2036 value.denominator = atoi (strchr (str, '/') + 1);
2038 value.denominator = 1;
2042 value.numerator = 0;
2043 value.denominator = 0;
2048 /* Output the unit test for the per-character numeric value table. */
/* Writes FILENAME as a list of initializer entries of the form
   { code point in hex, numerator, denominator }, separated by a comma and a
   newline.  Code points for which get_numeric_value yields the pair 0, 0
   are skipped.
   NOTE(review): the condition guarding the separator and the handling after
   the error messages are not visible in this excerpt. */
2050 output_numeric_test (const char *filename, const char *version)
2056 stream = fopen (filename, "w");
2059 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2063 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2064 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2065 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2069 for (ch = 0; ch < 0x110000; ch++)
2071 uc_fraction_t value = get_numeric_value (ch);
/* Only values other than the pair 0, 0 produce an entry. */
2073 if (value.numerator != 0 || value.denominator != 0)
2076 fprintf (stream, ",\n");
2077 fprintf (stream, " { 0x%04X, %d, %d }",
2078 ch, value.numerator, value.denominator);
2083 fprintf (stream, "\n");
/* Both a pending stream error and a failing fclose count as write errors. */
2085 if (ferror (stream) || fclose (stream))
2087 fprintf (stderr, "error writing to '%s'\n", filename);
2092 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_numeric, which declares a
   struct numeric_table and calls numeric_table_init, numeric_table_add and
   numeric_table_finalize.
   NOTE(review): other lines of this macro group and the line that consumes
   the macros are not visible in this excerpt. */
2093 #define TABLE numeric_table
/* Entries are single bytes holding an index into the fractions list. */
2094 #define ELEMENT uint8_t
/* The x-prefixed allocator names are mapped to plain malloc and realloc. */
2096 #define xmalloc malloc
2097 #define xrealloc realloc
2100 /* Output the per-character numeric value table. */
/* Writes FILENAME as generated C source in two parts.  First an array
   u_numeric_values listing every distinct fraction that occurs; then five
   numeric_header_N macros and a static const struct named u_numeric whose
   entries are indices into that array, bit-packed 7 bits per code point
   into 16-bit units.
   NOTE(review): several lines of this function are not visible in this
   excerpt (error-path handling, loop exits, the initialisation of
   nfractions, the last argument of some fprintf calls, the assignment
   targets of the offsets). */
2102 output_numeric (const char *filename, const char *version)
/* Room for 128 distinct values, the most a 7-bit index can address. */
2105 uc_fraction_t fractions[128];
2106 unsigned int nfractions;
2107 unsigned int ch, i, j;
2108 struct numeric_table t;
2109 unsigned int level1_offset, level2_offset, level3_offset;
2110 uint16_t *level3_packed;
2112 stream = fopen (filename, "w");
2115 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2119 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2120 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2121 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2124 /* Create table of occurring fractions. */
/* For each code point the value is first searched in fractions[].  A value
   not yet present is inserted so that the list stays ordered by ascending
   denominator and, for equal denominators, ascending numerator; the tail is
   shifted up by one slot to make room.  NOTE(review): the statements that
   leave the two search loops and the action taken when the list is already
   full are not visible here. */
2126 for (ch = 0; ch < 0x110000; ch++)
2128 uc_fraction_t value = get_numeric_value (ch);
2130 for (i = 0; i < nfractions; i++)
2131 if (value.numerator == fractions[i].numerator
2132 && value.denominator == fractions[i].denominator)
2134 if (i == nfractions)
2136 if (nfractions == 128)
2138 for (i = 0; i < nfractions; i++)
2139 if (value.denominator < fractions[i].denominator
2140 || (value.denominator == fractions[i].denominator
2141 && value.numerator < fractions[i].numerator))
2143 for (j = nfractions; j > i; j--)
2144 fractions[j] = fractions[j - 1];
2145 fractions[i] = value;
2150 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2152 fprintf (stream, "{\n");
/* One fraction per output line, in list order. */
2153 for (i = 0; i < nfractions; i++)
2155 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2156 fractions[i].denominator);
2157 if (i+1 < nfractions)
2158 fprintf (stream, ",");
2159 fprintf (stream, "\n");
2161 fprintf (stream, "};\n");
2165 numeric_table_init (&t);
/* Second pass: look each value up again and store its list index. */
2167 for (ch = 0; ch < 0x110000; ch++)
2169 uc_fraction_t value = get_numeric_value (ch);
2171 for (i = 0; i < nfractions; i++)
2172 if (value.numerator == fractions[i].numerator
2173 && value.denominator == fractions[i].denominator)
2175 if (i == nfractions)
2178 numeric_table_add (&t, ch, i);
2181 numeric_table_finalize (&t);
2183 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip, in turn, a header of five uint32_t
   words, the level1 array, and the level2 array.  NOTE(review): the
   assignment targets are not visible; presumably level1_offset,
   level2_offset and level3_offset in that order. */
2185 5 * sizeof (uint32_t);
2187 5 * sizeof (uint32_t)
2188 + t.level1_size * sizeof (uint32_t);
2190 5 * sizeof (uint32_t)
2191 + t.level1_size * sizeof (uint32_t)
2192 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros. */
2194 for (i = 0; i < 5; i++)
2195 fprintf (stream, "#define numeric_header_%d %d\n", i,
2196 ((uint32_t *) t.result)[i]);
2197 fprintf (stream, "static const\n");
2198 fprintf (stream, "struct\n");
2199 fprintf (stream, " {\n");
2200 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2201 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Declared size of the packed level3 array: 7 bits per entry in 16-bit
   units, plus one extra unit. */
2202 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2203 (1 << t.p) * 7 / 16);
2204 fprintf (stream, " }\n");
2205 fprintf (stream, "u_numeric =\n");
2206 fprintf (stream, "{\n");
2207 fprintf (stream, " {");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
/* level1: eight values per output line.  Each entry is printed either as
   -1 or as its offset relative to level2_offset in uint32_t units.
   NOTE(review): the test choosing between the two forms is not visible. */
2210 for (i = 0; i < t.level1_size; i++)
2213 if (i > 0 && (i % 8) == 0)
2214 fprintf (stream, "\n ");
2215 offset = ((uint32_t *) (t.result + level1_offset))[i];
2217 fprintf (stream, " %5d", -1);
2219 fprintf (stream, " %5zu",
2220 (offset - level2_offset) / sizeof (uint32_t));
2221 if (i+1 < t.level1_size)
2222 fprintf (stream, ",");
2224 if (t.level1_size > 8)
2225 fprintf (stream, "\n ");
2226 fprintf (stream, " },\n");
2227 fprintf (stream, " {");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
/* level2: same layout; offsets are relative to level3_offset and counted
   in unpacked one-byte entries. */
2230 for (i = 0; i < t.level2_size << t.q; i++)
2233 if (i > 0 && (i % 8) == 0)
2234 fprintf (stream, "\n ");
2235 offset = ((uint32_t *) (t.result + level2_offset))[i];
2237 fprintf (stream, " %5d", -1);
2239 fprintf (stream, " %5zu",
2240 (offset - level3_offset) / sizeof (uint8_t));
2241 if (i+1 < t.level2_size << t.q)
2242 fprintf (stream, ",");
2244 if (t.level2_size << t.q > 8)
2245 fprintf (stream, "\n ");
2246 fprintf (stream, " },\n");
2247 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2248 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-filled by calloc so that values can be ORed in.
   NOTE(review): no check of the calloc result is visible in this excerpt. */
2251 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*7 .. i*7+6 of the packed stream.  The j and k
   declared inside the loop body are the 16-bit unit holding the first bit
   and the bit position inside that unit.  Units j and j+1 are combined into
   one 32-bit window so that an entry straddling a unit boundary is written
   in one step. */
2252 for (i = 0; i < t.level3_size << t.p; i++)
2254 unsigned int j = (i * 7) / 16;
2255 unsigned int k = (i * 7) % 16;
2256 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2257 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2258 level3_packed[j] = value & 0xffff;
2259 level3_packed[j+1] = value >> 16;
2261 fprintf (stream, " {");
2262 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2263 fprintf (stream, "\n ");
/* Packed units are printed as four-digit hex, eight per output line. */
2264 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2266 if (i > 0 && (i % 8) == 0)
2267 fprintf (stream, "\n ");
2268 fprintf (stream, " 0x%04x", level3_packed[i]);
2269 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2270 fprintf (stream, ",");
2272 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2273 fprintf (stream, "\n ");
2274 fprintf (stream, " }\n");
2275 free (level3_packed);
2276 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose count as write errors. */
2278 if (ferror (stream) || fclose (stream))
2280 fprintf (stderr, "error writing to '%s'\n", filename);
2285 /* ========================================================================= */
2288 /* See Unicode 3.0 book, section 4.7,
2291 /* List of mirrored character pairs. This is a subset of the characters
2292 having the BidiMirrored property. */
/* Each row holds two code points.  get_mirror_value searches both columns:
   a match in column 0 selects column 1 and a match in column 1 selects
   column 0.  NOTE(review): the initializer rows are not visible in this
   excerpt. */
2293 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information for the code point CH.
   The flag mirrored is true when unicode_attributes[ch] has a name and its
   mirrored field is set.  mirror_char starts as the sentinel 0xfffd and is
   replaced by the partner of CH when CH occurs in either column of
   mirror_pairs.  One return path yields the signed difference
   mirror_char - ch, i.e. a relative offset rather than a code point.
   NOTE(review): the conditions selecting the return paths, the other return
   values, and the action after the final sentinel test are not visible in
   this excerpt. */
2350 get_mirror_value (unsigned int ch)
2353 unsigned int mirror_char;
2356 mirrored = (unicode_attributes[ch].name != NULL
2357 && unicode_attributes[ch].mirrored);
2358 mirror_char = 0xfffd;
/* Linear search over all rows, looking at both columns. */
2359 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2360 if (ch == mirror_pairs[i][0])
2362 mirror_char = mirror_pairs[i][1];
2365 else if (ch == mirror_pairs[i][1])
2367 mirror_char = mirror_pairs[i][0];
2371 return (int) mirror_char - (int) ch;
/* True when a partner was found in mirror_pairs. */
2374 if (mirror_char != 0xfffd)
2380 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_mirror, which declares a
   struct mirror_table and calls mirror_table_init, mirror_table_add and
   mirror_table_finalize.
   NOTE(review): other lines of this macro group and the line that consumes
   the macros are not visible in this excerpt. */
2381 #define TABLE mirror_table
/* Entries are signed 32-bit values, as produced by get_mirror_value. */
2382 #define ELEMENT int32_t
/* The x-prefixed allocator names are mapped to plain malloc and realloc. */
2384 #define xmalloc malloc
2385 #define xrealloc realloc
2388 /* Output the per-character mirror table. */
/* Writes FILENAME as generated C source: five mirror_header_N macros
   followed by a static const struct named u_mirror with the arrays level1,
   level2 and level3.  The level3 entries are the values of get_mirror_value
   and are written unpacked, as int.
   NOTE(review): several lines of this function are not visible in this
   excerpt (error-path handling, the last argument of the third header
   fprintf, the assignment targets of the offsets). */
2390 output_mirror (const char *filename, const char *version)
2394 struct mirror_table t;
2395 unsigned int level1_offset, level2_offset, level3_offset;
2397 stream = fopen (filename, "w");
2400 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2404 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2405 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2406 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2411 mirror_table_init (&t);
/* Every code point from 0 to 0x10FFFF is entered. */
2413 for (ch = 0; ch < 0x110000; ch++)
2415 int value = get_mirror_value (ch);
2417 mirror_table_add (&t, ch, value);
2420 mirror_table_finalize (&t);
2422 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip, in turn, a header of five uint32_t
   words, the level1 array, and the level2 array.  NOTE(review): the
   assignment targets are not visible; presumably level1_offset,
   level2_offset and level3_offset in that order. */
2424 5 * sizeof (uint32_t);
2426 5 * sizeof (uint32_t)
2427 + t.level1_size * sizeof (uint32_t);
2429 5 * sizeof (uint32_t)
2430 + t.level1_size * sizeof (uint32_t)
2431 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor macros. */
2433 for (i = 0; i < 5; i++)
2434 fprintf (stream, "#define mirror_header_%d %d\n", i,
2435 ((uint32_t *) t.result)[i]);
2436 fprintf (stream, "static const\n");
2437 fprintf (stream, "struct\n");
2438 fprintf (stream, " {\n");
2439 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2440 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2441 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2442 fprintf (stream, " }\n");
2443 fprintf (stream, "u_mirror =\n");
2444 fprintf (stream, "{\n");
2445 fprintf (stream, " {");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
/* level1: eight values per output line.  Each entry is printed either as
   -1 or as its offset relative to level2_offset in uint32_t units.
   NOTE(review): the test choosing between the two forms is not visible. */
2448 for (i = 0; i < t.level1_size; i++)
2451 if (i > 0 && (i % 8) == 0)
2452 fprintf (stream, "\n ");
2453 offset = ((uint32_t *) (t.result + level1_offset))[i];
2455 fprintf (stream, " %5d", -1);
2457 fprintf (stream, " %5zu",
2458 (offset - level2_offset) / sizeof (uint32_t));
2459 if (i+1 < t.level1_size)
2460 fprintf (stream, ",");
2462 if (t.level1_size > 8)
2463 fprintf (stream, "\n ");
2464 fprintf (stream, " },\n");
2465 fprintf (stream, " {");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
/* level2: same layout; offsets are relative to level3_offset and counted
   in int32_t elements, matching the element type of this table. */
2468 for (i = 0; i < t.level2_size << t.q; i++)
2471 if (i > 0 && (i % 8) == 0)
2472 fprintf (stream, "\n ");
2473 offset = ((uint32_t *) (t.result + level2_offset))[i];
2475 fprintf (stream, " %5d", -1);
2477 fprintf (stream, " %5zu",
2478 (offset - level3_offset) / sizeof (int32_t));
2479 if (i+1 < t.level2_size << t.q)
2480 fprintf (stream, ",");
2482 if (t.level2_size << t.q > 8)
2483 fprintf (stream, "\n ");
2484 fprintf (stream, " },\n");
2485 fprintf (stream, " {");
2486 if (t.level3_size << t.p > 8)
2487 fprintf (stream, "\n ");
/* level3: the signed values themselves, eight per output line. */
2488 for (i = 0; i < t.level3_size << t.p; i++)
2490 if (i > 0 && (i % 8) == 0)
2491 fprintf (stream, "\n ");
2492 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2493 if (i+1 < t.level3_size << t.p)
2494 fprintf (stream, ",");
2496 if (t.level3_size << t.p > 8)
2497 fprintf (stream, "\n ");
2498 fprintf (stream, " }\n");
2499 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose count as write errors. */
2501 if (ferror (stream) || fclose (stream))
2503 fprintf (stderr, "error writing to '%s'\n", filename);
2508 /* ========================================================================= */
2510 /* Particular values of the word break property. */
/* Returns true exactly when CH is one of the eight code points listed in
   the comparison below; the set is hard-coded rather than read from a data
   file.  NOTE(review): the return type line is not visible in this
   excerpt. */
2513 is_WBP_MIDNUMLET (unsigned int ch)
2515 return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
2516 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
/* Returns true exactly when CH is one of the eight code points listed in
   the comparison below; the set is hard-coded rather than read from a data
   file.  NOTE(review): the return type line is not visible in this
   excerpt. */
2520 is_WBP_MIDLETTER (unsigned int ch)
2522 return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
2523 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
2526 /* ========================================================================= */
2530 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties uses each one as a bit position
   (1ULL shifted left by the identifier) in an unsigned long long entry of
   unicode_properties[], so every value must stay below the bit width of
   unsigned long long.
   NOTE(review): the line opening this list and a number of the enumerators
   named in fill_properties are not visible in this excerpt. */
2539 PROP_QUOTATION_MARK,
2540 PROP_TERMINAL_PUNCTUATION,
2543 PROP_ASCII_HEX_DIGIT,
2544 PROP_OTHER_ALPHABETIC,
2548 PROP_OTHER_LOWERCASE,
2549 PROP_OTHER_UPPERCASE,
2550 PROP_NONCHARACTER_CODE_POINT,
2551 PROP_OTHER_GRAPHEME_EXTEND,
2552 PROP_IDS_BINARY_OPERATOR,
2553 PROP_IDS_TRINARY_OPERATOR,
2555 PROP_UNIFIED_IDEOGRAPH,
2556 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2559 PROP_LOGICAL_ORDER_EXCEPTION,
2560 PROP_OTHER_ID_START,
2561 PROP_OTHER_ID_CONTINUE,
2563 PROP_VARIATION_SELECTOR,
2564 PROP_PATTERN_WHITE_SPACE,
2565 PROP_PATTERN_SYNTAX,
2566 /* DerivedCoreProperties.txt */
2572 PROP_CASE_IGNORABLE,
2573 PROP_CHANGES_WHEN_LOWERCASED,
2574 PROP_CHANGES_WHEN_UPPERCASED,
2575 PROP_CHANGES_WHEN_TITLECASED,
2576 PROP_CHANGES_WHEN_CASEFOLDED,
2577 PROP_CHANGES_WHEN_CASEMAPPED,
2582 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2583 PROP_GRAPHEME_EXTEND,
/* One bit set per code point, indexed 0 .. 0x10FFFF.  Bit N of an entry is
   set by fill_properties when the property with identifier N applies to
   that code point; clear_properties resets all entries to 0. */
2587 unsigned long long unicode_properties[0x110000];
/* Resets every entry of unicode_properties[] to 0, i.e. no property is
   recorded for any code point.  Takes no arguments. */
2590 clear_properties (void)
2594 for (i = 0; i < 0x110000; i++)
2595 unicode_properties[i] = 0;
2598 /* Stores in unicode_properties[] the properties from the
2599 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is read one line at a time.  Empty lines and lines
   starting with a hash sign are recognised first.  Every other line must
   have one of the forms
     XXXX..YYYY ; Name     (a range of code points)
     XXXX ; Name           (a single code point)
   with hexadecimal code points.  Name is mapped to a PROP_ identifier and
   the corresponding bit is ORed into unicode_properties[] for each code
   point of the range.  Bits are only ever added, so the function can be
   called for several files in turn.
   Errors (open failure, unparsable line, unknown property name, read
   failure) are reported on stderr.
   NOTE(review): the declaration of buf, the loop around the line reader,
   the statements taken after each test, and the handling after the error
   messages are not visible in this excerpt. */
2601 fill_properties (const char *proplist_filename)
2606 stream = fopen (proplist_filename, "r");
2609 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2616 unsigned int i1, i2;
2617 char padding[200+1];
2618 char propname[200+1];
2619 unsigned int propvalue;
/* Reads at most 200 characters up to the end of the line, then consumes
   the newline and any following whitespace. */
2621 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2624 if (buf[0] == '\0' || buf[0] == '#')
/* The scansets below carry no field width.  They read from buf, which the
   fscanf above limits to 200 characters, the capacity of padding and
   propname without their terminator. */
2627 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2629 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2631 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* PROP expands to an if statement ending in else, so the invocations that
   follow form a single chain of string comparisons. */
2636 #define PROP(name,value) \
2637 if (strcmp (propname, name) == 0) propvalue = value; else
2639 PROP ("White_Space", PROP_WHITE_SPACE)
2640 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2641 PROP ("Join_Control", PROP_JOIN_CONTROL)
2642 PROP ("Dash", PROP_DASH)
2643 PROP ("Hyphen", PROP_HYPHEN)
2644 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2645 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2646 PROP ("Other_Math", PROP_OTHER_MATH)
2647 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2648 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2649 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2650 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2651 PROP ("Diacritic", PROP_DIACRITIC)
2652 PROP ("Extender", PROP_EXTENDER)
2653 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2654 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2655 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2656 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2657 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2658 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2659 PROP ("Radical", PROP_RADICAL)
2660 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2661 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2662 PROP ("Deprecated", PROP_DEPRECATED)
2663 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2664 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2665 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2666 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2667 PROP ("STerm", PROP_STERM)
2668 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2669 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2670 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2671 /* DerivedCoreProperties.txt */
2672 PROP ("Math", PROP_MATH)
2673 PROP ("Alphabetic", PROP_ALPHABETIC)
2674 PROP ("Lowercase", PROP_LOWERCASE)
2675 PROP ("Uppercase", PROP_UPPERCASE)
2676 PROP ("Cased", PROP_CASED)
2677 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2678 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2679 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2680 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2681 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2682 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2683 PROP ("ID_Start", PROP_ID_START)
2684 PROP ("ID_Continue", PROP_ID_CONTINUE)
2685 PROP ("XID_Start", PROP_XID_START)
2686 PROP ("XID_Continue", PROP_XID_CONTINUE)
2687 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2688 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2689 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2690 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the comparison chain: no known property name matched. */
2693 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must end inside the Unicode code space. */
2697 if (!(i1 <= i2 && i2 < 0x110000))
2700 for (i = i1; i <= i2; i++)
2701 unicode_properties[i] |= 1ULL << propvalue;
/* Both a pending stream error and a failing fclose count as read errors. */
2704 if (ferror (stream) || fclose (stream))
2706 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* fill_property30: scans the Unicode 3.0 style property dump
   PROPLIST_FILENAME for the section whose heading contains PROPERTY_NAME,
   then parses the code point lines that follow ("XXXX..YYYY" ranges or
   single "XXXX" values, four hex digits each) and walks ARRAY over each
   parsed range.
   NOTE(review): the bodies of both `for` loops, the local declarations and
   the statements guarded by the error checks are not visible in this
   excerpt, so the values written into ARRAY cannot be confirmed from here.
   The original leading comment below is also missing its closing line in
   this excerpt.  */
2711 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2714 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2720 for (i = 0; i < 0x110000; i++)
2723 stream = fopen (proplist_filename, "r");
2726 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2730 /* Search for the "Property dump for: ..." line. */
2733 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2735 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
/* Keep reading until a line mentioning PROPERTY_NAME has been found.  */
2739 while (strstr (buf, property_name) == NULL);
2743 unsigned int i1, i2;
/* Read the next line of the section (at most 100 characters).  */
2745 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line with ".." at offsets 4-5 is a range "XXXX..YYYY".  */
2749 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2751 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2753 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least four characters is a single code point.  */
2758 else if (strlen (buf) >= 4)
2760 if (sscanf (buf, "%4X", &i1) < 1)
2762 fprintf (stderr, "parse error in property in '%s'\n",
/* Anything shorter is reported as a parse error.  */
2770 fprintf (stderr, "parse error in property in '%s'\n",
/* Sanity check: the range must be ordered and within the Unicode code space.  */
2774 if (!(i1 <= i2 && i2 < 0x110000))
2776 for (i = i1; i <= i2; i++)
/* A read error, or a failing fclose, is reported as an error too.  */
2780 if (ferror (stream) || fclose (stream))
2782 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Loads both Unicode 3.0 properties from PROPLIST30_FILENAME by calling
   fill_property30 once per property, in the order listed in the table.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  static const struct
  {
    char *table;
    const char *section;
  }
  wanted[] =
    {
      { unicode_pairedpunctuation, "(Paired Punctuation)" },
      { unicode_leftofpair, "(Left of Pair)" }
    };
  unsigned int k;

  for (k = 0; k < sizeof wanted / sizeof wanted[0]; k++)
    fill_property30 (wanted[k].table, proplist30_filename, wanted[k].section);
}
2802 /* ------------------------------------------------------------------------- */
2804 /* See PropList.txt, UCD.html. */
2806 is_property_white_space (unsigned int ch)
2808 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2811 /* See Unicode 3.0 book, section 4.10,
2812 PropList.txt, UCD.html,
2813 DerivedCoreProperties.txt, UCD.html. */
/* is_property_alphabetic: evaluates the property two ways and compares them.
   The first value is an `||` chain: the PROP_OTHER_ALPHABETIC bit plus the
   hard-coded code point ranges listed below (any operand preceding them is
   not visible in this excerpt).  The second value is the PROP_ALPHABETIC
   bit of unicode_properties[ch].
   NOTE(review): the statement executed when the two values differ, and the
   return statement, are not visible here -- confirm against the full file.  */
2815 is_property_alphabetic (unsigned int ch)
2819 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2820 /* For some reason, the following are listed as having property
2821 Alphabetic but not as having property Other_Alphabetic. */
2822 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2823 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2824 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2825 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2826 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2827 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2828 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2829 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2830 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2831 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2832 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2833 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2834 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* Value taken directly from the Alphabetic property bit.  */
2836 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the two computations.  */
2838 if (result1 != result2)
2843 /* See PropList.txt, UCD.html. */
2845 is_property_other_alphabetic (unsigned int ch)
2847 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2850 /* See PropList.txt, UCD.html. */
2852 is_property_not_a_character (unsigned int ch)
2854 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2857 /* See PropList.txt, UCD.html,
2858 DerivedCoreProperties.txt, UCD.html. */
/* is_property_default_ignorable_code_point: evaluates the property two ways
   and compares them.  The first value is: category Cf with the listed
   exceptions removed, or the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit, or
   the PROP_VARIATION_SELECTOR bit.  The second value is the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit.
   NOTE(review): the declarations receiving the two values, the statement
   executed when they differ, and the return statement are not visible in
   this excerpt.  */
2860 is_property_default_ignorable_code_point (unsigned int ch)
2863 (is_category_Cf (ch)
2864 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2865 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
2866 /* For some reason, the following are not listed as having property
2867 Default_Ignorable_Code_Point. */
2868 && !(ch == 0x110BD))
2869 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2870 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Value taken directly from the Default_Ignorable_Code_Point property bit.  */
2872 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Consistency check between the two computations.  */
2874 if (result1 != result2)
2879 /* See PropList.txt, UCD.html. */
2881 is_property_other_default_ignorable_code_point (unsigned int ch)
2883 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2886 /* See PropList.txt, UCD.html. */
2888 is_property_deprecated (unsigned int ch)
2890 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2893 /* See PropList.txt, UCD.html. */
2895 is_property_logical_order_exception (unsigned int ch)
2897 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2900 /* See PropList.txt, UCD.html. */
2902 is_property_variation_selector (unsigned int ch)
2904 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* See PropList-3.0.1.txt.
   True iff CH lies in one of three hard-coded private use ranges.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* See PropList-3.0.1.txt.
   True iff is_category_Cn (ch) holds and is_property_not_a_character (ch)
   does not.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2924 /* See PropList.txt, UCD.html,
2925 DerivedCoreProperties.txt, UCD.html. */
/* is_property_uppercase: the visible code tests the PROP_OTHER_UPPERCASE bit
   (as the last operand of an `||` chain whose earlier operands are not
   visible in this excerpt) and, separately, the PROP_UPPERCASE bit, then
   compares the two results.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible here -- confirm against the full file.  */
2927 is_property_uppercase (unsigned int ch)
2931 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2933 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2935 if (result1 != result2)
2940 /* See PropList.txt, UCD.html. */
2942 is_property_other_uppercase (unsigned int ch)
2944 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2947 /* See PropList.txt, UCD.html,
2948 DerivedCoreProperties.txt, UCD.html. */
/* is_property_lowercase: the visible code tests the PROP_OTHER_LOWERCASE bit
   (as the last operand of an `||` chain whose earlier operands are not
   visible in this excerpt) and, separately, the PROP_LOWERCASE bit, then
   compares the two results.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible here -- confirm against the full file.  */
2950 is_property_lowercase (unsigned int ch)
2954 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2956 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2958 if (result1 != result2)
2963 /* See PropList.txt, UCD.html. */
2965 is_property_other_lowercase (unsigned int ch)
2967 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* See PropList-3.0.1.txt.
   Delegates to is_category_Lt (ch).  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool in_category = is_category_Lt (ch);

  return in_category;
}
2977 /* See DerivedCoreProperties.txt. */
/* is_property_cased: computes the property as "lowercase, uppercase, or
   category Lt" (result1) and reads the PROP_CASED bit (result2), then
   compares the two.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible in this excerpt.  */
2979 is_property_cased (unsigned int ch)
2981 bool result1 = (is_property_lowercase (ch)
2982 || is_property_uppercase (ch)
2983 || is_category_Lt (ch));
2984 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
2986 if (result1 != result2)
2991 /* See DerivedCoreProperties.txt. */
/* is_property_case_ignorable: computes the property from the word break
   predicates is_WBP_MIDLETTER / is_WBP_MIDNUMLET and the general category
   predicates Mn, Me, Cf, Lm, Sk (result1), reads the PROP_CASE_IGNORABLE
   bit (result2), then compares the two.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible in this excerpt.  */
2993 is_property_case_ignorable (unsigned int ch)
2995 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
2996 || is_category_Mn (ch)
2997 || is_category_Me (ch)
2998 || is_category_Cf (ch)
2999 || is_category_Lm (ch)
3000 || is_category_Sk (ch));
3001 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3003 if (result1 != result2)
3008 /* See DerivedCoreProperties.txt. */
/* is_property_changes_when_lowercased: reads the
   PROP_CHANGES_WHEN_LOWERCASED bit (result1) and independently checks that
   the character has a name and a `lower` mapping that is neither NONE nor
   the character itself (result2), then compares the two.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible in this excerpt.  */
3010 is_property_changes_when_lowercased (unsigned int ch)
3012 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3013 bool result2 = (unicode_attributes[ch].name != NULL
3014 && unicode_attributes[ch].lower != NONE
3015 && unicode_attributes[ch].lower != ch);
3017 if (result1 != result2)
3022 /* See DerivedCoreProperties.txt. */
3024 is_property_changes_when_uppercased (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3029 /* See DerivedCoreProperties.txt. */
3031 is_property_changes_when_titlecased (unsigned int ch)
3033 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3036 /* See DerivedCoreProperties.txt. */
3038 is_property_changes_when_casefolded (unsigned int ch)
3040 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3043 /* See DerivedCoreProperties.txt. */
3045 is_property_changes_when_casemapped (unsigned int ch)
3047 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3050 /* See PropList.txt, UCD.html. */
3052 is_property_soft_dotted (unsigned int ch)
3054 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3057 /* See DerivedCoreProperties.txt, UCD.html. */
3059 is_property_id_start (unsigned int ch)
3061 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3064 /* See PropList.txt, UCD.html. */
3066 is_property_other_id_start (unsigned int ch)
3068 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3071 /* See DerivedCoreProperties.txt, UCD.html. */
3073 is_property_id_continue (unsigned int ch)
3075 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3078 /* See PropList.txt, UCD.html. */
3080 is_property_other_id_continue (unsigned int ch)
3082 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3085 /* See DerivedCoreProperties.txt, UCD.html. */
3087 is_property_xid_start (unsigned int ch)
3089 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3092 /* See DerivedCoreProperties.txt, UCD.html. */
3094 is_property_xid_continue (unsigned int ch)
3096 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3099 /* See PropList.txt, UCD.html. */
3101 is_property_pattern_white_space (unsigned int ch)
3103 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3106 /* See PropList.txt, UCD.html. */
3108 is_property_pattern_syntax (unsigned int ch)
3110 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3113 /* See PropList.txt, UCD.html. */
3115 is_property_join_control (unsigned int ch)
3117 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3120 /* See DerivedCoreProperties.txt, UCD.html. */
3122 is_property_grapheme_base (unsigned int ch)
3124 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3127 /* See DerivedCoreProperties.txt, UCD.html. */
3129 is_property_grapheme_extend (unsigned int ch)
3131 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3134 /* See PropList.txt, UCD.html. */
3136 is_property_other_grapheme_extend (unsigned int ch)
3138 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3141 /* See DerivedCoreProperties.txt, UCD.html. */
3143 is_property_grapheme_link (unsigned int ch)
3145 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3148 /* See PropList.txt, UCD.html. */
3150 is_property_bidi_control (unsigned int ch)
3152 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3155 /* See PropList-3.0.1.txt. */
3157 is_property_bidi_left_to_right (unsigned int ch)
3159 return (get_bidi_category (ch) == UC_BIDI_L);
3162 /* See PropList-3.0.1.txt. */
3164 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3166 return (get_bidi_category (ch) == UC_BIDI_R);
3169 /* See PropList-3.0.1.txt. */
3171 is_property_bidi_arabic_right_to_left (unsigned int ch)
3173 return (get_bidi_category (ch) == UC_BIDI_AL);
3176 /* See PropList-3.0.1.txt. */
3178 is_property_bidi_european_digit (unsigned int ch)
3180 return (get_bidi_category (ch) == UC_BIDI_EN);
3183 /* See PropList-3.0.1.txt. */
3185 is_property_bidi_eur_num_separator (unsigned int ch)
3187 return (get_bidi_category (ch) == UC_BIDI_ES);
3190 /* See PropList-3.0.1.txt. */
3192 is_property_bidi_eur_num_terminator (unsigned int ch)
3194 return (get_bidi_category (ch) == UC_BIDI_ET);
3197 /* See PropList-3.0.1.txt. */
3199 is_property_bidi_arabic_digit (unsigned int ch)
3201 return (get_bidi_category (ch) == UC_BIDI_AN);
3204 /* See PropList-3.0.1.txt. */
3206 is_property_bidi_common_separator (unsigned int ch)
3208 return (get_bidi_category (ch) == UC_BIDI_CS);
3211 /* See PropList-3.0.1.txt. */
3213 is_property_bidi_block_separator (unsigned int ch)
3215 return (get_bidi_category (ch) == UC_BIDI_B);
3218 /* See PropList-3.0.1.txt. */
3220 is_property_bidi_segment_separator (unsigned int ch)
3222 return (get_bidi_category (ch) == UC_BIDI_S);
3225 /* See PropList-3.0.1.txt. */
3227 is_property_bidi_whitespace (unsigned int ch)
3229 return (get_bidi_category (ch) == UC_BIDI_WS);
3232 /* See PropList-3.0.1.txt. */
3234 is_property_bidi_non_spacing_mark (unsigned int ch)
3236 return (get_bidi_category (ch) == UC_BIDI_NSM);
3239 /* See PropList-3.0.1.txt. */
3241 is_property_bidi_boundary_neutral (unsigned int ch)
3243 return (get_bidi_category (ch) == UC_BIDI_BN);
3246 /* See PropList-3.0.1.txt. */
3248 is_property_bidi_pdf (unsigned int ch)
3250 return (get_bidi_category (ch) == UC_BIDI_PDF);
3253 /* See PropList-3.0.1.txt. */
3255 is_property_bidi_embedding_or_override (unsigned int ch)
3257 int category = get_bidi_category (ch);
3258 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3259 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3262 /* See PropList-3.0.1.txt. */
3264 is_property_bidi_other_neutral (unsigned int ch)
3266 return (get_bidi_category (ch) == UC_BIDI_ON);
3269 /* See PropList.txt, UCD.html. */
3271 is_property_hex_digit (unsigned int ch)
3273 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3276 /* See PropList.txt, UCD.html. */
3278 is_property_ascii_hex_digit (unsigned int ch)
3280 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3283 /* See Unicode 3.0 book, section 4.10,
3284 PropList.txt, UCD.html. */
3286 is_property_ideographic (unsigned int ch)
3288 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3291 /* See PropList.txt, UCD.html. */
3293 is_property_unified_ideograph (unsigned int ch)
3295 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3298 /* See PropList.txt, UCD.html. */
3300 is_property_radical (unsigned int ch)
3302 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3305 /* See PropList.txt, UCD.html. */
3307 is_property_ids_binary_operator (unsigned int ch)
3309 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3312 /* See PropList.txt, UCD.html. */
3314 is_property_ids_trinary_operator (unsigned int ch)
3316 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3319 /* See PropList-3.0.1.txt. */
3321 is_property_zero_width (unsigned int ch)
3323 return is_category_Cf (ch)
3324 || (unicode_attributes[ch].name != NULL
3325 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* See PropList-3.0.1.txt.
   Delegates to is_category_Zs (ch).  */
static bool
is_property_space (unsigned int ch)
{
  const bool in_category = is_category_Zs (ch);

  return in_category;
}
/* See PropList-3.0.1.txt.
   True iff CH is one of the fixed code points listed below.
   NOTE(review): the original comment describing this set ("characters
   having line breaking ...") is cut off in the reviewed excerpt.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3359 /* See PropList-3.0.1.txt. */
/* is_property_iso_control: the visible code evaluates two tests -- whether
   the character's name is exactly "<control>", and is_category_Cc (ch) --
   and then compares result1 with result2.
   NOTE(review): the declarations receiving the two values, the statement
   executed on a mismatch, and the return statement are not visible in this
   excerpt.  */
3361 is_property_iso_control (unsigned int ch)
3364 (unicode_attributes[ch].name != NULL
3365 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3367 is_category_Cc (ch);
3369 if (result1 != result2)
3374 /* See PropList-3.0.1.txt. */
/* is_property_format_control: the visible conditions require category Cf,
   bidi category UC_BIDI_BN, and that the character is not a join control.
   NOTE(review): the expression is not terminated within this excerpt, so
   further operands may follow -- confirm against the full file.  */
3376 is_property_format_control (unsigned int ch)
3378 return (is_category_Cf (ch)
3379 && get_bidi_category (ch) == UC_BIDI_BN
3380 && !is_property_join_control (ch)
3384 /* See PropList.txt, UCD.html. */
3386 is_property_dash (unsigned int ch)
3388 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3391 /* See PropList.txt, UCD.html. */
3393 is_property_hyphen (unsigned int ch)
3395 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* See PropList-3.0.1.txt.
   Delegates to is_category_P (ch).  */
static bool
is_property_punctuation (unsigned int ch)
{
  const bool in_category = is_category_P (ch);

  return in_category;
}
/* See PropList-3.0.1.txt.
   Delegates to is_category_Zl (ch).  */
static bool
is_property_line_separator (unsigned int ch)
{
  const bool in_category = is_category_Zl (ch);

  return in_category;
}
/* See PropList-3.0.1.txt.
   Delegates to is_category_Zp (ch).  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  const bool in_category = is_category_Zp (ch);

  return in_category;
}
3419 /* See PropList.txt, UCD.html. */
3421 is_property_quotation_mark (unsigned int ch)
3423 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3426 /* See PropList.txt, UCD.html. */
3428 is_property_sentence_terminal (unsigned int ch)
3430 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3433 /* See PropList.txt, UCD.html. */
3435 is_property_terminal_punctuation (unsigned int ch)
3437 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* See PropList-3.0.1.txt.
   Delegates to is_category_Sc (ch).  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  const bool in_category = is_category_Sc (ch);

  return in_category;
}
3447 /* See Unicode 3.0 book, section 4.9,
3448 PropList.txt, UCD.html,
3449 DerivedCoreProperties.txt, UCD.html. */
/* is_property_math: the visible code tests the PROP_OTHER_MATH bit (as the
   last operand of an `||` chain whose earlier operands are not visible in
   this excerpt) and, separately, the PROP_MATH bit, then compares the two
   results.
   NOTE(review): the statement executed on a mismatch and the return
   statement are not visible here -- confirm against the full file.  */
3451 is_property_math (unsigned int ch)
3455 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3457 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3459 if (result1 != result2)
3464 /* See PropList.txt, UCD.html. */
3466 is_property_other_math (unsigned int ch)
3468 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3471 /* See PropList-3.0.1.txt. */
3473 is_property_paired_punctuation (unsigned int ch)
3475 return unicode_pairedpunctuation[ch];
3478 /* See PropList-3.0.1.txt. */
3480 is_property_left_of_pair (unsigned int ch)
3482 return unicode_leftofpair[ch];
3485 /* See PropList-3.0.1.txt. */
3487 is_property_combining (unsigned int ch)
3489 return (unicode_attributes[ch].name != NULL
3490 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3491 || is_category_Mc (ch)
3492 || is_category_Me (ch)
3493 || is_category_Mn (ch)));
/* Disabled variant: compiled out by the `#if 0` below.  The visible body
   requires a named character whose bidi category is UC_BIDI_NSM.
   NOTE(review): the matching `#endif` is not visible in this excerpt.  */
3496 #if 0 /* same as is_property_bidi_non_spacing_mark */
3497 /* See PropList-3.0.1.txt. */
3499 is_property_non_spacing (unsigned int ch)
3501 return (unicode_attributes[ch].name != NULL
3502 && get_bidi_category (ch) == UC_BIDI_NSM);
3506 /* See PropList-3.0.1.txt. */
3508 is_property_composite (unsigned int ch)
3510 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3511 logical in some sense. */
3512 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3514 if (unicode_attributes[ch].name != NULL
3515 && unicode_attributes[ch].decomposition != NULL)
3517 /* Test whether the decomposition contains more than one character,
3518 and the first is not a space. */
3519 const char *decomp = unicode_attributes[ch].decomposition;
3520 if (decomp[0] == '<')
3522 decomp = strchr (decomp, '>') + 1;
3523 if (decomp[0] == ' ')
3526 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* See PropList-3.0.1.txt.
   Delegates to is_category_Nd (ch).  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  const bool in_category = is_category_Nd (ch);

  return in_category;
}
3538 /* See PropList-3.0.1.txt. */
3540 is_property_numeric (unsigned int ch)
3542 return ((get_numeric_value (ch)).denominator > 0)
3543 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3544 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3547 /* See PropList.txt, UCD.html. */
3549 is_property_diacritic (unsigned int ch)
3551 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3554 /* See PropList.txt, UCD.html. */
3556 is_property_extender (unsigned int ch)
3558 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3561 /* See PropList-3.0.1.txt. */
/* is_property_ignorable_control: the visible part of the expression accepts
   characters that are (category Cc with bidi category UC_BIDI_BN) or
   category Cf.
   NOTE(review): the return expression is not terminated within this
   excerpt, so a further operand may follow -- confirm against the full
   file.  */
3563 is_property_ignorable_control (unsigned int ch)
3565 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3566 || is_category_Cf (ch))
3570 /* ------------------------------------------------------------------------- */
/* output_properties: for each property P listed below, the PROPERTY macro
   issues three calls with the predicate is_property_P:
   debug_output_predicate ("unictype/pr_P.txt"), output_predicate_test
   ("../tests/unictype/test-pr_P.c", expression "uc_is_property_P (c)") and
   output_predicate ("unictype/pr_P.h", table name "u_property_P", with
   VERSION passed through).
   NOTE(review): some PROPERTY(...) lines of the original list are not
   visible in this excerpt; do not treat the list below as complete.  */
3572 /* Output all properties. */
3574 output_properties (const char *version)
3576 #define PROPERTY(P) \
3577 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3578 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3579 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3580 PROPERTY(white_space)
3581 PROPERTY(alphabetic)
3582 PROPERTY(other_alphabetic)
3583 PROPERTY(not_a_character)
3584 PROPERTY(default_ignorable_code_point)
3585 PROPERTY(other_default_ignorable_code_point)
3586 PROPERTY(deprecated)
3587 PROPERTY(logical_order_exception)
3588 PROPERTY(variation_selector)
3589 PROPERTY(private_use)
3590 PROPERTY(unassigned_code_value)
3592 PROPERTY(other_uppercase)
3594 PROPERTY(other_lowercase)
3597 PROPERTY(case_ignorable)
3598 PROPERTY(changes_when_lowercased)
3599 PROPERTY(changes_when_uppercased)
3600 PROPERTY(changes_when_titlecased)
3601 PROPERTY(changes_when_casefolded)
3602 PROPERTY(changes_when_casemapped)
3603 PROPERTY(soft_dotted)
3605 PROPERTY(other_id_start)
3606 PROPERTY(id_continue)
3607 PROPERTY(other_id_continue)
3609 PROPERTY(xid_continue)
3610 PROPERTY(pattern_white_space)
3611 PROPERTY(pattern_syntax)
3612 PROPERTY(join_control)
3613 PROPERTY(grapheme_base)
3614 PROPERTY(grapheme_extend)
3615 PROPERTY(other_grapheme_extend)
3616 PROPERTY(grapheme_link)
3617 PROPERTY(bidi_control)
3618 PROPERTY(bidi_left_to_right)
3619 PROPERTY(bidi_hebrew_right_to_left)
3620 PROPERTY(bidi_arabic_right_to_left)
3621 PROPERTY(bidi_european_digit)
3622 PROPERTY(bidi_eur_num_separator)
3623 PROPERTY(bidi_eur_num_terminator)
3624 PROPERTY(bidi_arabic_digit)
3625 PROPERTY(bidi_common_separator)
3626 PROPERTY(bidi_block_separator)
3627 PROPERTY(bidi_segment_separator)
3628 PROPERTY(bidi_whitespace)
3629 PROPERTY(bidi_non_spacing_mark)
3630 PROPERTY(bidi_boundary_neutral)
3632 PROPERTY(bidi_embedding_or_override)
3633 PROPERTY(bidi_other_neutral)
3635 PROPERTY(ascii_hex_digit)
3636 PROPERTY(ideographic)
3637 PROPERTY(unified_ideograph)
3639 PROPERTY(ids_binary_operator)
3640 PROPERTY(ids_trinary_operator)
3641 PROPERTY(zero_width)
3644 PROPERTY(iso_control)
3645 PROPERTY(format_control)
3648 PROPERTY(punctuation)
3649 PROPERTY(line_separator)
3650 PROPERTY(paragraph_separator)
3651 PROPERTY(quotation_mark)
3652 PROPERTY(sentence_terminal)
3653 PROPERTY(terminal_punctuation)
3654 PROPERTY(currency_symbol)
3656 PROPERTY(other_math)
3657 PROPERTY(paired_punctuation)
3658 PROPERTY(left_of_pair)
3661 PROPERTY(decimal_digit)
3665 PROPERTY(ignorable_control)
3669 /* ========================================================================= */
3671 /* Arabic Shaping. */
/* Joining type constants; the trailing comment on each gives the long name
   of the value.  fill_arabicshaping matches the letter after the
   "UC_JOINING_TYPE_" prefix against the joining type field of the input.
   NOTE(review): the enclosing `enum { ... };` lines are not visible in this
   excerpt.  */
3675 UC_JOINING_TYPE_U, /* Non_Joining */
3676 UC_JOINING_TYPE_T, /* Transparent */
3677 UC_JOINING_TYPE_C, /* Join_Causing */
3678 UC_JOINING_TYPE_L, /* Left_Joining */
3679 UC_JOINING_TYPE_R, /* Right_Joining */
3680 UC_JOINING_TYPE_D /* Dual_Joining */
/* Joining type per code point.  fill_arabicshaping initializes every entry
   to (uint8_t)~(uint8_t)0, which output_joining_type_test treats as
   "no entry".  */
3683 static uint8_t unicode_joining_type[0x110000];
/* Joining group constants; the trailing comment on each gives the
   mixed-case name of the value.  fill_arabicshaping maps the upper-case,
   space-separated spelling found in the input file to these constants.
   NOTE(review): the enclosing `enum { ... };` lines are not visible in this
   excerpt.  */
3687 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
3688 UC_JOINING_GROUP_AIN, /* Ain */
3689 UC_JOINING_GROUP_ALAPH, /* Alaph */
3690 UC_JOINING_GROUP_ALEF, /* Alef */
3691 UC_JOINING_GROUP_BEH, /* Beh */
3692 UC_JOINING_GROUP_BETH, /* Beth */
3693 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
3694 UC_JOINING_GROUP_DAL, /* Dal */
3695 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
3696 UC_JOINING_GROUP_E, /* E */
3697 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
3698 UC_JOINING_GROUP_FE, /* Fe */
3699 UC_JOINING_GROUP_FEH, /* Feh */
3700 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
3701 UC_JOINING_GROUP_GAF, /* Gaf */
3702 UC_JOINING_GROUP_GAMAL, /* Gamal */
3703 UC_JOINING_GROUP_HAH, /* Hah */
3704 UC_JOINING_GROUP_HE, /* He */
3705 UC_JOINING_GROUP_HEH, /* Heh */
3706 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
3707 UC_JOINING_GROUP_HETH, /* Heth */
3708 UC_JOINING_GROUP_KAF, /* Kaf */
3709 UC_JOINING_GROUP_KAPH, /* Kaph */
3710 UC_JOINING_GROUP_KHAPH, /* Khaph */
3711 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
3712 UC_JOINING_GROUP_LAM, /* Lam */
3713 UC_JOINING_GROUP_LAMADH, /* Lamadh */
3714 UC_JOINING_GROUP_MEEM, /* Meem */
3715 UC_JOINING_GROUP_MIM, /* Mim */
3716 UC_JOINING_GROUP_NOON, /* Noon */
3717 UC_JOINING_GROUP_NUN, /* Nun */
3718 UC_JOINING_GROUP_NYA, /* Nya */
3719 UC_JOINING_GROUP_PE, /* Pe */
3720 UC_JOINING_GROUP_QAF, /* Qaf */
3721 UC_JOINING_GROUP_QAPH, /* Qaph */
3722 UC_JOINING_GROUP_REH, /* Reh */
3723 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
3724 UC_JOINING_GROUP_SAD, /* Sad */
3725 UC_JOINING_GROUP_SADHE, /* Sadhe */
3726 UC_JOINING_GROUP_SEEN, /* Seen */
3727 UC_JOINING_GROUP_SEMKATH, /* Semkath */
3728 UC_JOINING_GROUP_SHIN, /* Shin */
3729 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
3730 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
3731 UC_JOINING_GROUP_TAH, /* Tah */
3732 UC_JOINING_GROUP_TAW, /* Taw */
3733 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
3734 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
3735 UC_JOINING_GROUP_TETH, /* Teth */
3736 UC_JOINING_GROUP_WAW, /* Waw */
3737 UC_JOINING_GROUP_YEH, /* Yeh */
3738 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
3739 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
3740 UC_JOINING_GROUP_YUDH, /* Yudh */
3741 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
3742 UC_JOINING_GROUP_ZAIN, /* Zain */
3743 UC_JOINING_GROUP_ZHAIN /* Zhain */
/* Joining group per code point; fill_arabicshaping initializes every entry
   to UC_JOINING_GROUP_NONE.  */
3746 static uint8_t unicode_joining_group[0x110000];
/* fill_arabicshaping: reads ARABICSHAPING_FILENAME and fills
   unicode_joining_type[] and unicode_joining_group[].
   Each data line has four ';'-separated fields: code point (hex), schematic
   name, joining type, joining group.  Problems are reported on stderr
   together with the file name (and, for data errors, `lineno`).
   NOTE(review): several short lines of this function (braces, the loop
   header, some declarations and the statements guarded by the checks) are
   not visible in this excerpt; the remarks below describe only the visible
   lines.  */
3749 fill_arabicshaping (const char *arabicshaping_filename)
3755 stream = fopen (arabicshaping_filename, "r");
3758 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
/* Start with "no joining type" (all bits set) and "no joining group" for
   every code point.  */
3762 for (i = 0; i < 0x110000; i++)
3764 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3765 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3772 char separator1[100+1];
3773 char padding1[100+1];
3774 char schematic_name[100+1];
3775 char separator2[100+1];
3776 char padding2[100+1];
3777 char joining_type_name[100+1];
3778 char separator3[100+1];
3779 char padding3[100+1];
3780 char joining_group_name[100+1];
/* Read one line of at most 100 characters.  */
3785 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines are recognized here (presumably to be
   skipped -- the guarded statement is not visible).  */
3788 if (buf[0] == '\0' || buf[0] == '#')
/* Split the line into its four fields; all ten conversions must succeed.  */
3791 if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
3792 &i, separator1, padding1, schematic_name, separator2,
3793 padding2, joining_type_name, separator3, padding3,
3794 joining_group_name) != 10)
3796 fprintf (stderr, "parse error in '%s':%d\n",
3797 arabicshaping_filename, lineno);
/* `#name + 16` skips the 16-character prefix "UC_JOINING_TYPE_", so the
   field is compared against the bare type letter (U, T, C, L, R, D).  */
3803 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3805 TRY(UC_JOINING_TYPE_U)
3806 TRY(UC_JOINING_TYPE_T)
3807 TRY(UC_JOINING_TYPE_C)
3808 TRY(UC_JOINING_TYPE_L)
3809 TRY(UC_JOINING_TYPE_R)
3810 TRY(UC_JOINING_TYPE_D)
3814 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3815 joining_type_name, arabicshaping_filename, lineno);
3819 /* Remove trailing spaces. */
3820 while (joining_group_name[0] != '\0'
3821 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3822 joining_group_name[strlen (joining_group_name) - 1] = '\0';
/* Map the group name as spelled in the file (upper case, words separated
   by spaces) to the corresponding UC_JOINING_GROUP_* constant.  */
3824 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3826 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3827 TRY(UC_JOINING_GROUP_AIN, "AIN")
3828 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
3829 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
3830 TRY(UC_JOINING_GROUP_BEH, "BEH")
3831 TRY(UC_JOINING_GROUP_BETH, "BETH")
3832 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
3833 TRY(UC_JOINING_GROUP_DAL, "DAL")
3834 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
3835 TRY(UC_JOINING_GROUP_E, "E")
3836 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
3837 TRY(UC_JOINING_GROUP_FE, "FE")
3838 TRY(UC_JOINING_GROUP_FEH, "FEH")
3839 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
3840 TRY(UC_JOINING_GROUP_GAF, "GAF")
3841 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
3842 TRY(UC_JOINING_GROUP_HAH, "HAH")
3843 TRY(UC_JOINING_GROUP_HE, "HE")
3844 TRY(UC_JOINING_GROUP_HEH, "HEH")
3845 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
3846 TRY(UC_JOINING_GROUP_HETH, "HETH")
3847 TRY(UC_JOINING_GROUP_KAF, "KAF")
3848 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
3849 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
3850 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
3851 TRY(UC_JOINING_GROUP_LAM, "LAM")
3852 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
3853 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
3854 TRY(UC_JOINING_GROUP_MIM, "MIM")
3855 TRY(UC_JOINING_GROUP_NOON, "NOON")
3856 TRY(UC_JOINING_GROUP_NUN, "NUN")
3857 TRY(UC_JOINING_GROUP_NYA, "NYA")
3858 TRY(UC_JOINING_GROUP_PE, "PE")
3859 TRY(UC_JOINING_GROUP_QAF, "QAF")
3860 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
3861 TRY(UC_JOINING_GROUP_REH, "REH")
3862 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
3863 TRY(UC_JOINING_GROUP_SAD, "SAD")
3864 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
3865 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
3866 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
3867 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
3868 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
3869 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
3870 TRY(UC_JOINING_GROUP_TAH, "TAH")
3871 TRY(UC_JOINING_GROUP_TAW, "TAW")
3872 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
3873 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
3874 TRY(UC_JOINING_GROUP_TETH, "TETH")
3875 TRY(UC_JOINING_GROUP_WAW, "WAW")
3876 TRY(UC_JOINING_GROUP_YEH, "YEH")
3877 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
3878 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
3879 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
3880 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
3881 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
3882 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
3886 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
3887 joining_group_name, arabicshaping_filename, lineno);
/* Record both values for the code point parsed from this line.  */
3891 unicode_joining_type[i] = joining_type;
3892 unicode_joining_group[i] = joining_group;
/* A read error, or a failing fclose, is reported as an error too.  */
3895 if (ferror (stream) || fclose (stream))
3897 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
3902 /* Convert a Joining_Type value to a C identifier. */
/* The result is the spelling of the matching UC_JOINING_TYPE_* constant,
   obtained with the preprocessor stringizing operator inside TRY.
   Only the six candidates U, T, C, L, R and D are tried.  What happens
   when none of them matches is on lines not visible in this excerpt. */
3904 joining_type_as_c_identifier (int joining_type)
/* TRY(value): when joining_type equals VALUE, return the name of VALUE
   as a string literal. */
3906 #define TRY(value) if (joining_type == value) return #value;
3907 TRY(UC_JOINING_TYPE_U)
3908 TRY(UC_JOINING_TYPE_T)
3909 TRY(UC_JOINING_TYPE_C)
3910 TRY(UC_JOINING_TYPE_L)
3911 TRY(UC_JOINING_TYPE_R)
3912 TRY(UC_JOINING_TYPE_D)
/* Write to FILENAME a list of initializers of the form
   { code point, joining type identifier }, one for every code point whose
   unicode_joining_type entry differs from the all-ones uint8_t value.
   FILENAME is opened for writing and a diagnostic is printed to stderr
   when opening or writing fails.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
3918 output_joining_type_test (const char *filename, const char *version)
3924 stream = fopen (filename, "w");
3927 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3931 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3932 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
3933 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Walk the whole code space 0 .. 0x10FFFF. */
3937 for (ch = 0; ch < 0x110000; ch++)
3939 int value = unicode_joining_type[ch];
/* (uint8_t)~(uint8_t)0 is 0xFF; entries holding it are skipped. */
3941 if (value != (uint8_t)~(uint8_t)0)
/* The comma separator is written in front of an entry, not after it.
   Its guard is not visible here; presumably it is skipped for the first
   entry. */
3944 fprintf (stream, ",\n");
3945 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
3950 fprintf (stream, "\n");
/* fclose is only attempted when ferror reports no error. */
3952 if (ferror (stream) || fclose (stream))
3954 fprintf (stderr, "error writing to '%s'\n", filename);
3959 /* Construction of sparse 3-level tables. */
/* Template parameters: the table type is named joining_type_table, its
   elements are uint8_t, and the default element is the all-ones byte 0xFF.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the include that consumes these macros is not visible in
   this excerpt; joining_type_table_init, joining_type_table_add and
   joining_type_table_finalize presumably come from it. */
3960 #define TABLE joining_type_table
3961 #define ELEMENT uint8_t
3962 #define DEFAULT (uint8_t)~(uint8_t)0
3963 #define xmalloc malloc
3964 #define xrealloc realloc
/* Write the Arabic joining type of all code points to FILENAME as a C
   fragment that defines a sparse 3-level table named u_joining_type.
   Every code point 0 .. 0x10FFFF is added to the table with its
   unicode_joining_type value.  The output consists of five
   joining_type_header_N macros, taken from the first five uint32_t words
   of t.result, followed by a struct with level1, level2 and level3 arrays.
   The level3 array is emitted in packed form, 4 bits per entry.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
3968 output_joining_type (const char *filename, const char *version)
3972 struct joining_type_table t;
3973 unsigned int level1_offset, level2_offset, level3_offset;
3974 uint8_t *level3_packed;
3976 stream = fopen (filename, "w");
3979 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3983 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3984 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
3985 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
3990 joining_type_table_init (&t);
3992 for (ch = 0; ch < 0x110000; ch++)
3994 uint8_t value = unicode_joining_type[ch];
3996 joining_type_table_add (&t, ch, value);
3999 joining_type_table_finalize (&t);
4001 /* Offsets in t.result, in memory of this process. */
/* Byte layout used by the three expressions below: 5 header words, then
   t.level1_size words, then (t.level2_size << t.q) words, then the level3
   data.  The assignment targets are on lines not visible here; presumably
   they are level1_offset, level2_offset and level3_offset. */
4003 5 * sizeof (uint32_t);
4005 5 * sizeof (uint32_t)
4006 + t.level1_size * sizeof (uint32_t);
4008 5 * sizeof (uint32_t)
4009 + t.level1_size * sizeof (uint32_t)
4010 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): i is unsigned and the header words are uint32_t, yet both
   are printed with %d - confirm this is intended. */
4012 for (i = 0; i < 5; i++)
4013 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4014 ((uint32_t *) t.result)[i]);
4015 fprintf (stream, "static const\n");
4016 fprintf (stream, "struct\n");
4017 fprintf (stream, " {\n");
4018 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4019 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 size counts packed bytes: (1 << t.p) entries of
   4 bits each, divided by 8 bits per byte. */
4020 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4021 (1 << t.p) * 4 / 8);
4022 fprintf (stream, " }\n");
4023 fprintf (stream, "u_joining_type =\n");
4024 fprintf (stream, "{\n");
4025 fprintf (stream, " {");
4026 if (t.level1_size > 8)
4027 fprintf (stream, "\n ");
/* Level 1: each stored word is a byte offset into t.result.  It is
   rewritten as an index into the emitted level2 array, counted in units of
   uint32_t.  The guard selecting the -1 output is not visible here;
   presumably it marks an absent entry.  A line break is emitted before
   every eighth value. */
4028 for (i = 0; i < t.level1_size; i++)
4031 if (i > 0 && (i % 8) == 0)
4032 fprintf (stream, "\n ");
4033 offset = ((uint32_t *) (t.result + level1_offset))[i];
4035 fprintf (stream, " %5d", -1);
4037 fprintf (stream, " %5zu",
4038 (offset - level2_offset) / sizeof (uint32_t));
4039 if (i+1 < t.level1_size)
4040 fprintf (stream, ",");
4042 if (t.level1_size > 8)
4043 fprintf (stream, "\n ");
4044 fprintf (stream, " },\n");
4045 fprintf (stream, " {");
4046 if (t.level2_size << t.q > 8)
4047 fprintf (stream, "\n ");
/* Level 2: same scheme, with offsets rewritten relative to level3_offset
   in units of uint8_t, that is, as an index of an unpacked level3 entry. */
4048 for (i = 0; i < t.level2_size << t.q; i++)
4051 if (i > 0 && (i % 8) == 0)
4052 fprintf (stream, "\n ");
4053 offset = ((uint32_t *) (t.result + level2_offset))[i];
4055 fprintf (stream, " %5d", -1);
4057 fprintf (stream, " %5zu",
4058 (offset - level3_offset) / sizeof (uint8_t));
4059 if (i+1 < t.level2_size << t.q)
4060 fprintf (stream, ",");
4062 if (t.level2_size << t.q > 8)
4063 fprintf (stream, "\n ");
4064 fprintf (stream, " },\n");
4065 /* Pack the level3 array. Each entry needs 4 bits only. */
/* The buffer is zero-filled by calloc, so the OR below only sets bits.
   NOTE(review): no NULL check of the calloc result is visible before the
   buffer is written to - confirm. */
4067 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
/* Two entries share one output byte: entry i lands in byte i / 2, in the
   low nibble when i is even (k is 0) and in the high nibble when i is odd
   (k is 4).  Each value is truncated to its low 4 bits first. */
4068 for (i = 0; i < t.level3_size << t.p; i++)
4070 unsigned int j = (i * 4) / 8;
4071 unsigned int k = (i * 4) % 8;
4072 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4073 level3_packed[j] |= (value << k);
4075 fprintf (stream, " {");
4076 if ((t.level3_size << t.p) * 4 / 8 > 8)
4077 fprintf (stream, "\n ");
/* Level 3: dump the packed bytes in hexadecimal, eight per line. */
4078 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4080 if (i > 0 && (i % 8) == 0)
4081 fprintf (stream, "\n ");
4082 fprintf (stream, " 0x%02x", level3_packed[i]);
4083 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4084 fprintf (stream, ",");
4086 if ((t.level3_size << t.p) * 4 / 8 > 8)
4087 fprintf (stream, "\n ");
4088 fprintf (stream, " }\n");
4089 free (level3_packed);
4090 fprintf (stream, "};\n");
/* fclose is only attempted when ferror reports no error. */
4092 if (ferror (stream) || fclose (stream))
4094 fprintf (stderr, "error writing to '%s'\n", filename);
4099 /* Convert a Joining_Group value to a C identifier. */
/* The result is the spelling of the matching UC_JOINING_GROUP_* constant,
   obtained with the preprocessor stringizing operator inside TRY.
   The candidates are tried in the order listed, starting with
   UC_JOINING_GROUP_NONE.  What happens when none of them matches is on
   lines not visible in this excerpt. */
4101 joining_group_as_c_identifier (int joining_group)
/* TRY(value): when joining_group equals VALUE, return the name of VALUE
   as a string literal. */
4103 #define TRY(value) if (joining_group == value) return #value;
4104 TRY(UC_JOINING_GROUP_NONE)
4105 TRY(UC_JOINING_GROUP_AIN)
4106 TRY(UC_JOINING_GROUP_ALAPH)
4107 TRY(UC_JOINING_GROUP_ALEF)
4108 TRY(UC_JOINING_GROUP_BEH)
4109 TRY(UC_JOINING_GROUP_BETH)
4110 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4111 TRY(UC_JOINING_GROUP_DAL)
4112 TRY(UC_JOINING_GROUP_DALATH_RISH)
4113 TRY(UC_JOINING_GROUP_E)
4114 TRY(UC_JOINING_GROUP_FARSI_YEH)
4115 TRY(UC_JOINING_GROUP_FE)
4116 TRY(UC_JOINING_GROUP_FEH)
4117 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4118 TRY(UC_JOINING_GROUP_GAF)
4119 TRY(UC_JOINING_GROUP_GAMAL)
4120 TRY(UC_JOINING_GROUP_HAH)
4121 TRY(UC_JOINING_GROUP_HE)
4122 TRY(UC_JOINING_GROUP_HEH)
4123 TRY(UC_JOINING_GROUP_HEH_GOAL)
4124 TRY(UC_JOINING_GROUP_HETH)
4125 TRY(UC_JOINING_GROUP_KAF)
4126 TRY(UC_JOINING_GROUP_KAPH)
4127 TRY(UC_JOINING_GROUP_KHAPH)
4128 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4129 TRY(UC_JOINING_GROUP_LAM)
4130 TRY(UC_JOINING_GROUP_LAMADH)
4131 TRY(UC_JOINING_GROUP_MEEM)
4132 TRY(UC_JOINING_GROUP_MIM)
4133 TRY(UC_JOINING_GROUP_NOON)
4134 TRY(UC_JOINING_GROUP_NUN)
4135 TRY(UC_JOINING_GROUP_NYA)
4136 TRY(UC_JOINING_GROUP_PE)
4137 TRY(UC_JOINING_GROUP_QAF)
4138 TRY(UC_JOINING_GROUP_QAPH)
4139 TRY(UC_JOINING_GROUP_REH)
4140 TRY(UC_JOINING_GROUP_REVERSED_PE)
4141 TRY(UC_JOINING_GROUP_SAD)
4142 TRY(UC_JOINING_GROUP_SADHE)
4143 TRY(UC_JOINING_GROUP_SEEN)
4144 TRY(UC_JOINING_GROUP_SEMKATH)
4145 TRY(UC_JOINING_GROUP_SHIN)
4146 TRY(UC_JOINING_GROUP_SWASH_KAF)
4147 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4148 TRY(UC_JOINING_GROUP_TAH)
4149 TRY(UC_JOINING_GROUP_TAW)
4150 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4151 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4152 TRY(UC_JOINING_GROUP_TETH)
4153 TRY(UC_JOINING_GROUP_WAW)
4154 TRY(UC_JOINING_GROUP_YEH)
4155 TRY(UC_JOINING_GROUP_YEH_BARREE)
4156 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4157 TRY(UC_JOINING_GROUP_YUDH)
4158 TRY(UC_JOINING_GROUP_YUDH_HE)
4159 TRY(UC_JOINING_GROUP_ZAIN)
4160 TRY(UC_JOINING_GROUP_ZHAIN)
/* Write to FILENAME a list of initializers of the form
   { code point, joining group identifier }, one for every code point whose
   unicode_joining_group entry differs from UC_JOINING_GROUP_NONE.
   FILENAME is opened for writing and a diagnostic is printed to stderr
   when opening or writing fails.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
4166 output_joining_group_test (const char *filename, const char *version)
4172 stream = fopen (filename, "w");
4175 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4179 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4180 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4181 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Walk the whole code space 0 .. 0x10FFFF. */
4185 for (ch = 0; ch < 0x110000; ch++)
4187 int value = unicode_joining_group[ch];
4189 if (value != UC_JOINING_GROUP_NONE)
/* The comma separator is written in front of an entry, not after it.
   Its guard is not visible here; presumably it is skipped for the first
   entry. */
4192 fprintf (stream, ",\n");
4193 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4198 fprintf (stream, "\n");
/* fclose is only attempted when ferror reports no error. */
4200 if (ferror (stream) || fclose (stream))
4202 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the Arabic joining group of code points to FILENAME as a plain
   array named u_joining_group that covers the interval [ch_min, ch_max],
   together with the macro joining_group_header_0 that records ch_min.
   The two scans below look for code points whose joining group is not
   UC_JOINING_GROUP_NONE, one ascending from 0 and one descending from
   0x10FFFF.  Presumably they set ch_min and ch_max; the assignments are on
   lines not visible in this excerpt.  The descending scan stops before it
   examines code point 0.
   Two conditions are then tested: ch_min <= ch_max, and an interval length
   below 0x200.  The action taken when one of them fails is not visible.
   Array elements are written as UC_JOINING_GROUP_* identifiers separated
   by commas.  After the comma that follows an element with even index,
   spaces pad the identifier to a width of 38 columns, which suggests two
   elements per output line.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt.
   NOTE(review): the second banner line emitted below says joining type,
   although this function emits joining groups - it looks like a copy and
   paste slip; confirm before changing the generated text.
   NOTE(review): ch_min and ch_max are unsigned and are printed with %x,
   which matches; the padding width 38 - strlen (s) goes negative for an
   identifier longer than 38 characters, which the star width turns into
   left justification with no padding. */
4208 output_joining_group (const char *filename, const char *version)
4211 unsigned int ch_min, ch_max, ch, i;
4213 stream = fopen (filename, "w");
4216 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4220 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4221 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4222 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4226 for (ch = 0; ch < 0x110000; ch++)
4227 if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
4234 for (ch = 0x10FFFF; ch > 0; ch--)
4235 if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
4241 if (!(ch_min <= ch_max))
4244 /* If the interval [ch_min, ch_max] is too large, we should better use a
4246 if (!(ch_max - ch_min < 0x200))
4249 fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
4250 fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
4251 ch_max + 1, ch_min);
4252 fprintf (stream, "{");
4253 for (i = 0; i <= ch_max - ch_min; i++)
4259 fprintf (stream, "\n ");
4260 s = joining_group_as_c_identifier (unicode_joining_group[ch]);
4261 fprintf (stream, " %s", s);
4262 if (i+1 <= ch_max - ch_min)
4264 fprintf (stream, ",");
4265 if (((i+1) % 2) != 0)
4266 fprintf (stream, "%*s", 38 - (int) strlen (s), "");
4269 fprintf (stream, "\n");
4270 fprintf (stream, "};\n");
4272 if (ferror (stream) || fclose (stream))
4274 fprintf (stderr, "error writing to '%s'\n", filename);
4279 /* ========================================================================= */
/* Script names in order of first appearance, filled by fill_scripts with
   strdup copies.  At most 256 slots. */
4283 static const char *scripts[256];
/* Number of entries of scripts[] in use. */
4284 static unsigned int numscripts;
/* For each code point, an index into scripts[].  fill_scripts initializes
   every entry to the all-ones byte 0xFF, which output_scripts treats as
   no script assigned. */
4286 static uint8_t unicode_scripts[0x110000];
/* Read SCRIPTS_FILENAME and fill scripts[], numscripts and
   unicode_scripts[].  Lines that are empty or start with a hash sign are
   recognized by the test on buf[0]; presumably they are skipped.  Other
   lines are parsed either as a range FIRST..LAST or as a single code
   point, each followed by blanks and semicolons and then a script name.
   Diagnostics go to stderr. */
4289 fill_scripts (const char *scripts_filename)
4294 stream = fopen (scripts_filename, "r");
4297 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Start with every code point marked 0xFF. */
4303 for (i = 0; i < 0x110000; i++)
4304 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
4309 unsigned int i1, i2;
4310 char padding[200+1];
4311 char scriptname[200+1];
/* Read one line of at most 200 characters; the trailing newline directive
   also consumes any following white space. */
4314 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4317 if (buf[0] == '\0' || buf[0] == '#')
/* The script name conversion stops at the first space character.
   NOTE(review): the two bracket conversions carry no field width.  They
   stay within padding and scriptname only as long as buf itself holds at
   most 200 characters; the declaration of buf is not visible here -
   confirm. */
4320 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
4322 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
4324 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look for an already known script of that name, newest entry first. */
4334 for (script = numscripts - 1; script >= 0; script--)
4335 if (strcmp (scripts[script], scriptname) == 0)
/* Register a new script name.
   NOTE(review): no check of the strdup result is visible - confirm. */
4339 scripts[numscripts] = strdup (scriptname);
4340 script = numscripts;
4342 if (numscripts == 256)
/* Assign the script to every code point of the range and report code
   points that already carry a script.
   NOTE(review): no check of i1 and i2 against 0x110000 is visible before
   they index unicode_scripts - confirm. */
4346 for (i = i1; i <= i2; i++)
4348 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
4349 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
4350 unicode_scripts[i] = script;
/* fclose is only attempted when ferror reports no error. */
4354 if (ferror (stream) || fclose (stream))
4356 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
4361 /* Construction of sparse 3-level tables. */
/* Template parameters: the table type is named script_table, its elements
   are uint8_t, and the default element is the all-ones byte 0xFF.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the include that consumes these macros is not visible in
   this excerpt; script_table_init, script_table_add and
   script_table_finalize presumably come from it. */
4362 #define TABLE script_table
4363 #define ELEMENT uint8_t
4364 #define DEFAULT (uint8_t)~(uint8_t)0
4365 #define xmalloc malloc
4366 #define xrealloc realloc
/* Write the file unictype/scripts.h.  It receives three parts:
   1. for each script, an array script_NAME_intervals of uc_interval_t,
      where NAME is the lowercased script name,
   2. an array scripts[] of uc_script_t that refers to those interval
      arrays and carries the original script names,
   3. a sparse 3-level table u_script that maps code points to script
      indices, preceded by five script_header_N macros.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
4370 output_scripts (const char *version)
4372 const char *filename = "unictype/scripts.h";
4374 unsigned int ch, s, i;
4375 struct script_table t;
4376 unsigned int level1_offset, level2_offset, level3_offset;
4380 const char *lowercase_name;
4383 scriptinfo_t scriptinfo[256];
4385 stream = fopen (filename, "w");
4388 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4392 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4393 fprintf (stream, "/* Unicode scripts. */\n");
4394 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Make a private copy of each script name for use in generated
   identifiers.  Only the test for the ASCII range A .. Z is visible here;
   the statement that converts the character is on a line not shown.
   NOTE(review): no check of the strdup result and no matching free are
   visible - confirm. */
4397 for (s = 0; s < numscripts; s++)
4399 char *lcp = strdup (scripts[s]);
4402 for (cp = lcp; *cp != '\0'; cp++)
4403 if (*cp >= 'A' && *cp <= 'Z')
4406 scriptinfo[s].lowercase_name = lcp;
/* Part 1: one interval array per script.  Each maximal run of consecutive
   code points with script s is located by the inner while loop.  Two
   output forms exist, a single entry and a pair of entries; the condition
   choosing between them is not visible, presumably it is whether the run
   has length one. */
4409 for (s = 0; s < numscripts; s++)
4411 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4412 scriptinfo[s].lowercase_name);
4413 fprintf (stream, "{\n");
4415 for (ch = 0; ch < 0x110000; ch++)
4416 if (unicode_scripts[ch] == s)
4422 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4427 fprintf (stream, ",\n");
4429 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4431 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4435 fprintf (stream, "\n");
4436 fprintf (stream, "};\n");
/* Part 2: the table of scripts.
   NOTE(review): numscripts is unsigned but is printed with %d. */
4439 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4440 fprintf (stream, "{\n");
4441 for (s = 0; s < numscripts; s++)
4443 fprintf (stream, " {\n");
4444 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4445 scriptinfo[s].lowercase_name);
4446 fprintf (stream, " script_%s_intervals,\n",
4447 scriptinfo[s].lowercase_name);
4448 fprintf (stream, " \"%s\"\n", scripts[s]);
4449 fprintf (stream, " }");
4450 if (s+1 < numscripts)
4451 fprintf (stream, ",");
4452 fprintf (stream, "\n");
4454 fprintf (stream, "};\n");
/* Part 3: build the 3-level table.  Code points still marked 0xFF are not
   added. */
4458 script_table_init (&t);
4460 for (ch = 0; ch < 0x110000; ch++)
/* NOTE(review): this s shadows the unsigned int s declared near the top of
   the function. */
4462 unsigned int s = unicode_scripts[ch];
4463 if (s != (uint8_t)~(uint8_t)0)
4464 script_table_add (&t, ch, s);
4467 script_table_finalize (&t);
4469 /* Offsets in t.result, in memory of this process. */
/* Byte layout used by the three expressions below: 5 header words, then
   t.level1_size words, then (t.level2_size << t.q) words, then the level3
   data.  The assignment targets are on lines not visible here; presumably
   they are level1_offset, level2_offset and level3_offset. */
4471 5 * sizeof (uint32_t);
4473 5 * sizeof (uint32_t)
4474 + t.level1_size * sizeof (uint32_t);
4476 5 * sizeof (uint32_t)
4477 + t.level1_size * sizeof (uint32_t)
4478 + (t.level2_size << t.q) * sizeof (uint32_t);
4480 for (i = 0; i < 5; i++)
4481 fprintf (stream, "#define script_header_%d %d\n", i,
4482 ((uint32_t *) t.result)[i]);
4483 fprintf (stream, "static const\n");
4484 fprintf (stream, "struct\n");
4485 fprintf (stream, " {\n");
4486 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4487 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4488 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4489 fprintf (stream, " }\n");
4490 fprintf (stream, "u_script =\n");
4491 fprintf (stream, "{\n");
4492 fprintf (stream, " {");
4493 if (t.level1_size > 8)
4494 fprintf (stream, "\n ");
/* Level 1: stored byte offsets are rewritten as indices into the emitted
   level2 array, in units of uint32_t.  The guard selecting the -1 output
   is not visible here; presumably it marks an absent entry. */
4495 for (i = 0; i < t.level1_size; i++)
4498 if (i > 0 && (i % 8) == 0)
4499 fprintf (stream, "\n ");
4500 offset = ((uint32_t *) (t.result + level1_offset))[i];
4502 fprintf (stream, " %5d", -1);
4504 fprintf (stream, " %5zu",
4505 (offset - level2_offset) / sizeof (uint32_t));
4506 if (i+1 < t.level1_size)
4507 fprintf (stream, ",");
4509 if (t.level1_size > 8)
4510 fprintf (stream, "\n ");
4511 fprintf (stream, " },\n");
4512 fprintf (stream, " {");
4513 if (t.level2_size << t.q > 8)
4514 fprintf (stream, "\n ");
/* Level 2: same scheme, relative to level3_offset in units of uint8_t. */
4515 for (i = 0; i < t.level2_size << t.q; i++)
4518 if (i > 0 && (i % 8) == 0)
4519 fprintf (stream, "\n ");
4520 offset = ((uint32_t *) (t.result + level2_offset))[i];
4522 fprintf (stream, " %5d", -1);
4524 fprintf (stream, " %5zu",
4525 (offset - level3_offset) / sizeof (uint8_t));
4526 if (i+1 < t.level2_size << t.q)
4527 fprintf (stream, ",");
4529 if (t.level2_size << t.q > 8)
4530 fprintf (stream, "\n ");
4531 fprintf (stream, " },\n");
4532 fprintf (stream, " {");
4533 if (t.level3_size << t.p > 8)
4534 fprintf (stream, "\n ");
/* Level 3: the script indices themselves, one byte each, eight per line. */
4535 for (i = 0; i < t.level3_size << t.p; i++)
4537 if (i > 0 && (i % 8) == 0)
4538 fprintf (stream, "\n ");
4539 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4540 if (i+1 < t.level3_size << t.p)
4541 fprintf (stream, ",");
4543 if (t.level3_size << t.p > 8)
4544 fprintf (stream, "\n ");
4545 fprintf (stream, " }\n");
4546 fprintf (stream, "};\n");
/* fclose is only attempted when ferror reports no error. */
4548 if (ferror (stream) || fclose (stream))
4550 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the file unictype/scripts_byname.gperf, an input file for gperf
   that maps each script name to its index in scripts[].
   The declarations section names the hash function scripts_hash, the
   lookup function uc_script_lookup, the word array script_names and the
   string pool script_stringpool.  The keyword section holds one line per
   script of the form NAME, INDEX.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
4556 output_scripts_byname (const char *version)
4558 const char *filename = "unictype/scripts_byname.gperf";
4562 stream = fopen (filename, "w");
4565 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4569 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4570 fprintf (stream, "/* Unicode scripts. */\n");
4571 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4573 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
/* In the format strings below a doubled percent sign produces a single
   percent sign in the output, so each line starts a gperf declaration and
   the line with four percent signs emits the two-character separator that
   precedes the keyword list. */
4574 fprintf (stream, "%%struct-type\n");
4575 fprintf (stream, "%%language=ANSI-C\n");
4576 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4577 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4578 fprintf (stream, "%%readonly-tables\n");
4579 fprintf (stream, "%%global-table\n");
4580 fprintf (stream, "%%define word-array-name script_names\n");
4581 fprintf (stream, "%%pic\n");
4582 fprintf (stream, "%%define string-pool-name script_stringpool\n");
4583 fprintf (stream, "%%%%\n");
4584 for (s = 0; s < numscripts; s++)
4585 fprintf (stream, "%s, %u\n", scripts[s], s);
/* fclose is only attempted when ferror reports no error. */
4587 if (ferror (stream) || fclose (stream))
4589 fprintf (stderr, "error writing to '%s'\n", filename);
4594 /* ========================================================================= */
/* One block: its first code point, its last code point (fill_blocks stores
   the two bounds of a FIRST..LAST range here) and its name.  The typedef
   name is on a line not visible in this excerpt; the array below suggests
   it is block_t. */
4598 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* Blocks in file order, filled by fill_blocks.  At most 256 slots. */
4600 static block_t blocks[256];
/* Number of entries of blocks[] in use. */
4601 static unsigned int numblocks;
/* Read BLOCKS_FILENAME and fill blocks[] and numblocks.  Lines that are
   empty or start with a hash sign are recognized by the test on buf[0];
   presumably they are skipped.  Every other line must have the form
   FIRST..LAST, then blanks and semicolons, then the block name.
   Diagnostics go to stderr. */
4604 fill_blocks (const char *blocks_filename)
4608 stream = fopen (blocks_filename, "r");
4611 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
4618 unsigned int i1, i2;
4619 char padding[200+1];
4620 char blockname[200+1];
/* Read one line of at most 200 characters; the trailing newline directive
   also consumes any following white space. */
4622 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4625 if (buf[0] == '\0' || buf[0] == '#')
/* The block name extends to the end of buf or to the first carriage
   return, so it may contain spaces.
   NOTE(review): the two bracket conversions carry no field width.  They
   stay within padding and blockname only as long as buf itself holds at
   most 200 characters; the declaration of buf is not visible here -
   confirm. */
4628 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4630 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* NOTE(review): no check of the strdup result is visible - confirm. */
4633 blocks[numblocks].start = i1;
4634 blocks[numblocks].end = i2;
4635 blocks[numblocks].name = strdup (blockname);
4636 /* It must be sorted. */
/* Each block has to start after the end of the previous one.  The action
   taken on a violation is on a line not visible here. */
4637 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
4640 if (numblocks == 256)
/* fclose is only attempted when ferror reports no error. */
4644 if (ferror (stream) || fclose (stream))
4646 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4651 /* Return the smallest block index among the blocks for characters >= ch. */
/* The search runs over blocks[0 .. numblocks) and compares the end field
   of the probed block with CH.  It relies on blocks[] being in ascending
   order, which fill_blocks tests while reading.  The loop condition, the
   updates of lo and hi and the return statement are on lines not visible
   in this excerpt. */
4653 block_first_index (unsigned int ch)
4655 /* Binary search. */
4656 unsigned int lo = 0;
4657 unsigned int hi = numblocks;
4659 All blocks[i], i < lo, have blocks[i].end < ch,
4660 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4663 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4664 if (blocks[mid].end < ch)
/* block_last_index: the search runs over blocks[0 .. numblocks) and
   compares the start field of the probed block with CH.  It relies on
   blocks[] being in ascending order, which fill_blocks tests while
   reading.  The rest of the header comment below, the loop condition, the
   updates of lo and hi and the return statement are on lines not visible
   in this excerpt. */
4672 /* Return the largest block index among the blocks for characters <= ch,
4675 block_last_index (unsigned int ch)
4677 /* Binary search. */
4678 unsigned int lo = 0;
4679 unsigned int hi = numblocks;
4681 All blocks[i], i < lo, have blocks[i].start <= ch,
4682 all blocks[i], i >= hi, have blocks[i].start > ch. */
4685 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4686 if (blocks[mid].start <= ch)
/* Write the file unictype/blocks.h.  It receives:
   - the array blocks[] of uc_block_t with start, end and name of each
     block,
   - the macros blocks_level1_shift and blocks_level1_threshold,
   - the array blocks_level1[], which holds for every chunk of
     (1 << shift) code points below the threshold the results of
     block_first_index for the first code point of the chunk and
     block_last_index for its last code point,
   - the macros blocks_upper_first_index and blocks_upper_last_index,
     computed the same way for the range from the threshold to 0x10FFFF.
   VERSION is presumably inserted into the third banner line; the argument
   line of that fprintf is not visible in this excerpt. */
4695 output_blocks (const char *version)
4697 const char *filename = "unictype/blocks.h";
4698 const unsigned int shift = 8; /* bits to shift away for array access */
4699 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
4704 stream = fopen (filename, "w");
4707 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4711 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4712 fprintf (stream, "/* Unicode blocks. */\n");
4713 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4716 fprintf (stream, "static const uc_block_t blocks[] =\n");
4717 fprintf (stream, "{\n");
4718 for (i = 0; i < numblocks; i++)
4720 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4721 blocks[i].end, blocks[i].name);
4722 if (i+1 < numblocks)
4723 fprintf (stream, ",");
4724 fprintf (stream, "\n");
4726 fprintf (stream, "};\n");
/* NOTE(review): shift and threshold are unsigned but are printed with %d
   in the first and third fprintf below. */
4727 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4728 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4729 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4730 threshold >> shift);
4731 fprintf (stream, "{\n");
/* With shift 8 and threshold 0x30000 this loop runs 0x300 times and emits
   two values per iteration.
   NOTE(review): the emitted element type is uint8_t, so every index
   written here has to fit into one byte, and both indices are unsigned
   but printed with %3d - confirm. */
4732 for (i1 = 0; i1 < (threshold >> shift); i1++)
4734 unsigned int first_index = block_first_index (i1 << shift);
4735 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4736 fprintf (stream, " %3d, %3d", first_index, last_index);
4737 if (i1+1 < (threshold >> shift))
4738 fprintf (stream, ",");
4739 fprintf (stream, "\n");
4741 fprintf (stream, "};\n");
4742 fprintf (stream, "#define blocks_upper_first_index %d\n",
4743 block_first_index (threshold));
4744 fprintf (stream, "#define blocks_upper_last_index %d\n",
4745 block_last_index (0x10FFFF));
/* fclose is only attempted when ferror reports no error. */
4747 if (ferror (stream) || fclose (stream))
4749 fprintf (stderr, "error writing to '%s'\n", filename);
4754 /* ========================================================================= */
4756 /* C and Java syntax. */
/* Identifier syntax categories, as returned by c_ident_category and
   java_ident_category.  No explicit values are given.  The line that
   introduces this list is not visible in this excerpt; presumably it is
   an enum. */
4760 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4761 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4762 UC_IDENTIFIER_INVALID, /* not valid */
4763 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
4766 /* ISO C 99 section 6.4.(3). */
/* True exactly for the six characters space, horizontal tab, line feed,
   carriage return, vertical tab and form feed. */
4768 is_c_whitespace (unsigned int ch)
4770 return (ch == ' ' /* space */
4771 || ch == '\t' /* horizontal tab */
4772 || ch == '\n' || ch == '\r' /* new-line */
4773 || ch == '\v' /* vertical tab */
4774 || ch == '\f'); /* form-feed */
4777 /* ISO C 99 section 6.4.2.1 and appendix D. */
/* Classify CH for use in a C identifier.
   ASCII digits yield UC_IDENTIFIER_VALID.  ASCII letters and the
   underscore yield UC_IDENTIFIER_START.  A code point matched by the long
   list of ranges below also yields UC_IDENTIFIER_START, and everything
   else yields UC_IDENTIFIER_INVALID.
   The opening of the large condition, its section comments and its
   single code point alternatives are on lines not visible in this
   excerpt.
   NOTE(review): the ranges from 0x0660 .. 0x0669 through 0x0F20 .. 0x0F33
   near the end appear to be decimal digit ranges, and they share the
   UC_IDENTIFIER_START result with the letters - confirm this matches the
   intended reading of section 6.4.2.1 regarding digits in initial
   position. */
4779 c_ident_category (unsigned int ch)
4781 /* Section 6.4.2.1. */
4782 if (ch >= '0' && ch <= '9')
4783 return UC_IDENTIFIER_VALID;
4784 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4785 return UC_IDENTIFIER_START;
4791 || (ch >= 0x00C0 && ch <= 0x00D6)
4792 || (ch >= 0x00D8 && ch <= 0x00F6)
4793 || (ch >= 0x00F8 && ch <= 0x01F5)
4794 || (ch >= 0x01FA && ch <= 0x0217)
4795 || (ch >= 0x0250 && ch <= 0x02A8)
4796 || (ch >= 0x1E00 && ch <= 0x1E9B)
4797 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4801 || (ch >= 0x0388 && ch <= 0x038A)
4803 || (ch >= 0x038E && ch <= 0x03A1)
4804 || (ch >= 0x03A3 && ch <= 0x03CE)
4805 || (ch >= 0x03D0 && ch <= 0x03D6)
4810 || (ch >= 0x03E2 && ch <= 0x03F3)
4811 || (ch >= 0x1F00 && ch <= 0x1F15)
4812 || (ch >= 0x1F18 && ch <= 0x1F1D)
4813 || (ch >= 0x1F20 && ch <= 0x1F45)
4814 || (ch >= 0x1F48 && ch <= 0x1F4D)
4815 || (ch >= 0x1F50 && ch <= 0x1F57)
4819 || (ch >= 0x1F5F && ch <= 0x1F7D)
4820 || (ch >= 0x1F80 && ch <= 0x1FB4)
4821 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4822 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4823 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4824 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4825 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4826 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4827 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4828 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4830 || (ch >= 0x0401 && ch <= 0x040C)
4831 || (ch >= 0x040E && ch <= 0x044F)
4832 || (ch >= 0x0451 && ch <= 0x045C)
4833 || (ch >= 0x045E && ch <= 0x0481)
4834 || (ch >= 0x0490 && ch <= 0x04C4)
4835 || (ch >= 0x04C7 && ch <= 0x04C8)
4836 || (ch >= 0x04CB && ch <= 0x04CC)
4837 || (ch >= 0x04D0 && ch <= 0x04EB)
4838 || (ch >= 0x04EE && ch <= 0x04F5)
4839 || (ch >= 0x04F8 && ch <= 0x04F9)
4841 || (ch >= 0x0531 && ch <= 0x0556)
4842 || (ch >= 0x0561 && ch <= 0x0587)
4844 || (ch >= 0x05B0 && ch <= 0x05B9)
4845 || (ch >= 0x05BB && ch <= 0x05BD)
4847 || (ch >= 0x05C1 && ch <= 0x05C2)
4848 || (ch >= 0x05D0 && ch <= 0x05EA)
4849 || (ch >= 0x05F0 && ch <= 0x05F2)
4851 || (ch >= 0x0621 && ch <= 0x063A)
4852 || (ch >= 0x0640 && ch <= 0x0652)
4853 || (ch >= 0x0670 && ch <= 0x06B7)
4854 || (ch >= 0x06BA && ch <= 0x06BE)
4855 || (ch >= 0x06C0 && ch <= 0x06CE)
4856 || (ch >= 0x06D0 && ch <= 0x06DC)
4857 || (ch >= 0x06E5 && ch <= 0x06E8)
4858 || (ch >= 0x06EA && ch <= 0x06ED)
4860 || (ch >= 0x0901 && ch <= 0x0903)
4861 || (ch >= 0x0905 && ch <= 0x0939)
4862 || (ch >= 0x093E && ch <= 0x094D)
4863 || (ch >= 0x0950 && ch <= 0x0952)
4864 || (ch >= 0x0958 && ch <= 0x0963)
4866 || (ch >= 0x0981 && ch <= 0x0983)
4867 || (ch >= 0x0985 && ch <= 0x098C)
4868 || (ch >= 0x098F && ch <= 0x0990)
4869 || (ch >= 0x0993 && ch <= 0x09A8)
4870 || (ch >= 0x09AA && ch <= 0x09B0)
4872 || (ch >= 0x09B6 && ch <= 0x09B9)
4873 || (ch >= 0x09BE && ch <= 0x09C4)
4874 || (ch >= 0x09C7 && ch <= 0x09C8)
4875 || (ch >= 0x09CB && ch <= 0x09CD)
4876 || (ch >= 0x09DC && ch <= 0x09DD)
4877 || (ch >= 0x09DF && ch <= 0x09E3)
4878 || (ch >= 0x09F0 && ch <= 0x09F1)
4881 || (ch >= 0x0A05 && ch <= 0x0A0A)
4882 || (ch >= 0x0A0F && ch <= 0x0A10)
4883 || (ch >= 0x0A13 && ch <= 0x0A28)
4884 || (ch >= 0x0A2A && ch <= 0x0A30)
4885 || (ch >= 0x0A32 && ch <= 0x0A33)
4886 || (ch >= 0x0A35 && ch <= 0x0A36)
4887 || (ch >= 0x0A38 && ch <= 0x0A39)
4888 || (ch >= 0x0A3E && ch <= 0x0A42)
4889 || (ch >= 0x0A47 && ch <= 0x0A48)
4890 || (ch >= 0x0A4B && ch <= 0x0A4D)
4891 || (ch >= 0x0A59 && ch <= 0x0A5C)
4895 || (ch >= 0x0A81 && ch <= 0x0A83)
4896 || (ch >= 0x0A85 && ch <= 0x0A8B)
4898 || (ch >= 0x0A8F && ch <= 0x0A91)
4899 || (ch >= 0x0A93 && ch <= 0x0AA8)
4900 || (ch >= 0x0AAA && ch <= 0x0AB0)
4901 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4902 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4903 || (ch >= 0x0ABD && ch <= 0x0AC5)
4904 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4905 || (ch >= 0x0ACB && ch <= 0x0ACD)
4909 || (ch >= 0x0B01 && ch <= 0x0B03)
4910 || (ch >= 0x0B05 && ch <= 0x0B0C)
4911 || (ch >= 0x0B0F && ch <= 0x0B10)
4912 || (ch >= 0x0B13 && ch <= 0x0B28)
4913 || (ch >= 0x0B2A && ch <= 0x0B30)
4914 || (ch >= 0x0B32 && ch <= 0x0B33)
4915 || (ch >= 0x0B36 && ch <= 0x0B39)
4916 || (ch >= 0x0B3E && ch <= 0x0B43)
4917 || (ch >= 0x0B47 && ch <= 0x0B48)
4918 || (ch >= 0x0B4B && ch <= 0x0B4D)
4919 || (ch >= 0x0B5C && ch <= 0x0B5D)
4920 || (ch >= 0x0B5F && ch <= 0x0B61)
4922 || (ch >= 0x0B82 && ch <= 0x0B83)
4923 || (ch >= 0x0B85 && ch <= 0x0B8A)
4924 || (ch >= 0x0B8E && ch <= 0x0B90)
4925 || (ch >= 0x0B92 && ch <= 0x0B95)
4926 || (ch >= 0x0B99 && ch <= 0x0B9A)
4928 || (ch >= 0x0B9E && ch <= 0x0B9F)
4929 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4930 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4931 || (ch >= 0x0BAE && ch <= 0x0BB5)
4932 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4933 || (ch >= 0x0BBE && ch <= 0x0BC2)
4934 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4935 || (ch >= 0x0BCA && ch <= 0x0BCD)
4937 || (ch >= 0x0C01 && ch <= 0x0C03)
4938 || (ch >= 0x0C05 && ch <= 0x0C0C)
4939 || (ch >= 0x0C0E && ch <= 0x0C10)
4940 || (ch >= 0x0C12 && ch <= 0x0C28)
4941 || (ch >= 0x0C2A && ch <= 0x0C33)
4942 || (ch >= 0x0C35 && ch <= 0x0C39)
4943 || (ch >= 0x0C3E && ch <= 0x0C44)
4944 || (ch >= 0x0C46 && ch <= 0x0C48)
4945 || (ch >= 0x0C4A && ch <= 0x0C4D)
4946 || (ch >= 0x0C60 && ch <= 0x0C61)
4948 || (ch >= 0x0C82 && ch <= 0x0C83)
4949 || (ch >= 0x0C85 && ch <= 0x0C8C)
4950 || (ch >= 0x0C8E && ch <= 0x0C90)
4951 || (ch >= 0x0C92 && ch <= 0x0CA8)
4952 || (ch >= 0x0CAA && ch <= 0x0CB3)
4953 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4954 || (ch >= 0x0CBE && ch <= 0x0CC4)
4955 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4956 || (ch >= 0x0CCA && ch <= 0x0CCD)
4958 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4960 || (ch >= 0x0D02 && ch <= 0x0D03)
4961 || (ch >= 0x0D05 && ch <= 0x0D0C)
4962 || (ch >= 0x0D0E && ch <= 0x0D10)
4963 || (ch >= 0x0D12 && ch <= 0x0D28)
4964 || (ch >= 0x0D2A && ch <= 0x0D39)
4965 || (ch >= 0x0D3E && ch <= 0x0D43)
4966 || (ch >= 0x0D46 && ch <= 0x0D48)
4967 || (ch >= 0x0D4A && ch <= 0x0D4D)
4968 || (ch >= 0x0D60 && ch <= 0x0D61)
4970 || (ch >= 0x0E01 && ch <= 0x0E3A)
4971 || (ch >= 0x0E40 && ch <= 0x0E5B)
4973 || (ch >= 0x0E81 && ch <= 0x0E82)
4975 || (ch >= 0x0E87 && ch <= 0x0E88)
4978 || (ch >= 0x0E94 && ch <= 0x0E97)
4979 || (ch >= 0x0E99 && ch <= 0x0E9F)
4980 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4983 || (ch >= 0x0EAA && ch <= 0x0EAB)
4984 || (ch >= 0x0EAD && ch <= 0x0EAE)
4985 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4986 || (ch >= 0x0EBB && ch <= 0x0EBD)
4987 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4989 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4990 || (ch >= 0x0EDC && ch <= 0x0EDD)
4993 || (ch >= 0x0F18 && ch <= 0x0F19)
4997 || (ch >= 0x0F3E && ch <= 0x0F47)
4998 || (ch >= 0x0F49 && ch <= 0x0F69)
4999 || (ch >= 0x0F71 && ch <= 0x0F84)
5000 || (ch >= 0x0F86 && ch <= 0x0F8B)
5001 || (ch >= 0x0F90 && ch <= 0x0F95)
5003 || (ch >= 0x0F99 && ch <= 0x0FAD)
5004 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5007 || (ch >= 0x10A0 && ch <= 0x10C5)
5008 || (ch >= 0x10D0 && ch <= 0x10F6)
5010 || (ch >= 0x3041 && ch <= 0x3093)
5011 || (ch >= 0x309B && ch <= 0x309C)
5013 || (ch >= 0x30A1 && ch <= 0x30F6)
5014 || (ch >= 0x30FB && ch <= 0x30FC)
5016 || (ch >= 0x3105 && ch <= 0x312C)
5017 /* CJK Unified Ideographs */
5018 || (ch >= 0x4E00 && ch <= 0x9FA5)
5020 || (ch >= 0xAC00 && ch <= 0xD7A3)
5022 || (ch >= 0x0660 && ch <= 0x0669)
5023 || (ch >= 0x06F0 && ch <= 0x06F9)
5024 || (ch >= 0x0966 && ch <= 0x096F)
5025 || (ch >= 0x09E6 && ch <= 0x09EF)
5026 || (ch >= 0x0A66 && ch <= 0x0A6F)
5027 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5028 || (ch >= 0x0B66 && ch <= 0x0B6F)
5029 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5030 || (ch >= 0x0C66 && ch <= 0x0C6F)
5031 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5032 || (ch >= 0x0D66 && ch <= 0x0D6F)
5033 || (ch >= 0x0E50 && ch <= 0x0E59)
5034 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5035 || (ch >= 0x0F20 && ch <= 0x0F33)
5036 /* Special characters */
5039 || (ch >= 0x02B0 && ch <= 0x02B8)
5041 || (ch >= 0x02BD && ch <= 0x02C1)
5042 || (ch >= 0x02D0 && ch <= 0x02D1)
5043 || (ch >= 0x02E0 && ch <= 0x02E4)
5049 || (ch >= 0x203F && ch <= 0x2040)
5052 || (ch >= 0x210A && ch <= 0x2113)
5054 || (ch >= 0x2118 && ch <= 0x211D)
5058 || (ch >= 0x212A && ch <= 0x2131)
5059 || (ch >= 0x2133 && ch <= 0x2138)
5060 || (ch >= 0x2160 && ch <= 0x2182)
5061 || (ch >= 0x3005 && ch <= 0x3007)
5062 || (ch >= 0x3021 && ch <= 0x3029)
5064 return UC_IDENTIFIER_START;
5065 return UC_IDENTIFIER_INVALID;
5068 /* The Java Language Specification, 3rd edition, §3.6.
5069 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */
/* True exactly for the five characters space, horizontal tab, form feed,
   line feed and carriage return.  Unlike is_c_whitespace, the vertical
   tab is not included. */
5071 is_java_whitespace (unsigned int ch)
5073 return (ch == ' ' || ch == '\t' || ch == '\f'
5074 || ch == '\n' || ch == '\r');
5077 /* The Java Language Specification, 3rd edition, §3.8.
5078 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
5079 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
/* Classify CH for use in a Java identifier.  The tests run in a fixed
   order and the first match wins:
   1. general categories L, Nl, Sc, Pc give UC_IDENTIFIER_START,
   2. general categories Nd, Mc, Mn give UC_IDENTIFIER_VALID,
   3. the control ranges 0x0000 .. 0x0008, 0x000E .. 0x001B and
      0x007F .. 0x009F, and general category Cf, give
      UC_IDENTIFIER_IGNORABLE,
   4. everything else gives UC_IDENTIFIER_INVALID.
   Each of the three conditions may continue on a line not visible in this
   excerpt. */
5081 java_ident_category (unsigned int ch)
5083 /* FIXME: Check this against Sun's JDK implementation. */
5084 if (is_category_L (ch) /* = Character.isLetter(ch) */
5085 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5086 || is_category_Sc (ch) /* currency symbol */
5087 || is_category_Pc (ch) /* connector punctuation */
5089 return UC_IDENTIFIER_START;
5090 if (is_category_Nd (ch) /* digit */
5091 || is_category_Mc (ch) /* combining mark */
5092 || is_category_Mn (ch) /* non-spacing mark */
5094 return UC_IDENTIFIER_VALID;
/* The lower bound test against 0x0000 is always true for an unsigned ch;
   it only documents the range. */
5095 if ((ch >= 0x0000 && ch <= 0x0008)
5096 || (ch >= 0x000E && ch <= 0x001B)
5097 || (ch >= 0x007F && ch <= 0x009F)
5098 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5100 return UC_IDENTIFIER_IGNORABLE;
5101 return UC_IDENTIFIER_INVALID;
5104 /* Construction of sparse 3-level tables. */
/* Template parameters: the table type is named identsyntax_table, its
   elements are uint8_t, and the default element is UC_IDENTIFIER_INVALID.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the include that consumes these macros is not visible in
   this excerpt; identsyntax_table_init, identsyntax_table_add and
   identsyntax_table_finalize presumably come from it. */
5105 #define TABLE identsyntax_table
5106 #define ELEMENT uint8_t
5107 #define DEFAULT UC_IDENTIFIER_INVALID
5108 #define xmalloc malloc
5109 #define xrealloc realloc
5112 /* Output an identifier syntax categorization in a three-level bitmap. */
/* Writes to FILENAME a generated C fragment that defines a static constant
   table called NAME.  PREDICATE is called for every code point below
   0x110000 and returns its identifier syntax code; VERSION is the Unicode
   version quoted in the generated banner comment.
   NOTE(review): this excerpt does not show the declarations of stream, ch
   and i, the setup of t.p and t.q, the left-hand sides of the three offset
   computations, the conditions choosing between the -1 and index outputs,
   or the statements following the two error messages -- confirm against
   the full file.  */
5114 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5118 struct identsyntax_table t;
5119 unsigned int level1_offset, level2_offset, level3_offset;
5121 stream = fopen (filename, "w");
5124 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5128 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5129 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5130 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5135 identsyntax_table_init (&t);
/* Only code points whose code differs from UC_IDENTIFIER_INVALID are
   entered into the table.  */
5137 for (ch = 0; ch < 0x110000; ch++)
5139 int syntaxcode = predicate (ch);
5140 if (syntaxcode != UC_IDENTIFIER_INVALID)
5141 identsyntax_table_add (&t, ch, syntaxcode);
5144 identsyntax_table_finalize (&t);
5146 /* Offsets in t.result, in memory of this process. */
/* These are byte offsets: five uint32_t header words come first, followed
   by the level 1 entries and then the level 2 entries.  */
5148 5 * sizeof (uint32_t);
5150 5 * sizeof (uint32_t)
5151 + t.level1_size * sizeof (uint32_t);
5153 5 * sizeof (uint32_t)
5154 + t.level1_size * sizeof (uint32_t)
5155 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words are emitted as preprocessor constants.  */
5157 for (i = 0; i < 5; i++)
5158 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5159 ((uint32_t *) t.result)[i]);
5160 fprintf (stream, "static const\n");
5161 fprintf (stream, "struct\n");
5162 fprintf (stream, " {\n");
5163 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5164 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5165 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5166 (1 << t.p) * 2 / 16);
5167 fprintf (stream, " }\n");
5168 fprintf (stream, "%s =\n", name);
5169 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; a stored byte offset is printed
   as an index in uint32_t units relative to the start of level 2.  */
5170 fprintf (stream, " {");
5171 if (t.level1_size > 8)
5172 fprintf (stream, "\n ");
5173 for (i = 0; i < t.level1_size; i++)
5176 if (i > 0 && (i % 8) == 0)
5177 fprintf (stream, "\n ");
5178 offset = ((uint32_t *) (t.result + level1_offset))[i];
5180 fprintf (stream, " %5d", -1);
5182 fprintf (stream, " %5zu",
5183 (offset - level2_offset) / sizeof (uint32_t));
5184 if (i+1 < t.level1_size)
5185 fprintf (stream, ",");
5187 if (t.level1_size > 8)
5188 fprintf (stream, "\n ");
5189 fprintf (stream, " },\n");
/* Level 2: same layout; a stored byte offset is printed as an index in
   uint8_t units relative to the start of level 3.  */
5190 fprintf (stream, " {");
5191 if (t.level2_size << t.q > 8)
5192 fprintf (stream, "\n ");
5193 for (i = 0; i < t.level2_size << t.q; i++)
5196 if (i > 0 && (i % 8) == 0)
5197 fprintf (stream, "\n ");
5198 offset = ((uint32_t *) (t.result + level2_offset))[i];
5200 fprintf (stream, " %5d", -1);
5202 fprintf (stream, " %5zu",
5203 (offset - level3_offset) / sizeof (uint8_t));
5204 if (i+1 < t.level2_size << t.q)
5205 fprintf (stream, ",");
5207 if (t.level2_size << t.q > 8)
5208 fprintf (stream, "\n ");
5209 fprintf (stream, " },\n");
5210 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Every 16-bit output word combines eight consecutive level 3 bytes; the
   byte with the lowest index lands in the two lowest bits.  */
5211 fprintf (stream, " {");
5212 if ((t.level3_size << t.p) * 2 / 16 > 8)
5213 fprintf (stream, "\n ");
5214 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5216 if (i > 0 && (i % 8) == 0)
5217 fprintf (stream, "\n ");
5218 fprintf (stream, " 0x%04x",
5219 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5220 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5221 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5222 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5223 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5224 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5225 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5226 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5227 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5228 fprintf (stream, ",");
5230 if ((t.level3_size << t.p) * 2 / 16 > 8)
5231 fprintf (stream, "\n ");
5232 fprintf (stream, " }\n");
5233 fprintf (stream, "};\n");
/* A failed write or a failed close is reported the same way.  */
5235 if (ferror (stream) || fclose (stream))
5237 fprintf (stderr, "error writing to '%s'\n", filename);
5243 output_ident_properties (const char *version)
5245 #define PROPERTY(P) \
5246 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5247 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5248 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5249 PROPERTY(c_whitespace)
5250 PROPERTY(java_whitespace)
5253 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5254 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5257 /* ========================================================================= */
5259 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
5260 glibc/localedata/locales/i18n file, generated by
5261 glibc/localedata/gen-unicode-ctype.c. */
5263 /* Character mappings. */
5266 to_upper (unsigned int ch)
5268 if (unicode_attributes[ch].name != NULL
5269 && unicode_attributes[ch].upper != NONE)
5270 return unicode_attributes[ch].upper;
5276 to_lower (unsigned int ch)
5278 if (unicode_attributes[ch].name != NULL
5279 && unicode_attributes[ch].lower != NONE)
5280 return unicode_attributes[ch].lower;
5286 to_title (unsigned int ch)
5288 if (unicode_attributes[ch].name != NULL
5289 && unicode_attributes[ch].title != NONE)
5290 return unicode_attributes[ch].title;
5295 /* Character class properties. */
/* Returns true if CH is uppercase, that is, if to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Returns true if CH is lowercase, that is, if to_upper changes it, or if
   it is the one lowercase letter handled as a special case below.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
5312 is_alpha (unsigned int ch)
5314 return (unicode_attributes[ch].name != NULL
5315 && ((unicode_attributes[ch].category[0] == 'L'
5316 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5317 <U0E2F>, <U0E46> should belong to is_punct. */
5318 && (ch != 0x0E2F) && (ch != 0x0E46))
5319 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5320 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5322 || (ch >= 0x0E34 && ch <= 0x0E3A)
5323 || (ch >= 0x0E47 && ch <= 0x0E4E)
5324 /* Avoid warning for <U0345>. */
5326 /* Avoid warnings for <U2160>..<U217F>. */
5327 || (unicode_attributes[ch].category[0] == 'N'
5328 && unicode_attributes[ch].category[1] == 'l')
5329 /* Avoid warnings for <U24B6>..<U24E9>. */
5330 || (unicode_attributes[ch].category[0] == 'S'
5331 && unicode_attributes[ch].category[1] == 'o'
5332 && strstr (unicode_attributes[ch].name, " LETTER ")
5334 /* Consider all the non-ASCII digits as alphabetic.
5335 ISO C 99 forbids us to have them in category "digit",
5336 but we want iswalnum to return true on them. */
5337 || (unicode_attributes[ch].category[0] == 'N'
5338 && unicode_attributes[ch].category[1] == 'd'
5339 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Returns true if CH belongs to the "digit" class, which is restricted to
   the ten ASCII decimal digits.  The wider, category based definition is
   kept for reference only and is not compiled.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Returns true if CH belongs to the "outdigit" class: the ten ASCII decimal
   digits.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Returns true if CH belongs to the "alnum" class: the union of the "alpha"
   and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
5377 is_blank (unsigned int ch)
5379 return (ch == 0x0009 /* '\t' */
5380 /* Category Zs without mention of "<noBreak>" */
5381 || (unicode_attributes[ch].name != NULL
5382 && unicode_attributes[ch].category[0] == 'Z'
5383 && unicode_attributes[ch].category[1] == 's'
5384 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5388 is_space (unsigned int ch)
5390 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5391 should treat it like a punctuation character, not like a space. */
5392 return (ch == 0x0020 /* ' ' */
5393 || ch == 0x000C /* '\f' */
5394 || ch == 0x000A /* '\n' */
5395 || ch == 0x000D /* '\r' */
5396 || ch == 0x0009 /* '\t' */
5397 || ch == 0x000B /* '\v' */
5398 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5399 || (unicode_attributes[ch].name != NULL
5400 && unicode_attributes[ch].category[0] == 'Z'
5401 && (unicode_attributes[ch].category[1] == 'l'
5402 || unicode_attributes[ch].category[1] == 'p'
5403 || (unicode_attributes[ch].category[1] == 's'
5404 && !strstr (unicode_attributes[ch].decomposition,
5409 is_cntrl (unsigned int ch)
5411 return (unicode_attributes[ch].name != NULL
5412 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5413 /* Categories Zl and Zp */
5414 || (unicode_attributes[ch].category[0] == 'Z'
5415 && (unicode_attributes[ch].category[1] == 'l'
5416 || unicode_attributes[ch].category[1] == 'p'))));
/* Returns true if CH belongs to the "xdigit" class, which is restricted to
   the ASCII hexadecimal digits 0-9, A-F and a-f.  The definition built on
   is_digit is kept for reference only and is not compiled.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
5442 is_graph (unsigned int ch)
5444 return (unicode_attributes[ch].name != NULL
5445 && strcmp (unicode_attributes[ch].name, "<control>")
5450 is_print (unsigned int ch)
5452 return (unicode_attributes[ch].name != NULL
5453 && strcmp (unicode_attributes[ch].name, "<control>")
5454 /* Categories Zl and Zp */
5455 && !(unicode_attributes[ch].name != NULL
5456 && unicode_attributes[ch].category[0] == 'Z'
5457 && (unicode_attributes[ch].category[1] == 'l'
5458 || unicode_attributes[ch].category[1] == 'p')));
/* Returns true if CH belongs to the "punct" class: every "graph" character
   that is neither "alpha" nor "digit".  The definition based on the general
   category P* is kept for reference only and is not compiled.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
5474 /* Output all properties. */
/* For each property P the helper macro emits a debug listing
   "unictype/ctype_P.txt", a test file "../tests/unictype/test-ctype_P.c" and
   a table header "unictype/ctype_P.h" built from the predicate is_P; VERSION
   is passed on to output_predicate.
   NOTE(review): the PROPERTY(...) invocations and the end of the function
   are not visible in this excerpt -- confirm which properties are
   emitted.  */
5476 output_old_ctype (const char *version)
5478 #define PROPERTY(P) \
5479 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5480 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5481 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5500 is_combining (unsigned int ch)
5502 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5503 file. In 3.0.1 it was identical to the union of the general categories
5504 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5505 PropList.txt file, so we take the latter definition. */
5506 return (unicode_attributes[ch].name != NULL
5507 && unicode_attributes[ch].category[0] == 'M'
5508 && (unicode_attributes[ch].category[1] == 'n'
5509 || unicode_attributes[ch].category[1] == 'c'
5510 || unicode_attributes[ch].category[1] == 'e'));
5514 is_combining_level3 (unsigned int ch)
5516 return is_combining (ch)
5517 && !(unicode_attributes[ch].combining[0] != '\0'
5518 && unicode_attributes[ch].combining[0] != '0'
5519 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   The result is "<UXXXX>" for I below 0x10000 and "<UXXXXXXXX>" otherwise.
   It lives in a static buffer that the next call overwrites, so copy it
   before calling this function again.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Room for the longest form: "<U", 8 hex digits, ">" and the NUL.  */
  static char buf[11+1];

  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval.
   The result is the symbol of LOW, "..", and the symbol of HIGH.  It lives
   in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters, the 2 character separator and
     the NUL.  */
  static char buf[24+1];

  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
5544 /* Output a character class (= property) table. */
/* Writes to STREAM the name CLASSNAME followed by the code points below
   0x110000 for which FUNC returns true.  Members are formatted with
   ucs_symbol or ucs_symbol_range, separated by ';', and the list is
   continued on a new line after a "/" whenever the next item would pass
   max_column.
   NOTE(review): the declarations and updates of i, column and buf, the
   handling of non-members and the choice between the two strcpy calls are
   not visible in this excerpt -- confirm against the full file.  */
5547 output_charclass (FILE *stream, const char *classname,
5548 bool (*func) (unsigned int))
/* One flag per code point; note that this array is an automatic variable of
   more than one megabyte.  */
5550 char table[0x110000];
5552 bool need_semicolon;
5553 const int max_column = 75;
5556 for (i = 0; i < 0x110000; i++)
5557 table[i] = (int) func (i);
5559 fprintf (stream, "%s ", classname);
5560 need_semicolon = false;
5562 for (i = 0; i < 0x110000; )
5568 unsigned int low, high;
/* Advance over a run of consecutive members.  */
5574 while (i < 0x110000 && table[i]);
5578 strcpy (buf, ucs_symbol (low));
5580 strcpy (buf, ucs_symbol_range (low, high));
5584 fprintf (stream, ";");
5588 if (column + strlen (buf) > max_column)
5590 fprintf (stream, "/\n ");
5594 fprintf (stream, "%s", buf);
5595 column += strlen (buf);
5596 need_semicolon = true;
5599 fprintf (stream, "\n");
5602 /* Output a character mapping table. */
/* Writes to STREAM the name MAPNAME followed by one item for every code
   point below 0x110000 that FUNC maps to a different value; each item is
   built from the symbols of the code point and of its image.  Items are
   separated by ';' and the list is continued on a new line after a "/"
   whenever the next item would pass max_column.
   NOTE(review): the declarations and updates of i, column and buf, the test
   on table[i] and the literal parts of each item are not visible in this
   excerpt -- confirm against the full file.  */
5605 output_charmap (FILE *stream, const char *mapname,
5606 unsigned int (*func) (unsigned int))
/* One flag per code point; note that this array is an automatic variable of
   more than one megabyte.  */
5608 char table[0x110000];
5610 bool need_semicolon;
5611 const int max_column = 75;
5614 for (i = 0; i < 0x110000; i++)
5615 table[i] = (func (i) != i);
5617 fprintf (stream, "%s ", mapname);
5618 need_semicolon = false;
5620 for (i = 0; i < 0x110000; i++)
5626 strcat (buf, ucs_symbol (i));
5628 strcat (buf, ucs_symbol (func (i)));
5633 fprintf (stream, ";");
5637 if (column + strlen (buf) > max_column)
5639 fprintf (stream, "/\n ");
5643 fprintf (stream, "%s", buf);
5644 column += strlen (buf);
5645 need_semicolon = true;
5647 fprintf (stream, "\n");
/* Output the width table.
   This is a placeholder: nothing is written to STREAM.  output_tables calls
   it between the character maps and the "END LC_CTYPE" line.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
5657 /* Output the tables to the given file. */
/* Writes to FILENAME a locale definition with an LC_IDENTIFICATION section
   (title and revision taken from VERSION, date taken from the current time
   in UTC) and an LC_CTYPE section holding the character classes and
   mappings.  Before LC_CTYPE is written, every code point below 0x110000 is
   checked against the restrictions quoted in the comments below; each
   violation is reported on stderr and does not stop the output.
   NOTE(review): the declarations of stream, ch, now and date, the first
   lines of several fprintf calls and the statements following the two
   error messages are not visible in this excerpt -- confirm against the
   full file.  */
5660 output_tables (const char *filename, const char *version)
5665 stream = fopen (filename, "w");
5668 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5672 fprintf (stream, "escape_char /\n");
5673 fprintf (stream, "comment_char %%\n");
5674 fprintf (stream, "\n");
5675 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
5677 fprintf (stream, "\n");
5679 fprintf (stream, "LC_IDENTIFICATION\n");
5680 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5681 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5682 fprintf (stream, "address \"\"\n");
5683 fprintf (stream, "contact \"\"\n");
5684 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5685 fprintf (stream, "tel \"\"\n");
5686 fprintf (stream, "fax \"\"\n");
5687 fprintf (stream, "language \"\"\n");
5688 fprintf (stream, "territory \"Earth\"\n");
5689 fprintf (stream, "revision \"%s\"\n", version);
5694 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5695 fprintf (stream, "date \"%s\"\n", date);
5697 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5698 fprintf (stream, "END LC_IDENTIFICATION\n");
5699 fprintf (stream, "\n");
/* Consistency checks between the class predicates and the mappings.  */
5702 for (ch = 0; ch < 0x110000; ch++)
5704 /* toupper restriction: "Only characters specified for the keywords
5705 lower and upper shall be specified." */
5706 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5708 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5709 ucs_symbol (ch), ch, to_upper (ch));
5711 /* tolower restriction: "Only characters specified for the keywords
5712 lower and upper shall be specified." */
5713 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5715 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5716 ucs_symbol (ch), ch, to_lower (ch));
5718 /* alpha restriction: "Characters classified as either upper or lower
5719 shall automatically belong to this class." */
5720 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5721 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5723 /* alpha restriction: "No character specified for the keywords cntrl,
5724 digit, punct or space shall be specified." */
5725 if (is_alpha (ch) && is_cntrl (ch))
5726 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5727 if (is_alpha (ch) && is_digit (ch))
5728 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5729 if (is_alpha (ch) && is_punct (ch))
5730 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5731 if (is_alpha (ch) && is_space (ch))
5732 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5734 /* space restriction: "No character specified for the keywords upper,
5735 lower, alpha, digit, graph or xdigit shall be specified."
5736 upper, lower, alpha already checked above. */
5737 if (is_space (ch) && is_digit (ch))
5738 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5739 if (is_space (ch) && is_graph (ch))
5740 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5741 if (is_space (ch) && is_xdigit (ch))
5742 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5744 /* cntrl restriction: "No character specified for the keywords upper,
5745 lower, alpha, digit, punct, graph, print or xdigit shall be
5746 specified." upper, lower, alpha already checked above. */
5747 if (is_cntrl (ch) && is_digit (ch))
5748 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5749 if (is_cntrl (ch) && is_punct (ch))
5750 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5751 if (is_cntrl (ch) && is_graph (ch))
5752 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5753 if (is_cntrl (ch) && is_print (ch))
5754 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5755 if (is_cntrl (ch) && is_xdigit (ch))
5756 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5758 /* punct restriction: "No character specified for the keywords upper,
5759 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5760 be specified." upper, lower, alpha, cntrl already checked above. */
5761 if (is_punct (ch) && is_digit (ch))
5762 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5763 if (is_punct (ch) && is_xdigit (ch))
5764 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5765 if (is_punct (ch) && (ch == 0x0020))
5766 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5768 /* graph restriction: "No character specified for the keyword cntrl
5769 shall be specified." Already checked above. */
5771 /* print restriction: "No character specified for the keyword cntrl
5772 shall be specified." Already checked above. */
5774 /* graph - print relation: differ only in the <space> character.
5775 How is this possible if there are more than one space character?!
5776 I think susv2/xbd/locale.html should speak of "space characters",
5777 not "space character". */
/* The first check accepts any is_space character, the second one insists
   on U+0020 only.  */
5778 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5780 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5781 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5783 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The section proper: classes first, then the mappings.  */
5786 fprintf (stream, "LC_CTYPE\n");
5787 output_charclass (stream, "upper", is_upper);
5788 output_charclass (stream, "lower", is_lower);
5789 output_charclass (stream, "alpha", is_alpha);
5790 output_charclass (stream, "digit", is_digit);
5791 output_charclass (stream, "outdigit", is_outdigit);
5792 output_charclass (stream, "blank", is_blank);
5793 output_charclass (stream, "space", is_space);
5794 output_charclass (stream, "cntrl", is_cntrl);
5795 output_charclass (stream, "punct", is_punct);
5796 output_charclass (stream, "xdigit", is_xdigit);
5797 output_charclass (stream, "graph", is_graph);
5798 output_charclass (stream, "print", is_print);
5799 output_charclass (stream, "class \"combining\";", is_combining);
5800 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5801 output_charmap (stream, "toupper", to_upper);
5802 output_charmap (stream, "tolower", to_lower);
5803 output_charmap (stream, "map \"totitle\";", to_title);
5804 output_widthmap (stream);
5805 fprintf (stream, "END LC_CTYPE\n");
/* A failed write or a failed close is reported the same way.  */
5807 if (ferror (stream) || fclose (stream))
5809 fprintf (stderr, "error writing to '%s'\n", filename);
5816 /* ========================================================================= */
/* The width property from the EastAsianWidth.txt file, indexed by code
   point.  Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   The entries are set by fill_width.  */
const char * unicode_width[0x110000];
5822 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
   file named WIDTH_FILENAME.  Every code point that has an entry in
   unicode_attributes[] starts out as "N", all others as NULL; the values
   read from the file then replace these defaults.  */
/* NOTE(review): the declarations of stream, i, j, n, c and lineno, the
   reading loop with its comment and short line tests, and the loop that
   walks a range are not visible in this excerpt -- confirm against the full
   file.  The results of strdup are stored without a NULL check.  */
5825 fill_width (const char *width_filename)
5829 char field0[FIELDLEN];
5830 char field1[FIELDLEN];
5831 char field2[FIELDLEN];
5834 for (i = 0; i < 0x110000; i++)
5835 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5837 stream = fopen (width_filename, "r");
5840 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Discard the rest of the current input line.  */
5855 do c = getc (stream); while (c != EOF && c != '\n');
/* field0 holds a code point or a range, field1 the width value and field2
   the remainder of the line.  */
5859 n = getfield (stream, field0, ';');
5860 n += getfield (stream, field1, ' ');
5861 n += getfield (stream, field2, '\n');
5866 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
/* The start (or only) code point is given in hexadecimal.  */
5869 i = strtoul (field0, NULL, 16);
5870 if (strstr (field0, "..") != NULL)
5872 /* Deal with a range. */
5873 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5875 unicode_width[i] = strdup (field1);
5879 /* Single character line. */
5880 unicode_width[i] = strdup (field1);
5884 if (ferror (stream) || fclose (stream))
5886 fprintf (stderr, "error reading from '%s'\n", width_filename);
5891 /* ========================================================================= */
5893 /* Non-spacing attribute and width. */
5895 /* The non-spacing attribute table consists of:
5896 - Non-spacing characters; generated from PropList.txt or
5897 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
5898 - Format control characters; generated from
5899 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
5900 - Zero width characters; generated from
5901 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
5905 is_nonspacing (unsigned int ch)
5907 return (unicode_attributes[ch].name != NULL
5908 && (get_bidi_category (ch) == UC_BIDI_NSM
5909 || is_category_Cc (ch) || is_category_Cf (ch)
5910 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes to FILENAME two C arrays describing is_nonspacing.  The code space
   is cut into blocks of 0x200 code points.  nonspacing_table_data holds a
   64 byte bitmap (one bit per code point) for each block that gets an
   index; nonspacing_table_ind holds, per block, the value of ind[], which
   is the number of that block's bitmap when it has one.
   NOTE(review): the declarations of stream, i, j, k, l, ch, i_max and
   next_ind, the value stored in ind[] for blocks without a bitmap, the
   updates of nontrivial, bits and i_max, and the statements following the
   two error messages are not visible in this excerpt -- confirm against
   the full file.  */
5914 output_nonspacing_property (const char *filename)
5917 int ind[0x110000 / 0x200];
5922 stream = fopen (filename, "w");
5925 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* First pass: find the blocks that contain a non-spacing character and
   number them.  */
5930 for (i = 0; i < 0x110000 / 0x200; i++)
5932 bool nontrivial = false;
5935 if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
5936 for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
5937 if (is_nonspacing (ch))
5943 ind[i] = next_ind++;
5948 fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
/* Second pass: print the bitmap of every numbered block, eight bytes per
   output line; each byte covers eight consecutive code points.  */
5951 for (i = 0; i < 0x110000 / 0x200; i++)
5953 bool nontrivial = (ind[i] >= 0);
5959 fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
5960 for (j = 0; j < 8; j++)
5964 fprintf (stream, " ");
5965 for (k = 0; k < 8; k++)
5968 unsigned char bits = 0;
5970 for (l = 0; l < 8; l++)
5972 unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
5974 if (is_nonspacing (ch))
/* The very last byte of the array is followed by a space instead of a
   comma.  */
5977 fprintf (stream, " 0x%02x%c", bits,
5978 ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
5980 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5981 i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
5986 fprintf (stream, "};\n");
/* Round i_max up to a multiple of 8, so that the index array is printed in
   full rows of eight entries.  */
5988 i_max = ((i_max + 8 - 1) / 8) * 8;
5989 fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
5994 for (j = 0; j < i_max / 8; j++)
5998 fprintf (stream, " ");
5999 for (k = 0; k < 8; k++)
6002 fprintf (stream, " %2d%c", ind[i],
6003 j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
6005 fprintf (stream, " /* 0x%04x-0x%04x */\n",
6006 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
6009 fprintf (stream, "};\n");
6011 if (ferror (stream) || fclose (stream))
6013 fprintf (stderr, "error writing to '%s'\n", filename);
6018 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
/* The tests run in this order: unassigned code value (with special ranges
   for private use and CJK ideographs), control or non-spacing character,
   East Asian width "W" or "F", East Asian width "H", and finally the BMP
   range starting at U+00A1.
   NOTE(review): none of the return statements is visible in this excerpt,
   so which of the five values each test yields cannot be stated here --
   confirm against the full file.  */
6020 symbolic_width (unsigned int ch)
6022 /* Test for unassigned character. */
6023 if (is_property_unassigned_code_value (ch))
6025 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6026 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6028 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6029 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6030 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6031 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6032 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6038 /* Test for non-spacing or control character. */
/* Control characters are singled out only below U+00A0.  */
6039 if (is_category_Cc (ch) && ch < 0x00A0)
6041 if (is_nonspacing (ch))
6043 /* Test for double-width character. */
6044 if (unicode_width[ch] != NULL
6045 && (strcmp (unicode_width[ch], "W") == 0
6046 || strcmp (unicode_width[ch], "F") == 0))
6048 /* Test for half-width character. */
6049 if (unicode_width[ch] != NULL
6050 && strcmp (unicode_width[ch], "H") == 0)
6053 /* In ancient CJK encodings, Cyrillic and most other characters are
6054 double-width as well. */
6055 if (ch >= 0x00A1 && ch < 0x10000)
/* Writes to FILENAME the result of symbolic_width for all code points below
   0x110000, grouped into intervals.  Code points for which symbolic_width
   returns 0 are skipped.  An interval is printed as "START", two tabs and
   the value when it holds one code point, and as "START..END", one tab and
   the value otherwise; code points are printed as at least four uppercase
   hexadecimal digits.
   NOTE(review): the declaration of stream, the initial value of
   interval_value, the statement that extends an interval, the else branches
   and the statements following the two error messages are not visible in
   this excerpt -- confirm against the full file.  */
6061 output_width_property_test (const char *filename)
6064 unsigned int interval_start, interval_end, ch;
6065 char interval_value;
6067 stream = fopen (filename, "w");
6070 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6075 interval_start = interval_end = 0; /* avoid GCC warning */
6076 for (ch = 0; ch < 0x110000; ch++)
6078 char value = symbolic_width (ch);
6079 if (value != 0) /* skip Cc control characters and unassigned characters */
6081 if (value == interval_value)
6082 /* Extend the interval. */
6086 /* Terminate the interval. */
/* An interval_value of 0 means that no interval is open.  */
6087 if (interval_value != 0)
6089 if (interval_end == interval_start)
6090 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6092 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
6094 /* Start a new interval. */
6095 interval_start = interval_end = ch;
6096 interval_value = value;
6100 /* Terminate the last interval. */
6101 if (interval_value != 0)
6103 if (interval_end == interval_start)
6104 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
6106 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
6109 if (ferror (stream) || fclose (stream))
6111 fprintf (stderr, "error writing to '%s'\n", filename);
6116 /* ========================================================================= */
6118 /* Line breaking classification.
6119 Updated for Unicode TR #14 revision 26. */
/* Each LBP_* constant numbers one line breaking class.  get_lbp reports a
   class by setting bit (int64_t) 1 << LBP_xx in its result, so all values
   must stay below 64.  The values 0..24 and 25..32 are each used exactly
   once; the constants are listed by topic, not by value.
   NOTE(review): the lines that open and close this enumeration are not
   visible in this excerpt.  */
6123 /* Values >= 25 are resolved at run time. */
6124 LBP_BK = 25, /* mandatory break */
6125 /*LBP_CR, carriage return - not used here because it's a DOSism */
6126 /*LBP_LF, line feed - not used here because it's a DOSism */
6127 LBP_CM = 26, /* attached characters and combining marks */
6128 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
6129 /*LBP_SG, surrogates - not used here because they are not characters */
6130 LBP_WJ = 0, /* word joiner */
6131 LBP_ZW = 27, /* zero width space */
6132 LBP_GL = 1, /* non-breaking (glue) */
6133 LBP_SP = 28, /* space */
6134 LBP_B2 = 2, /* break opportunity before and after */
6135 LBP_BA = 3, /* break opportunity after */
6136 LBP_BB = 4, /* break opportunity before */
6137 LBP_HY = 5, /* hyphen */
6138 LBP_CB = 29, /* contingent break opportunity */
6139 LBP_CL = 6, /* closing punctuation */
6140 LBP_CP = 7, /* closing parenthesis */
6141 LBP_EX = 8, /* exclamation/interrogation */
6142 LBP_IN = 9, /* inseparable */
6143 LBP_NS = 10, /* non starter */
6144 LBP_OP = 11, /* opening punctuation */
6145 LBP_QU = 12, /* ambiguous quotation */
6146 LBP_IS = 13, /* infix separator (numeric) */
6147 LBP_NU = 14, /* numeric */
6148 LBP_PO = 15, /* postfix (numeric) */
6149 LBP_PR = 16, /* prefix (numeric) */
6150 LBP_SY = 17, /* symbols allowing breaks */
6151 LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
6152 LBP_AL = 18, /* ordinary alphabetic and symbol characters */
6153 LBP_H2 = 19, /* Hangul LV syllable */
6154 LBP_H3 = 20, /* Hangul LVT syllable */
6155 LBP_ID = 21, /* ideographic */
6156 LBP_JL = 22, /* Hangul L Jamo */
6157 LBP_JV = 23, /* Hangul V Jamo */
6158 LBP_JT = 24, /* Hangul T Jamo */
6159 LBP_SA = 31, /* complex context (South East Asian) */
6160 LBP_XX = 32 /* unknown */
6163 /* Returns the line breaking classification for ch, as a bit mask. */
6165 get_lbp (unsigned int ch)
6169 if (unicode_attributes[ch].name != NULL)
6171 /* mandatory break */
6172 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6173 || ch == 0x000C /* form feed */
6174 || ch == 0x000B /* line tabulation */
6175 || ch == 0x2028 /* LINE SEPARATOR */
6176 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6177 attr |= (int64_t) 1 << LBP_BK;
6179 if (ch == 0x2060 /* WORD JOINER */
6180 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6181 attr |= (int64_t) 1 << LBP_WJ;
6183 /* zero width space */
6184 if (ch == 0x200B /* ZERO WIDTH SPACE */)
6185 attr |= (int64_t) 1 << LBP_ZW;
6187 /* non-breaking (glue) */
6188 if (ch == 0x00A0 /* NO-BREAK SPACE */
6189 || ch == 0x202F /* NARROW NO-BREAK SPACE */
6190 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6191 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6192 || ch == 0x2007 /* FIGURE SPACE */
6193 || ch == 0x2011 /* NON-BREAKING HYPHEN */
6194 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6195 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6196 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6197 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6198 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6199 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6200 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6201 attr |= (int64_t) 1 << LBP_GL;
6204 if (ch == 0x0020 /* SPACE */)
6205 attr |= (int64_t) 1 << LBP_SP;
6207 /* break opportunity before and after */
6208 if (ch == 0x2014 /* EM DASH */)
6209 attr |= (int64_t) 1 << LBP_B2;
6211 /* break opportunity after */
6212 if (/* Breaking Spaces */
6213 ch == 0x1680 /* OGHAM SPACE MARK */
6214 || ch == 0x2000 /* EN QUAD */
6215 || ch == 0x2001 /* EM QUAD */
6216 || ch == 0x2002 /* EN SPACE */
6217 || ch == 0x2003 /* EM SPACE */
6218 || ch == 0x2004 /* THREE-PER-EM SPACE */
6219 || ch == 0x2005 /* FOUR-PER-EM SPACE */
6220 || ch == 0x2006 /* SIX-PER-EM SPACE */
6221 || ch == 0x2008 /* PUNCTUATION SPACE */
6222 || ch == 0x2009 /* THIN SPACE */
6223 || ch == 0x200A /* HAIR SPACE */
6224 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6226 || ch == 0x0009 /* tab */
6227 /* Conditional Hyphens */
6228 || ch == 0x00AD /* SOFT HYPHEN */
6229 /* Breaking Hyphens */
6230 || ch == 0x058A /* ARMENIAN HYPHEN */
6231 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6232 || ch == 0x2010 /* HYPHEN */
6233 || ch == 0x2012 /* FIGURE DASH */
6234 || ch == 0x2013 /* EN DASH */
6235 /* Visible Word Dividers */
6236 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6237 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6238 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6239 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6240 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6241 || ch == 0x2027 /* HYPHENATION POINT */
6242 || ch == 0x007C /* VERTICAL LINE */
6243 /* Historic Word Separators */
6244 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6245 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6246 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6247 || ch == 0x2056 /* THREE DOT PUNCTUATION */
6248 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6249 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6250 || ch == 0x205A /* TWO DOT PUNCTUATION */
6251 || ch == 0x205B /* FOUR DOT MARK */
6252 || ch == 0x205D /* TRICOLON */
6253 || ch == 0x205E /* VERTICAL FOUR DOTS */
6254 || ch == 0x2E19 /* PALM BRANCH */
6255 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6256 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6257 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6258 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
6259 || ch == 0x2E30 /* RING POINT */
6260 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6261 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6262 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6263 || ch == 0x10102 /* AEGEAN CHECK MARK */
6264 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6265 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6266 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6267 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6269 || ch == 0x0964 /* DEVANAGARI DANDA */
6270 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6271 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6272 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6273 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6274 || ch == 0x104B /* MYANMAR SIGN SECTION */
6275 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6276 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6277 || ch == 0x17D4 /* KHMER SIGN KHAN */
6278 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6279 || ch == 0x1B5E /* BALINESE CARIK SIKI */
6280 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6281 || ch == 0xA8CE /* SAURASHTRA DANDA */
6282 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6283 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6284 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6285 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6286 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6287 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6289 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6290 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6291 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6292 || ch == 0x0FBE /* TIBETAN KU RU KHA */
6293 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6294 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6295 /* Other Terminating Punctuation */
6296 || ch == 0x1804 /* MONGOLIAN COLON */
6297 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6298 || ch == 0x1B5A /* BALINESE PANTI */
6299 || ch == 0x1B5B /* BALINESE PAMADA */
6300 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6301 || ch == 0x1B60 /* BALINESE PAMENENG */
6302 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6303 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6304 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6305 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6306 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6307 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6308 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6309 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6310 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6311 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6312 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6313 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6314 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6315 || ch == 0xA60D /* VAI COMMA */
6316 || ch == 0xA60F /* VAI QUESTION MARK */
6317 || ch == 0xA92E /* KAYAH LI SIGN CWI */
6318 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6319 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6320 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6321 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6322 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6323 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6324 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6325 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6326 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6327 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6328 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6329 || ch == 0xA6F3 /* BAMUM FULL STOP */
6330 || ch == 0xA6F4 /* BAMUM COLON */
6331 || ch == 0xA6F5 /* BAMUM COMMA */
6332 || ch == 0xA6F6 /* BAMUM SEMICOLON */
6333 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6334 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6335 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6336 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6337 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6338 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6339 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6340 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6341 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6342 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6343 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6344 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6345 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6346 || ch == 0x11047 /* BRAHMI DANDA */
6347 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6348 || ch == 0x110BE /* KAITHI SECTION MARK */
6349 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6350 || ch == 0x110C0 /* KAITHI DANDA */
6351 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6352 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6353 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6354 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
6355 attr |= (int64_t) 1 << LBP_BA;
6357 /* break opportunity before */
6358 if (ch == 0x00B4 /* ACUTE ACCENT */
6359 || ch == 0x1FFD /* GREEK OXIA */
6360 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6361 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6362 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6363 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6364 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6365 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6366 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6367 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6368 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6369 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6370 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6371 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6372 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6373 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6374 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6375 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6376 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
6377 attr |= (int64_t) 1 << LBP_BB;
6380 if (ch == 0x002D /* HYPHEN-MINUS */)
6381 attr |= (int64_t) 1 << LBP_HY;
6383 /* contingent break opportunity */
6384 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6385 attr |= (int64_t) 1 << LBP_CB;
6387 /* closing parenthesis */
6388 if (ch == 0x0029 /* RIGHT PARENTHESIS */
6389 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6390 attr |= (int64_t) 1 << LBP_CP;
6392 /* closing punctuation */
6393 if ((unicode_attributes[ch].category[0] == 'P'
6394 && unicode_attributes[ch].category[1] == 'e'
6395 && !(attr & ((int64_t) 1 << LBP_CP)))
6396 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6397 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6398 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6399 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6400 || ch == 0xFE50 /* SMALL COMMA */
6401 || ch == 0xFE52 /* SMALL FULL STOP */
6402 || ch == 0xFF0C /* FULLWIDTH COMMA */
6403 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6404 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6405 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6406 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6407 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6408 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6409 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6410 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6411 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6412 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6413 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6414 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
6415 attr |= (int64_t) 1 << LBP_CL;
6417 /* exclamation/interrogation */
6418 if (ch == 0x0021 /* EXCLAMATION MARK */
6419 || ch == 0x003F /* QUESTION MARK */
6420 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6421 || ch == 0x061B /* ARABIC SEMICOLON */
6422 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6423 || ch == 0x061F /* ARABIC QUESTION MARK */
6424 || ch == 0x06D4 /* ARABIC FULL STOP */
6425 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6426 || ch == 0x0F0D /* TIBETAN MARK SHAD */
6427 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6428 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6429 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6430 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6431 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6432 || ch == 0x1802 /* MONGOLIAN COMMA */
6433 || ch == 0x1803 /* MONGOLIAN FULL STOP */
6434 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6435 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6436 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6437 || ch == 0x1945 /* LIMBU QUESTION MARK */
6438 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6439 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6440 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6441 || ch == 0x2CFE /* COPTIC FULL STOP */
6442 || ch == 0x2E2E /* REVERSED QUESTION MARK */
6443 || ch == 0xA60E /* VAI FULL STOP */
6444 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6445 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6446 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6447 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6448 || ch == 0xFE56 /* SMALL QUESTION MARK */
6449 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6450 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6451 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
6452 attr |= (int64_t) 1 << LBP_EX;
6455 if (ch == 0x2024 /* ONE DOT LEADER */
6456 || ch == 0x2025 /* TWO DOT LEADER */
6457 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6458 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
6459 attr |= (int64_t) 1 << LBP_IN;
6462 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6463 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6464 || ch == 0x203D /* INTERROBANG */
6465 || ch == 0x2047 /* DOUBLE QUESTION MARK */
6466 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6467 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6468 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6469 || ch == 0x301C /* WAVE DASH */
6470 || ch == 0x303C /* MASU MARK */
6471 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6472 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6473 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6474 || ch == 0x309D /* HIRAGANA ITERATION MARK */
6475 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6476 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6477 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6478 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6479 || ch == 0x30FD /* KATAKANA ITERATION MARK */
6480 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6481 || ch == 0xA015 /* YI SYLLABLE WU */
6482 || ch == 0xFE54 /* SMALL SEMICOLON */
6483 || ch == 0xFE55 /* SMALL COLON */
6484 || ch == 0xFF1A /* FULLWIDTH COLON */
6485 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6486 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6487 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6488 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6489 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6490 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6491 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6492 attr |= (int64_t) 1 << LBP_NS;
6494 /* opening punctuation */
6495 if ((unicode_attributes[ch].category[0] == 'P'
6496 && unicode_attributes[ch].category[1] == 's')
6497 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6498 || ch == 0x00BF /* INVERTED QUESTION MARK */
6499 || ch == 0x2E18 /* INVERTED INTERROBANG */
6500 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6501 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6502 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6503 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6504 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6505 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6506 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
6507 attr |= (int64_t) 1 << LBP_OP;
6509 /* ambiguous quotation */
6510 if ((unicode_attributes[ch].category[0] == 'P'
6511 && (unicode_attributes[ch].category[1] == 'f'
6512 || unicode_attributes[ch].category[1] == 'i'))
6513 || ch == 0x0022 /* QUOTATION MARK */
6514 || ch == 0x0027 /* APOSTROPHE */
6515 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6516 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6517 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6518 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6519 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6520 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6521 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6522 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6523 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6524 || ch == 0x2E0B /* RAISED SQUARE */)
6525 attr |= (int64_t) 1 << LBP_QU;
6527 /* infix separator (numeric) */
6528 if (ch == 0x002C /* COMMA */
6529 || ch == 0x002E /* FULL STOP */
6530 || ch == 0x003A /* COLON */
6531 || ch == 0x003B /* SEMICOLON */
6532 || ch == 0x037E /* GREEK QUESTION MARK */
6533 || ch == 0x0589 /* ARMENIAN FULL STOP */
6534 || ch == 0x060C /* ARABIC COMMA */
6535 || ch == 0x060D /* ARABIC DATE SEPARATOR */
6536 || ch == 0x07F8 /* NKO COMMA */
6537 || ch == 0x2044 /* FRACTION SLASH */
6538 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6539 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6540 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6541 attr |= (int64_t) 1 << LBP_IS;
6544 if ((unicode_attributes[ch].category[0] == 'N'
6545 && unicode_attributes[ch].category[1] == 'd'
6546 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
6547 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
6548 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6549 attr |= (int64_t) 1 << LBP_NU;
6551 /* postfix (numeric) */
6552 if (ch == 0x0025 /* PERCENT SIGN */
6553 || ch == 0x00A2 /* CENT SIGN */
6554 || ch == 0x00B0 /* DEGREE SIGN */
6555 || ch == 0x060B /* AFGHANI SIGN */
6556 || ch == 0x066A /* ARABIC PERCENT SIGN */
6557 || ch == 0x2030 /* PER MILLE SIGN */
6558 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
6559 || ch == 0x2032 /* PRIME */
6560 || ch == 0x2033 /* DOUBLE PRIME */
6561 || ch == 0x2034 /* TRIPLE PRIME */
6562 || ch == 0x2035 /* REVERSED PRIME */
6563 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
6564 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
6565 || ch == 0x20A7 /* PESETA SIGN */
6566 || ch == 0x2103 /* DEGREE CELSIUS */
6567 || ch == 0x2109 /* DEGREE FAHRENHEIT */
6568 || ch == 0xFDFC /* RIAL SIGN */
6569 || ch == 0xFE6A /* SMALL PERCENT SIGN */
6570 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
6571 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
6572 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6573 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6574 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6575 || ch == 0x09F2 /* BENGALI RUPEE MARK */
6576 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
6577 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6578 || ch == 0x0D79 /* MALAYALAM DATE MARK */
6579 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
6580 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
6581 attr |= (int64_t) 1 << LBP_PO;
6583 /* prefix (numeric) */
6584 if ((unicode_attributes[ch].category[0] == 'S'
6585 && unicode_attributes[ch].category[1] == 'c')
6586 || ch == 0x002B /* PLUS SIGN */
6587 || ch == 0x005C /* REVERSE SOLIDUS */
6588 || ch == 0x00B1 /* PLUS-MINUS SIGN */
6589 || ch == 0x2116 /* NUMERO SIGN */
6590 || ch == 0x2212 /* MINUS SIGN */
6591 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
6592 if (!(attr & ((int64_t) 1 << LBP_PO)))
6593 attr |= (int64_t) 1 << LBP_PR;
6595 /* symbols allowing breaks */
6596 if (ch == 0x002F /* SOLIDUS */)
6597 attr |= (int64_t) 1 << LBP_SY;
6599 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
6600 attr |= (int64_t) 1 << LBP_H2;
6602 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
6603 attr |= (int64_t) 1 << LBP_H3;
6605 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
6606 attr |= (int64_t) 1 << LBP_JL;
6608 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
6609 attr |= (int64_t) 1 << LBP_JV;
6611 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
6612 attr |= (int64_t) 1 << LBP_JT;
6614 /* complex context (South East Asian) */
6615 if (((unicode_attributes[ch].category[0] == 'C'
6616 && unicode_attributes[ch].category[1] == 'f')
6617 || (unicode_attributes[ch].category[0] == 'L'
6618 && (unicode_attributes[ch].category[1] == 'm'
6619 || unicode_attributes[ch].category[1] == 'o'))
6620 || (unicode_attributes[ch].category[0] == 'M'
6621 && (unicode_attributes[ch].category[1] == 'c'
6622 || unicode_attributes[ch].category[1] == 'n')
6623 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6624 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6625 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
6626 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6627 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6628 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6629 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6630 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6631 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6632 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
6633 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6634 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6635 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6636 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6637 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6638 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
6639 attr |= (int64_t) 1 << LBP_SA;
6641 /* attached characters and combining marks */
6642 if ((unicode_attributes[ch].category[0] == 'M'
6643 && (unicode_attributes[ch].category[1] == 'c'
6644 || unicode_attributes[ch].category[1] == 'e'
6645 || unicode_attributes[ch].category[1] == 'n'))
6646 || (unicode_attributes[ch].category[0] == 'C'
6647 && (unicode_attributes[ch].category[1] == 'c'
6648 || unicode_attributes[ch].category[1] == 'f')
6649 && ch != 0x110BD /* KAITHI NUMBER SIGN */))
6650 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6651 attr |= (int64_t) 1 << LBP_CM;
6654 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6655 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6656 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6657 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6658 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6659 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6660 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6661 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6662 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6663 || ch == 0xFE62 /* SMALL PLUS SIGN */
6664 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6665 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6666 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6667 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6668 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
6669 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
6670 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
6671 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
6672 || (ch >= 0x3000 && ch <= 0x33FF
6673 && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
6674 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6675 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
6676 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
6677 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
6678 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
6679 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
6680 || ch == 0xFE45 /* SESAME DOT */
6681 || ch == 0xFE46 /* WHITE SESAME DOT */
6682 || ch == 0xFE49 /* DASHED OVERLINE */
6683 || ch == 0xFE4A /* CENTRELINE OVERLINE */
6684 || ch == 0xFE4B /* WAVY OVERLINE */
6685 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
6686 || ch == 0xFE4D /* DASHED LOW LINE */
6687 || ch == 0xFE4E /* CENTRELINE LOW LINE */
6688 || ch == 0xFE4F /* WAVY LOW LINE */
6689 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
6690 || ch == 0xFE58 /* SMALL EM DASH */
6691 || ch == 0xFE5F /* SMALL NUMBER SIGN */
6692 || ch == 0xFE60 /* SMALL AMPERSAND */
6693 || ch == 0xFE61 /* SMALL ASTERISK */
6694 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
6695 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
6696 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
6697 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
6698 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
6699 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
6700 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
6701 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
6702 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
6703 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
6704 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
6705 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
6706 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
6707 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
6708 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
6709 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
6710 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
6711 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
6712 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
6713 || ch == 0xFF5E /* FULLWIDTH TILDE */
6714 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
6715 || ch == 0xFFE3 /* FULLWIDTH MACRON */
6716 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
6717 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6718 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
6719 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
6720 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
6721 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
6722 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
6723 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
6725 /* ambiguous (ideograph) ? */
6726 if ((unicode_width[ch] != NULL
6727 && unicode_width[ch][0] == 'A'
6729 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6730 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
6731 attr |= (int64_t) 1 << LBP_AI;
6733 attr |= (int64_t) 1 << LBP_ID;
6736 /* ordinary alphabetic and symbol characters */
6737 if ((unicode_attributes[ch].category[0] == 'L'
6738 && (unicode_attributes[ch].category[1] == 'u'
6739 || unicode_attributes[ch].category[1] == 'l'
6740 || unicode_attributes[ch].category[1] == 't'
6741 || unicode_attributes[ch].category[1] == 'm'
6742 || unicode_attributes[ch].category[1] == 'o'))
6743 || (unicode_attributes[ch].category[0] == 'S'
6744 && (unicode_attributes[ch].category[1] == 'm'
6745 || unicode_attributes[ch].category[1] == 'k'
6746 || unicode_attributes[ch].category[1] == 'o'))
6747 || (unicode_attributes[ch].category[0] == 'N'
6748 && (unicode_attributes[ch].category[1] == 'l'
6749 || unicode_attributes[ch].category[1] == 'o'))
6750 || (unicode_attributes[ch].category[0] == 'P'
6751 && (unicode_attributes[ch].category[1] == 'c'
6752 || unicode_attributes[ch].category[1] == 'd'
6753 || unicode_attributes[ch].category[1] == 'o'))
6754 || ch == 0x0600 /* ARABIC NUMBER SIGN */
6755 || ch == 0x0601 /* ARABIC SIGN SANAH */
6756 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
6757 || ch == 0x0603 /* ARABIC SIGN SAFHA */
6758 || ch == 0x06DD /* ARABIC END OF AYAH */
6759 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
6760 || ch == 0x2061 /* FUNCTION APPLICATION */
6761 || ch == 0x2062 /* INVISIBLE TIMES */
6762 || ch == 0x2063 /* INVISIBLE SEPARATOR */
6763 || ch == 0x2064 /* INVISIBLE PLUS */
6764 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6765 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
6766 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
6768 /* ambiguous (alphabetic) ? */
6769 if ((unicode_width[ch] != NULL
6770 && unicode_width[ch][0] == 'A'
6772 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
6773 && ch != 0x2022 /* BULLET */
6774 && ch != 0x203E /* OVERLINE */
6775 && ch != 0x2126 /* OHM SIGN */
6776 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
6777 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
6778 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
6779 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
6780 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
6781 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
6782 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
6783 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
6784 || ch == 0x00A7 /* SECTION SIGN */
6785 || ch == 0x00A8 /* DIAERESIS */
6786 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
6787 || ch == 0x00B2 /* SUPERSCRIPT TWO */
6788 || ch == 0x00B3 /* SUPERSCRIPT THREE */
6789 || ch == 0x00B6 /* PILCROW SIGN */
6790 || ch == 0x00B7 /* MIDDLE DOT */
6791 || ch == 0x00B8 /* CEDILLA */
6792 || ch == 0x00B9 /* SUPERSCRIPT ONE */
6793 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
6794 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
6795 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
6796 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
6797 || ch == 0x00D7 /* MULTIPLICATION SIGN */
6798 || ch == 0x00F7 /* DIVISION SIGN */
6799 || ch == 0x02C7 /* CARON */
6800 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
6801 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
6802 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
6803 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
6804 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
6805 || ch == 0x02D8 /* BREVE */
6806 || ch == 0x02D9 /* DOT ABOVE */
6807 || ch == 0x02DA /* RING ABOVE */
6808 || ch == 0x02DB /* OGONEK */
6809 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
6810 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6811 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
6812 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6813 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
6814 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
6815 || ch == 0x2616 /* WHITE SHOGI PIECE */
6816 || ch == 0x2617 /* BLACK SHOGI PIECE */)
6817 attr |= (int64_t) 1 << LBP_AI;
6819 attr |= (int64_t) 1 << LBP_AL;
6820 attr &= ~((int64_t) 1 << LBP_CM);
6825 /* Unassigned character. */
6826 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
6827 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
6828 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
6829 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
6830 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
6831 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6832 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
6833 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6834 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
6835 attr |= (int64_t) 1 << LBP_ID;
6840 attr |= (int64_t) 1 << LBP_XX;
6845 /* Output the line breaking properties in a human readable format. */
/* Walks every code point below 0x110000 and asks get_lbp () for its
   attribute bit mask.  For a mask different from the lone LBP_XX bit the
   code point is printed to STREAM in the 0x%04X format; the PRINT_BIT uses
   then append a space and the LBP_ name of each bit that is set, and a
   newline ends the record.
   NOTE(review): brace-only lines are not visible here, so the extent of
   the body guarded by the LBP_XX test is assumed, not confirmed.  */
6847 debug_output_lbp (FILE *stream)
6851 for (i = 0; i < 0x110000; i++)
6853 int64_t attr = get_lbp (i);
6854 if (attr != (int64_t) 1 << LBP_XX)
6856 fprintf (stream, "0x%04X", i);
/* PRINT_BIT tests one bit of the mask.  The # operator stringizes the
   second argument, so the text written is the identifier itself preceded
   by a space.  The expansion is a bare if statement that already ends in a
   semicolon, so it is only safe when used as a statement of its own.  */
6857 #define PRINT_BIT(attr,bit) \
6858 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
6859 PRINT_BIT(attr,LBP_BK);
6860 PRINT_BIT(attr,LBP_CM);
6861 PRINT_BIT(attr,LBP_WJ);
6862 PRINT_BIT(attr,LBP_ZW);
6863 PRINT_BIT(attr,LBP_GL);
6864 PRINT_BIT(attr,LBP_SP);
6865 PRINT_BIT(attr,LBP_B2);
6866 PRINT_BIT(attr,LBP_BA);
6867 PRINT_BIT(attr,LBP_BB);
6868 PRINT_BIT(attr,LBP_HY);
6869 PRINT_BIT(attr,LBP_CB);
6870 PRINT_BIT(attr,LBP_CL);
6871 PRINT_BIT(attr,LBP_CP);
6872 PRINT_BIT(attr,LBP_EX);
6873 PRINT_BIT(attr,LBP_IN);
6874 PRINT_BIT(attr,LBP_NS);
6875 PRINT_BIT(attr,LBP_OP);
6876 PRINT_BIT(attr,LBP_QU);
6877 PRINT_BIT(attr,LBP_IS);
6878 PRINT_BIT(attr,LBP_NU);
6879 PRINT_BIT(attr,LBP_PO);
6880 PRINT_BIT(attr,LBP_PR);
6881 PRINT_BIT(attr,LBP_SY);
6882 PRINT_BIT(attr,LBP_AI);
6883 PRINT_BIT(attr,LBP_AL);
6884 PRINT_BIT(attr,LBP_H2);
6885 PRINT_BIT(attr,LBP_H3);
6886 PRINT_BIT(attr,LBP_ID);
6887 PRINT_BIT(attr,LBP_JL);
6888 PRINT_BIT(attr,LBP_JV);
6889 PRINT_BIT(attr,LBP_JT);
6890 PRINT_BIT(attr,LBP_SA);
6891 PRINT_BIT(attr,LBP_XX);
/* Terminates the record of the current code point.  */
6893 fprintf (stream, "\n");
/* Writes the debugging dump produced by debug_output_lbp () to the file
   FILENAME, which is opened in write mode.  Failures are reported on
   stderr.  */
6899 debug_output_lbrk_tables (const char *filename)
6903 stream = fopen (filename, "w");
/* Diagnostic for an fopen failure, judging by its text; the test that
   guards it is not visible here.  */
6906 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6910 debug_output_lbp (stream);
/* fclose is only attempted when ferror reports no earlier stream error; a
   nonzero result from either call leads to the diagnostic below.  */
6912 if (ferror (stream) || fclose (stream))
6914 fprintf (stderr, "error writing to '%s'\n", filename);
6919 /* The line breaking property from the LineBreak.txt file. */
/* One entry per code point (0x110000 entries).  Each entry holds a single
   LBP_ value rather than a bit mask: fill_org_lbp () stores LBP_XX or a
   parsed value, and debug_output_org_lbp () compares entries with ==.  */
6920 int unicode_org_lbp[0x110000];
6922 /* Stores in unicode_org_lbp[] the line breaking property from the
6923 LineBreak.txt file. */
/* LINEBREAK_FILENAME is opened for reading.  Each data line supplies a code
   point or a START..END range in field0 and a property abbreviation in
   field1; the abbreviation is translated to an LBP_ value and stored in
   unicode_org_lbp[].  Problems are reported on stderr.  */
6925 fill_org_lbp (const char *linebreak_filename)
6929 char field0[FIELDLEN];
6930 char field1[FIELDLEN];
6931 char field2[FIELDLEN];
/* Start from LBP_XX everywhere, so that code points the file never assigns
   keep that value.  */
6934 for (i = 0; i < 0x110000; i++)
6935 unicode_org_lbp[i] = LBP_XX;
6937 stream = fopen (linebreak_filename, "r");
6940 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume input up to and including the next newline, stopping early at
   EOF.  */
6956 do c = getc (stream); while (c != EOF && c != '\n');
/* Split the line: field0 ends at the semicolon, field1 at the following
   space, field2 at the end of the line.  n sums the getfield results and
   presumably feeds the short line check below; getfield is defined
   elsewhere -- TODO confirm its return convention there.  */
6960 n = getfield (stream, field0, ';');
6961 n += getfield (stream, field1, ' ');
6962 n += getfield (stream, field2, '\n');
6967 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY (bit) continues an if / else-if chain.  It compares field1 with the
   stringized argument advanced by four characters; for an identifier of
   the form LBP_xy that skips the LBP_ prefix and leaves xy.  On a match
   the constant itself becomes the value.  */
6971 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* These four abbreviations are mapped onto other values: LF, CR and NL
   become LBP_BK, and SG becomes LBP_XX.  */
7007 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7008 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7009 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7010 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7013 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7014 field1, linebreak_filename, lineno);
/* field0 starts with the code point in hexadecimal.  When it contains two
   consecutive dots, the number after them is parsed as j, the other end of
   a range.
   NOTE(review): no check of i or j against the 0x110000 array bound is
   visible here -- confirm the input is trusted or add one.  */
7017 i = strtoul (field0, NULL, 16);
7018 if (strstr (field0, "..") != NULL)
7020 /* Deal with a range. */
7021 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7023 unicode_org_lbp[i] = value;
7027 /* Single character line. */
7028 unicode_org_lbp[i] = value;
/* fclose is only attempted when ferror reports no earlier stream error; a
   nonzero result from either call leads to the diagnostic below.  */
7032 if (ferror (stream) || fclose (stream))
7034 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7039 /* Output the line breaking properties in a human readable format. */
/* Counterpart of debug_output_lbp for the values read from LineBreak.txt:
   loops over all code points below 0x110000 and prints to STREAM the code
   point in the 0x%04X format, the matching LBP_ name, and a newline.
   NOTE(review): any filtering of entries would be on lines that are not
   visible here.  */
7041 debug_output_org_lbp (FILE *stream)
7045 for (i = 0; i < 0x110000; i++)
7047 int attr = unicode_org_lbp[i];
7050 fprintf (stream, "0x%04X", i);
/* Here attr is one LBP_ value, not a bit mask, so this PRINT_BIT tests for
   equality.  At most one of the uses below prints a name, assuming the
   LBP_ constants are pairwise distinct.  The # operator stringizes the
   second argument, so the identifier itself is what gets printed.  */
7051 #define PRINT_BIT(attr,bit) \
7052 if (attr == bit) fprintf (stream, " " #bit);
7053 PRINT_BIT(attr,LBP_BK);
7054 PRINT_BIT(attr,LBP_CM);
7055 PRINT_BIT(attr,LBP_WJ);
7056 PRINT_BIT(attr,LBP_ZW);
7057 PRINT_BIT(attr,LBP_GL);
7058 PRINT_BIT(attr,LBP_SP);
7059 PRINT_BIT(attr,LBP_B2);
7060 PRINT_BIT(attr,LBP_BA);
7061 PRINT_BIT(attr,LBP_BB);
7062 PRINT_BIT(attr,LBP_HY);
7063 PRINT_BIT(attr,LBP_CB);
7064 PRINT_BIT(attr,LBP_CL);
7065 PRINT_BIT(attr,LBP_CP);
7066 PRINT_BIT(attr,LBP_EX);
7067 PRINT_BIT(attr,LBP_IN);
7068 PRINT_BIT(attr,LBP_NS);
7069 PRINT_BIT(attr,LBP_OP);
7070 PRINT_BIT(attr,LBP_QU);
7071 PRINT_BIT(attr,LBP_IS);
7072 PRINT_BIT(attr,LBP_NU);
7073 PRINT_BIT(attr,LBP_PO);
7074 PRINT_BIT(attr,LBP_PR);
7075 PRINT_BIT(attr,LBP_SY);
7076 PRINT_BIT(attr,LBP_AI);
7077 PRINT_BIT(attr,LBP_AL);
7078 PRINT_BIT(attr,LBP_H2);
7079 PRINT_BIT(attr,LBP_H3);
7080 PRINT_BIT(attr,LBP_ID);
7081 PRINT_BIT(attr,LBP_JL);
7082 PRINT_BIT(attr,LBP_JV);
7083 PRINT_BIT(attr,LBP_JT);
7084 PRINT_BIT(attr,LBP_SA);
7085 PRINT_BIT(attr,LBP_XX);
/* Terminates the record of the current code point.  */
7087 fprintf (stream, "\n");
/* Opens FILENAME for writing, dumps the contents of unicode_org_lbp[] into
   it through debug_output_org_lbp, and reports open and write failures on
   stderr.  */
7093 debug_output_org_lbrk_tables (const char *filename)
7097 stream = fopen (filename, "w");
7100 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7104 debug_output_org_lbp (stream);
/* fclose is part of the test so that a failure while closing the file is
   reported as a write error too.  */
7106 if (ferror (stream) || fclose (stream))
7108 fprintf (stderr, "error writing to '%s'\n", filename);
7113 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_lbp (lbp_table_init,
   lbp_table_add, lbp_table_finalize): the table is named lbp_table, its
   elements are unsigned char, and LBP_XX is the default element value.
   xmalloc and xrealloc are mapped onto plain malloc and realloc.
   NOTE(review): presumably consumed by a template header included right
   after these definitions - the include is not visible here.  */
7114 #define TABLE lbp_table
7115 #define ELEMENT unsigned char
7116 #define DEFAULT LBP_XX
7117 #define xmalloc malloc
7118 #define xrealloc realloc
/* Builds the 3-level lookup table of line breaking properties from get_lbp
   and writes it as C source text.  STREAM1 receives the lbrkprop_header_N
   macros, the lbrkprop_t type and the extern declaration of unilbrkprop;
   STREAM2 receives the definition of unilbrkprop with its three arrays.  */
7122 output_lbp (FILE *stream1, FILE *stream2)
7126 unsigned int level1_offset, level2_offset, level3_offset;
7130 lbp_table_init (&t);
7132 for (i = 0; i < 0x110000; i++)
7134 int64_t attr = get_lbp (i);
7136 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the test below is true
   when attr has no bit or more than one bit set.  */
7137 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is LBP_XX are not added to the table; LBP_XX
   is the DEFAULT element configured for lbp_table.  */
7140 if (attr != (int64_t) 1 << LBP_XX)
7142 unsigned int log2_attr;
/* Convert the single-bit mask into the index of that bit.  */
7143 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7145 lbp_table_add (&t, i, log2_attr);
7149 lbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five uint32_t
   header words, then level1_size uint32_t entries, then
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset in that order.  */
7152 5 * sizeof (uint32_t);
7154 5 * sizeof (uint32_t)
7155 + t.level1_size * sizeof (uint32_t);
7157 5 * sizeof (uint32_t)
7158 + t.level1_size * sizeof (uint32_t)
7159 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words of t.result as preprocessor constants.  */
7161 for (i = 0; i < 5; i++)
7162 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7163 ((uint32_t *) t.result)[i]);
7164 fprintf (stream1, "\n");
7165 fprintf (stream1, "typedef struct\n");
7166 fprintf (stream1, " {\n");
7167 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7168 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7169 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7170 fprintf (stream1, " }\n");
7171 fprintf (stream1, "lbrkprop_t;\n");
7172 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7174 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7175 fprintf (stream2, "{\n");
/* Level 1: eight entries per output line.  */
7176 fprintf (stream2, " {");
7177 if (t.level1_size > 8)
7178 fprintf (stream2, "\n ");
7179 for (i = 0; i < t.level1_size; i++)
7182 if (i > 0 && (i % 8) == 0)
7183 fprintf (stream2, "\n ");
7184 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an index relative to the start of
   level 2, counted in uint32_t units.
   NOTE(review): the condition that selects -1 is not visible here.  */
7186 fprintf (stream2, " %5d", -1);
7188 fprintf (stream2, " %5zu",
7189 (offset - level2_offset) / sizeof (uint32_t));
7190 if (i+1 < t.level1_size)
7191 fprintf (stream2, ",");
7193 if (t.level1_size > 8)
7194 fprintf (stream2, "\n ");
7195 fprintf (stream2, " },\n");
/* Level 2: same layout; indices are relative to the start of level 3 and
   counted in unsigned char units.  */
7196 fprintf (stream2, " {");
7197 if (t.level2_size << t.q > 8)
7198 fprintf (stream2, "\n ");
7199 for (i = 0; i < t.level2_size << t.q; i++)
7202 if (i > 0 && (i % 8) == 0)
7203 fprintf (stream2, "\n ");
7204 offset = ((uint32_t *) (t.result + level2_offset))[i];
7206 fprintf (stream2, " %5d", -1);
7208 fprintf (stream2, " %5zu",
7209 (offset - level3_offset) / sizeof (unsigned char));
7210 if (i+1 < t.level2_size << t.q)
7211 fprintf (stream2, ",");
7213 if (t.level2_size << t.q > 8)
7214 fprintf (stream2, "\n ");
7215 fprintf (stream2, " },\n");
/* Level 3: the element values, written by name rather than by number.  */
7216 fprintf (stream2, " {");
7217 if (t.level3_size << t.p > 8)
7218 fprintf (stream2, "\n ");
7219 for (i = 0; i < t.level3_size << t.p; i++)
7221 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7222 const char *value_string;
/* CASE maps a constant to its own spelling through the # operator.  */
7225 #define CASE(x) case x: value_string = #x; break;
7263 if (i > 0 && (i % 8) == 0)
7264 fprintf (stream2, "\n ");
7265 fprintf (stream2, " %s%s", value_string,
7266 (i+1 < t.level3_size << t.p ? "," : ""));
7268 if (t.level3_size << t.p > 8)
7269 fprintf (stream2, "\n ");
7270 fprintf (stream2, " }\n");
7271 fprintf (stream2, "};\n");
/* Creates the two generated source files for the line breaking tables.
   Both FILENAME1 and FILENAME2 are opened for writing and receive the same
   banner (including VERSION, presumably as the Unicode version in the
   third banner line) and GPL notice; then output_lbp writes the
   declarations to the first file and the table definition to the second.
   Open and write failures are reported on stderr.  */
7275 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
7277 const char *filenames[2];
7281 filenames[0] = filename1;
7282 filenames[1] = filename2;
/* Open both output files before anything is written.  */
7284 for (i = 0; i < 2; i++)
7286 streams[i] = fopen (filenames[i], "w");
7287 if (streams[i] == NULL)
7289 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the identical banner and license text to each file.  */
7294 for (i = 0; i < 2; i++)
7296 FILE *stream = streams[i];
7298 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7299 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
7300 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7302 fprintf (stream, "\n");
7304 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7305 still carries the GPL header), and it's gnulib-tool which replaces the
7306 GPL header with an LGPL header. */
7307 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
7308 fprintf (stream, "\n");
7309 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7310 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7311 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7312 fprintf (stream, " (at your option) any later version.\n");
7313 fprintf (stream, "\n");
7314 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7315 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7316 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7317 fprintf (stream, " GNU General Public License for more details.\n");
7318 fprintf (stream, "\n");
7319 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7320 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7321 fprintf (stream, "\n");
/* streams[0] gets the declarations, streams[1] the table data.  */
7324 output_lbp (streams[0], streams[1]);
/* Check each file for write errors, including errors reported by fclose.  */
7326 for (i = 0; i < 2; i++)
7328 if (ferror (streams[i]) || fclose (streams[i]))
7330 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7336 /* ========================================================================= */
7338 /* Word break property.
7339 Updated for Unicode TR #29 revision 17. */
7341 /* Possible values of the Word_Break property. */
/* The WBP_* constants are used in two ways in this file: as bit positions
   (1 << WBP_x) in the mask returned by get_wbp, and as plain values stored
   in unicode_org_wbp[].  */
7356 WBP_EXTENDNUMLET = 7
7359 /* Returns the word breaking property for ch, as a bit mask. */
/* Bit WBP_x of the result is set when CH is classified as x.  The
   classification is computed from unicode_attributes[],
   unicode_properties[], unicode_scripts[] and get_lbp; unicode_org_wbp[]
   is not consulted here.  */
7361 get_wbp (unsigned int ch)
/* NOTE(review): presumably the tests below apply only to code points that
   have a name entry; the extent of this if is not visible here.  */
7365 if (unicode_attributes[ch].name != NULL)
7368 attr |= 1 << WBP_CR;
7371 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (one more line of this condition
   is not visible here).  */
7373 if (ch == 0x000B || ch == 0x000C
7375 || ch == 0x2028 || ch == 0x2029)
7376 attr |= 1 << WBP_NEWLINE;
/* Extend: has the PROP_GRAPHEME_EXTEND property or general category Mc.  */
7378 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
7379 || (unicode_attributes[ch].category != NULL
7380 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
7381 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200B, U+200C and U+200D.  */
7383 if (unicode_attributes[ch].category != NULL
7384 && strcmp (unicode_attributes[ch].category, "Cf") == 0
7385 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
7386 attr |= 1 << WBP_FORMAT;
/* Katakana: script Katakana, plus the individually listed code points.  */
7388 if ((unicode_scripts[ch] < numscripts
7389 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
7390 || (ch >= 0x3031 && ch <= 0x3035)
7391 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
7393 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic (or a further alternative on a line not visible
   here), but not ideographic, not already Katakana, not line break class
   SA, not script Hiragana and not already Extend.  This relies on the
   Katakana and Extend bits having been computed above.  */
7395 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
7397 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
7398 && (attr & (1 << WBP_KATAKANA)) == 0
7399 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
7400 && !(unicode_scripts[ch] < numscripts
7401 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
7402 && (attr & (1 << WBP_EXTEND)) == 0)
7403 attr |= 1 << WBP_ALETTER;
7405 if (is_WBP_MIDNUMLET (ch))
7406 attr |= 1 << WBP_MIDNUMLET;
7408 if (is_WBP_MIDLETTER (ch))
7409 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or one of the listed code points, except
   U+003A, U+FE13 and U+002E.  */
7411 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
7412 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
7414 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
7415 attr |= 1 << WBP_MIDNUM;
/* Numeric: line break class NU (the rest of the condition is not visible
   here).  */
7417 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
7419 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
7421 if (unicode_attributes[ch].category != NULL
7422 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
7423 attr |= 1 << WBP_EXTENDNUMLET;
/* NOTE(review): presumably the fallback when no other bit was set; the
   guarding condition is not visible here.  */
7428 attr |= 1 << WBP_OTHER;
7433 /* Output the word break property in a human readable format. */
/* For every code point 0 .. 0x10FFFF whose get_wbp mask differs from
   exactly 1 << WBP_OTHER, writes the code point as 0x%04X followed by the
   name of each bit that is set, on one line of STREAM.  */
7435 debug_output_wbp (FILE *stream)
7439 for (i = 0; i < 0x110000; i++)
7441 int attr = get_wbp (i);
7442 if (attr != 1 << WBP_OTHER)
7444 fprintf (stream, "0x%04X", i);
/* These are independent bit tests, so a code point that got several
   classifications shows all of them.  */
7445 if (attr & (1 << WBP_CR))
7446 fprintf (stream, " CR");
7447 if (attr & (1 << WBP_LF))
7448 fprintf (stream, " LF");
7449 if (attr & (1 << WBP_NEWLINE))
7450 fprintf (stream, " Newline");
7451 if (attr & (1 << WBP_EXTEND))
7452 fprintf (stream, " Extend");
7453 if (attr & (1 << WBP_FORMAT))
7454 fprintf (stream, " Format");
7455 if (attr & (1 << WBP_KATAKANA))
7456 fprintf (stream, " Katakana");
7457 if (attr & (1 << WBP_ALETTER))
7458 fprintf (stream, " ALetter");
7459 if (attr & (1 << WBP_MIDNUMLET))
7460 fprintf (stream, " MidNumLet");
7461 if (attr & (1 << WBP_MIDLETTER))
7462 fprintf (stream, " MidLetter");
7463 if (attr & (1 << WBP_MIDNUM))
7464 fprintf (stream, " MidNum");
7465 if (attr & (1 << WBP_NUMERIC))
7466 fprintf (stream, " Numeric");
7467 if (attr & (1 << WBP_EXTENDNUMLET))
7468 fprintf (stream, " ExtendNumLet");
7469 fprintf (stream, "\n");
/* Opens FILENAME for writing, writes the computed word break properties
   into it through debug_output_wbp, and reports open and write failures on
   stderr.  */
7475 debug_output_wbrk_tables (const char *filename)
7479 stream = fopen (filename, "w");
7482 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7486 debug_output_wbp (stream);
/* A failing fclose counts as a write error as well.  */
7488 if (ferror (stream) || fclose (stream))
7490 fprintf (stderr, "error writing to '%s'\n", filename);
7495 /* The word break property from the WordBreakProperty.txt file. */
/* Indexed by code point.  Each element is a single WBP_* value (not a bit
   mask); fill_org_wbp initializes every element to WBP_OTHER.  */
7496 int unicode_org_wbp[0x110000];
7498 /* Stores in unicode_org_wbp[] the word break property from the
7499 WordBreakProperty.txt file. */
/* Every element is first set to WBP_OTHER.  The file is then read line by
   line; each data line names a code point or a range and a property value,
   which is stored for all code points of the range.  Problems are reported
   on stderr.  */
7501 fill_org_wbp (const char *wordbreakproperty_filename)
7506 for (i = 0; i < 0x110000; i++)
7507 unicode_org_wbp[i] = WBP_OTHER;
7509 stream = fopen (wordbreakproperty_filename, "r");
7512 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
7519 unsigned int i1, i2;
7520 char padding[200+1];
7521 char propname[200+1];
/* Read at most 200 characters of one line into buf; the trailing \n in the
   format then consumes the newline and any following white space, which
   also swallows blank lines.  A result below 1 means nothing was read.  */
7524 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and comment lines (presumably skipped; the action is not
   visible here).  */
7527 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form XXXX..YYYY, then a single code point.  In both
   forms a run of spaces and semicolons goes into padding and the following
   word, up to the next space, into propname.
   NOTE(review): presumably i2 is set to i1 for the single form - not
   visible here.  */
7530 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7532 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7534 fprintf (stderr, "parse error in '%s'\n",
7535 wordbreakproperty_filename);
/* Each PROP expansion ends in a dangling else, so the entries below form
   one if / else-if chain whose final else is the error report after the
   list.  */
7540 #define PROP(name,value) \
7541 if (strcmp (propname, name) == 0) propvalue = value; else
7544 PROP ("Newline", WBP_NEWLINE)
7545 PROP ("Extend", WBP_EXTEND)
7546 PROP ("Format", WBP_FORMAT)
7547 PROP ("Katakana", WBP_KATAKANA)
7548 PROP ("ALetter", WBP_ALETTER)
7549 PROP ("MidNumLet", WBP_MIDNUMLET)
7550 PROP ("MidLetter", WBP_MIDLETTER)
7551 PROP ("MidNum", WBP_MIDNUM)
7552 PROP ("Numeric", WBP_NUMERIC)
7553 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7556 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
7557 wordbreakproperty_filename);
/* Sanity check: the range must be ordered and lie within 0 .. 0x10FFFF
   before it is used to index unicode_org_wbp[].  */
7560 if (!(i1 <= i2 && i2 < 0x110000))
7563 for (i = i1; i <= i2; i++)
7564 unicode_org_wbp[i] = propvalue;
7567 if (ferror (stream) || fclose (stream))
7569 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
7574 /* Output the word break property in a human readable format. */
/* This variant prints the values stored in unicode_org_wbp[].  For every
   code point whose value is not WBP_OTHER it writes the code point as
   0x%04X and the name of the value, or ?? when the value matches none of
   the names, on one line of STREAM.  */
7576 debug_output_org_wbp (FILE *stream)
7580 for (i = 0; i < 0x110000; i++)
7582 int propvalue = unicode_org_wbp[i];
7583 if (propvalue != WBP_OTHER)
7585 fprintf (stream, "0x%04X", i);
/* Equality tests chained through the dangling else of PROP; the fprintf of
   ?? after the list is the final else branch.  */
7586 #define PROP(name,value) \
7587 if (propvalue == value) fprintf (stream, " " name); else
7590 PROP ("Newline", WBP_NEWLINE)
7591 PROP ("Extend", WBP_EXTEND)
7592 PROP ("Format", WBP_FORMAT)
7593 PROP ("Katakana", WBP_KATAKANA)
7594 PROP ("ALetter", WBP_ALETTER)
7595 PROP ("MidNumLet", WBP_MIDNUMLET)
7596 PROP ("MidLetter", WBP_MIDLETTER)
7597 PROP ("MidNum", WBP_MIDNUM)
7598 PROP ("Numeric", WBP_NUMERIC)
7599 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7601 fprintf (stream, " ??");
7602 fprintf (stream, "\n");
/* Opens FILENAME for writing, dumps the contents of unicode_org_wbp[] into
   it through debug_output_org_wbp, and reports open and write failures on
   stderr.  */
7608 debug_output_org_wbrk_tables (const char *filename)
7612 stream = fopen (filename, "w");
7615 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7619 debug_output_org_wbp (stream);
/* A failing fclose counts as a write error as well.  */
7621 if (ferror (stream) || fclose (stream))
7623 fprintf (stderr, "error writing to '%s'\n", filename);
7628 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_wbp (wbp_table_init,
   wbp_table_add, wbp_table_finalize): the table is named wbp_table, its
   elements are unsigned char, and WBP_OTHER is the default element value.
   xmalloc and xrealloc are mapped onto plain malloc and realloc.
   NOTE(review): presumably consumed by a template header included right
   after these definitions - the include is not visible here.  */
7629 #define TABLE wbp_table
7630 #define ELEMENT unsigned char
7631 #define DEFAULT WBP_OTHER
7632 #define xmalloc malloc
7633 #define xrealloc realloc
/* Builds the 3-level lookup table of word break properties from get_wbp
   and writes it to STREAM as C source text: the wbrkprop_header_N macros,
   the wbrkprop_t type, and the static const table uniwbrkprop.  */
7637 output_wbp (FILE *stream)
7641 unsigned int level1_offset, level2_offset, level3_offset;
7645 wbp_table_init (&t);
7647 for (i = 0; i < 0x110000; i++)
7649 int attr = get_wbp (i);
7651 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the test below is true
   when attr has no bit or more than one bit set.  */
7652 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points classified only as WBP_OTHER are not added; WBP_OTHER is the
   DEFAULT element configured for wbp_table.  */
7655 if (attr != 1 << WBP_OTHER)
7657 unsigned int log2_attr;
/* Convert the single-bit mask into the index of that bit, which is the
   WBP_* value stored in the table.  */
7658 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7660 wbp_table_add (&t, i, log2_attr);
7664 wbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five uint32_t
   header words, then level1_size uint32_t entries, then
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset in that order.  */
7667 5 * sizeof (uint32_t);
7669 5 * sizeof (uint32_t)
7670 + t.level1_size * sizeof (uint32_t);
7672 5 * sizeof (uint32_t)
7673 + t.level1_size * sizeof (uint32_t)
7674 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words of t.result as preprocessor constants.  */
7676 for (i = 0; i < 5; i++)
7677 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
7678 ((uint32_t *) t.result)[i]);
7679 fprintf (stream, "\n");
7680 fprintf (stream, "typedef struct\n");
7681 fprintf (stream, " {\n");
7682 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7683 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
7684 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7685 fprintf (stream, " }\n");
7686 fprintf (stream, "wbrkprop_t;\n");
7687 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
7688 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  */
7689 fprintf (stream, " {");
7690 if (t.level1_size > 8)
7691 fprintf (stream, "\n ");
7692 for (i = 0; i < t.level1_size; i++)
7695 if (i > 0 && (i % 8) == 0)
7696 fprintf (stream, "\n ");
7697 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an index relative to the start of
   level 2, counted in uint32_t units.
   NOTE(review): the condition that selects -1 is not visible here.  */
7699 fprintf (stream, " %5d", -1);
7701 fprintf (stream, " %5zu",
7702 (offset - level2_offset) / sizeof (uint32_t));
7703 if (i+1 < t.level1_size)
7704 fprintf (stream, ",");
7706 if (t.level1_size > 8)
7707 fprintf (stream, "\n ");
7708 fprintf (stream, " },\n");
/* Level 2: same layout; indices are relative to the start of level 3 and
   counted in unsigned char units.  */
7709 fprintf (stream, " {");
7710 if (t.level2_size << t.q > 8)
7711 fprintf (stream, "\n ");
7712 for (i = 0; i < t.level2_size << t.q; i++)
7715 if (i > 0 && (i % 8) == 0)
7716 fprintf (stream, "\n ");
7717 offset = ((uint32_t *) (t.result + level2_offset))[i];
7719 fprintf (stream, " %5d", -1);
7721 fprintf (stream, " %5zu",
7722 (offset - level3_offset) / sizeof (unsigned char));
7723 if (i+1 < t.level2_size << t.q)
7724 fprintf (stream, ",");
7726 if (t.level2_size << t.q > 8)
7727 fprintf (stream, "\n ");
7728 fprintf (stream, " },\n");
/* Level 3: element values written by name, four per output line (the
   other two levels use eight).  */
7729 fprintf (stream, " {");
7730 if (t.level3_size << t.p > 4)
7731 fprintf (stream, "\n ");
7732 for (i = 0; i < t.level3_size << t.p; i++)
7734 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7735 const char *value_string;
/* CASE maps a constant to its own spelling through the # operator.  */
7738 #define CASE(x) case x: value_string = #x; break;
7747 CASE(WBP_MIDNUMLET);
7748 CASE(WBP_MIDLETTER);
7751 CASE(WBP_EXTENDNUMLET);
7756 if (i > 0 && (i % 4) == 0)
7757 fprintf (stream, "\n ");
7758 fprintf (stream, " %s%s", value_string,
7759 (i+1 < t.level3_size << t.p ? "," : ""));
7761 if (t.level3_size << t.p > 4)
7762 fprintf (stream, "\n ");
7763 fprintf (stream, " }\n");
7764 fprintf (stream, "};\n");
/* Creates the generated source file for the word break table: opens
   FILENAME for writing, writes the banner (including VERSION, presumably
   as the Unicode version in the third banner line) and the GPL notice,
   then lets output_wbp write the table.  Open and write failures are
   reported on stderr.  */
7768 output_wbrk_tables (const char *filename, const char *version)
7772 stream = fopen (filename, "w");
7775 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7779 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
/* NOTE(review): the banner below says Line breaking although this function
   emits the word break table (it calls output_wbp) - looks like a
   copy/paste leftover from output_lbrk_tables; confirm before changing the
   generated text.  */
7780 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
7781 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7783 fprintf (stream, "\n");
7785 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7786 still carries the GPL header), and it's gnulib-tool which replaces the
7787 GPL header with an LGPL header. */
7788 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
7789 fprintf (stream, "\n");
7790 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7791 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7792 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7793 fprintf (stream, " (at your option) any later version.\n");
7794 fprintf (stream, "\n");
7795 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7796 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7797 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7798 fprintf (stream, " GNU General Public License for more details.\n");
7799 fprintf (stream, "\n");
7800 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7801 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7802 fprintf (stream, "\n");
7804 output_wbp (stream);
/* A failing fclose counts as a write error as well.  */
7806 if (ferror (stream) || fclose (stream))
7808 fprintf (stderr, "error writing to '%s'\n", filename);
7813 /* ========================================================================= */
7815 /* Grapheme break property.
7816 Updated for Unicode TR #29 revision 17. */
7818 /* Possible values of the Grapheme_Cluster_Break property. */
/* The GBP_* constants are stored directly, one value per code point, in
   unicode_org_gbp[].  output_gbp_table packs two of them into one byte
   (4 bits each), so the values have to stay below 16.  */
7827 GBP_SPACINGMARK = 6,
7835 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_gbp_table (gbp_table_init,
   gbp_table_add, gbp_table_finalize): the table is named gbp_table, its
   elements are unsigned char, and GBP_OTHER is the default element value.
   xmalloc and xrealloc are mapped onto plain malloc and realloc.
   NOTE(review): presumably consumed by a template header included right
   after these definitions - the include is not visible here.  */
7836 #define TABLE gbp_table
7837 #define ELEMENT unsigned char
7838 #define DEFAULT GBP_OTHER
7839 #define xmalloc malloc
7840 #define xrealloc realloc
7843 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* Indexed by code point.  Each element is a single GBP_* value;
   fill_org_gbp initializes every element to GBP_OTHER.  */
7844 int unicode_org_gbp[0x110000];
7846 /* Output the unit test data for the grapheme break property. */
/* Writes FILENAME as a generated file: a banner and GPL notice, followed by
   a comma separated list of { code point, GBP_* name } entries derived from
   unicode_org_gbp[], one entry per run of equal values.  Open and write
   failures are reported on stderr.  */
7848 output_gbp_test (const char *filename)
7854 stream = fopen (filename, "w");
7857 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7861 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7862 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
7863 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
7864 fprintf (stream, "\n");
7865 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7866 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7867 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7868 fprintf (stream, " (at your option) any later version.\n");
7869 fprintf (stream, "\n");
7870 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7871 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7872 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7873 fprintf (stream, " GNU General Public License for more details.\n");
7874 fprintf (stream, "\n");
7875 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7876 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7877 fprintf (stream, "\n");
7880 for (ch = 0; ch < 0x110000; ch++)
7882 int gbp = unicode_org_gbp[ch];
7883 const char *gbp_string;
/* Extend over the run of consecutive code points that share this value
   (presumably by advancing ch; the loop body is not visible here).  */
7885 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
/* CASE maps a constant to its own spelling through the # operator.  */
7890 #define CASE(x) case x: gbp_string = #x; break;
7897 CASE (GBP_SPACINGMARK)
/* Separator between entries (presumably skipped before the first one).  */
7909 fprintf (stream, ",\n");
/* The code point written is ch + 1, i.e. the first code point after the
   run, so each entry gives the exclusive upper bound of its run.  */
7910 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
7914 fprintf (stream, "\n");
7916 if (ferror (stream) || fclose (stream))
7918 fprintf (stderr, "error writing to '%s'\n", filename);
7923 /* Output the per-character grapheme break property table. */
/* Writes FILENAME as a generated file containing the gbrkprop_header_N
   macros and the static const table unigbrkprop, built as a 3-level table
   from unicode_org_gbp[].  VERSION is presumably the Unicode version
   printed in the banner.  Unlike the line and word break tables, level 2
   is declared as short and level 3 holds two 4-bit values per byte.  */
7925 output_gbp_table (const char *filename, const char *version)
7930 unsigned int level1_offset, level2_offset, level3_offset;
7932 stream = fopen (filename, "w");
7935 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7939 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7940 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
7941 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7946 gbp_table_init (&t);
/* Every code point is added, including those holding GBP_OTHER.  */
7948 for (ch = 0; ch < 0x110000; ch++)
7949 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
7951 gbp_table_finalize (&t);
7953 /* Offsets in t.result, in memory of this process. */
/* Five uint32_t header words, then level1_size uint32_t entries, then
   (level2_size << q) uint32_t entries; presumably assigned to
   level1_offset, level2_offset and level3_offset in that order.  */
7955 5 * sizeof (uint32_t);
7957 5 * sizeof (uint32_t)
7958 + t.level1_size * sizeof (uint32_t);
7960 5 * sizeof (uint32_t)
7961 + t.level1_size * sizeof (uint32_t)
7962 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words of t.result as preprocessor constants.  */
7964 for (i = 0; i < 5; i++)
7965 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
7966 ((uint32_t *) t.result)[i]);
7967 fprintf (stream, "static const\n");
7968 fprintf (stream, "struct\n");
7969 fprintf (stream, " {\n");
7970 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7971 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Half as many bytes as elements, because two elements share one byte.  */
7972 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
7973 t.level3_size, t.p);
7974 fprintf (stream, " }\n");
7975 fprintf (stream, "unigbrkprop =\n");
7976 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  */
7977 fprintf (stream, " {");
7978 if (t.level1_size > 8)
7979 fprintf (stream, "\n ");
7980 for (i = 0; i < t.level1_size; i++)
7983 if (i > 0 && (i % 8) == 0)
7984 fprintf (stream, "\n ");
7985 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an index relative to the start of
   level 2, counted in uint32_t units.
   NOTE(review): the condition that selects -1 is not visible here.  */
7987 fprintf (stream, " %5d", -1);
7989 fprintf (stream, " %5zu",
7990 (offset - level2_offset) / sizeof (uint32_t));
7991 if (i+1 < t.level1_size)
7992 fprintf (stream, ",");
7994 if (t.level1_size > 8)
7995 fprintf (stream, "\n ");
7996 fprintf (stream, " },\n");
/* Level 2: indices into level 3 are halved to match the packed layout.  */
7997 fprintf (stream, " {");
7998 if (t.level2_size << t.q > 8)
7999 fprintf (stream, "\n ");
8000 for (i = 0; i < t.level2_size << t.q; i++)
8003 if (i > 0 && (i % 8) == 0)
8004 fprintf (stream, "\n ");
8005 offset = ((uint32_t *) (t.result + level2_offset))[i];
8007 fprintf (stream, " %5d", -1);
8009 fprintf (stream, " %5zu",
8010 (offset - level3_offset) / sizeof (uint8_t) / 2);
8011 if (i+1 < t.level2_size << t.q)
8012 fprintf (stream, ",");
8014 if (t.level2_size << t.q > 8)
8015 fprintf (stream, "\n ");
8016 fprintf (stream, " },\n");
/* Level 3: each output byte combines two consecutive elements, the even
   indexed one in the low 4 bits and the odd indexed one in the high 4
   bits.  */
8017 fprintf (stream, " {");
8018 if (t.level3_size << t.p > 8)
8019 fprintf (stream, "\n ");
8020 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
8022 unsigned char *p = (unsigned char *) (t.result + level3_offset);
8023 unsigned char value0 = p[i * 2];
8024 unsigned char value1 = p[i * 2 + 1];
8025 if (i > 0 && (i % 8) == 0)
8026 fprintf (stream, "\n ");
8027 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
8028 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
8030 if (t.level3_size << t.p > 8)
8031 fprintf (stream, "\n ");
8032 fprintf (stream, " }\n");
8033 fprintf (stream, "};\n");
8035 if (ferror (stream) || fclose (stream))
8037 fprintf (stderr, "error writing to '%s'\n", filename);
8042 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8043 GraphemeBreakProperty.txt file. */
/* Every element is first set to GBP_OTHER.  The file is then read line by
   line; each data line names a code point or a range and a property value,
   which is stored for all code points of the range.  Problems are reported
   on stderr.  */
8045 fill_org_gbp (const char *graphemebreakproperty_filename)
8051 for (i = 0; i < 0x110000; i++)
8052 unicode_org_gbp[i] = GBP_OTHER;
8054 stream = fopen (graphemebreakproperty_filename, "r");
8057 fprintf (stderr, "error during fopen of '%s'\n",
8058 graphemebreakproperty_filename);
8065 unsigned int i1, i2;
8066 char padding[200+1];
8067 char propname[200+1];
/* Read at most 200 characters of one line into buf; the trailing \n in the
   format then consumes the newline and any following white space, which
   also swallows blank lines.  A result below 1 means nothing was read.  */
8071 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and comment lines (presumably skipped; the action is not
   visible here).  */
8074 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form XXXX..YYYY, then a single code point.  In both
   forms a run of spaces and semicolons goes into padding and the following
   word, up to the next space, into propname.
   NOTE(review): presumably i2 is set to i1 for the single form - not
   visible here.  */
8077 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8079 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8081 fprintf (stderr, "parse error in '%s'\n",
8082 graphemebreakproperty_filename);
/* Each PROP expansion ends in a dangling else, so the entries below form
   one if / else-if chain whose final else is the error report after the
   list.  */
8087 #define PROP(name,value) \
8088 if (strcmp (propname, name) == 0) propvalue = value; else
8091 PROP ("Control", GBP_CONTROL)
8092 PROP ("Extend", GBP_EXTEND)
8093 PROP ("Prepend", GBP_PREPEND)
8094 PROP ("SpacingMark", GBP_SPACINGMARK)
8099 PROP ("LVT", GBP_LVT)
/* Unlike the other diagnostics in this function, this one also prints
   lineno.  */
8102 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8103 graphemebreakproperty_filename, lineno);
/* Sanity check: the range must be ordered and lie within 0 .. 0x10FFFF
   before it is used to index unicode_org_gbp[].  */
8106 if (!(i1 <= i2 && i2 < 0x110000))
8109 for (i = i1; i <= i2; i++)
8110 unicode_org_gbp[i] = propvalue;
8113 if (ferror (stream) || fclose (stream))
8115 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8120 /* ========================================================================= */
8122 /* Composition and decomposition.
8123 Updated for Unicode TR #15 revision 33. */
8125 /* Maximum number of characters into which a single Unicode character can be decomposed. */
8127 #define MAX_DECOMP_LENGTH 18
8131 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
8132 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
8133 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
8134 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
8135 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
8136 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
8137 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
8138 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
8139 UC_DECOMP_SUPER, /* <super> A superscript form. */
8140 UC_DECOMP_SUB, /* <sub> A subscript form. */
8141 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
8142 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
8143 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
8144 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
8145 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
8146 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
8147 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
/* get_decomposition returns one of these values, or -1 when there is no
   decomposition.  output_decomposition stores the value shifted left by 18
   below a flag at bit 23, which leaves 5 bits for it; new enumerators must
   keep the values below 32.  */
8150 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8151 decompositions). Return the type, or -1 for none. */
/* The decomposition is parsed from the text in
   unicode_attributes[ch].decomposition: an optional <tag> naming the type,
   followed by hexadecimal code points separated by spaces.  The code points
   are stored in DECOMPOSED; LENGTHP presumably receives their count (the
   store is not visible here).  */
8153 get_decomposition (unsigned int ch,
8154 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8156 const char *decomposition = unicode_attributes[ch].decomposition;
8158 if (decomposition != NULL && decomposition[0] != '\0')
/* Without a <tag> the decomposition is canonical.  */
8160 int type = UC_DECOMP_CANONICAL;
8161 unsigned int length;
8164 if (decomposition[0] == '<')
8169 rangle = strchr (decomposition + 1, '>');
/* typelen covers the tag including both angle brackets.  */
8172 typelen = rangle + 1 - decomposition;
/* TYPE compares the tag, brackets included, against a literal of exactly
   the same length.  */
8173 #define TYPE(t1,t2) \
8174 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8177 TYPE ("<font>", UC_DECOMP_FONT)
8178 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8179 TYPE ("<initial>", UC_DECOMP_INITIAL)
8180 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8181 TYPE ("<final>", UC_DECOMP_FINAL)
8182 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8183 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8184 TYPE ("<super>", UC_DECOMP_SUPER)
8185 TYPE ("<sub>", UC_DECOMP_SUB)
8186 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8187 TYPE ("<wide>", UC_DECOMP_WIDE)
8188 TYPE ("<narrow>", UC_DECOMP_NARROW)
8189 TYPE ("<small>", UC_DECOMP_SMALL)
8190 TYPE ("<square>", UC_DECOMP_SQUARE)
8191 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8192 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* NOTE(review): %*s sets a minimum field width, it does not truncate, so
   the whole remaining decomposition text is printed; %.*s (precision)
   looks intended in order to print only the tag - confirm.  */
8194 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
/* Continue parsing after the tag and one optional space.  */
8198 decomposition = rangle + 1;
8199 if (decomposition[0] == ' ')
/* Parse up to MAX_DECOMP_LENGTH hexadecimal code points.  */
8202 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8204 decomposed[length] = strtoul (decomposition, &endptr, 16);
/* strtoul consumed nothing: no further number is present.  */
8205 if (endptr == decomposition)
8207 decomposition = endptr;
8208 if (decomposition[0] == ' ')
/* Text left over after the loop means there were more code points than
   DECOMPOSED can hold.  */
8211 if (*decomposition != '\0')
8212 /* MAX_DECOMP_LENGTH is too small. */
8222 /* Construction of sparse 3-level tables. */
/* Parameters for the table code used by output_decomposition
   (decomp_table_init, decomp_table_add, decomp_table_finalize): the table
   is named decomp_table, its elements are uint16_t, and the all-ones value
   (uint16_t)(-1) is the default element value.  xmalloc and xrealloc are
   mapped onto plain malloc and realloc.
   NOTE(review): presumably consumed by a template header included right
   after these definitions - the include is not visible here.  */
8223 #define TABLE decomp_table
8224 #define ELEMENT uint16_t
8225 #define DEFAULT (uint16_t)(-1)
8226 #define xmalloc malloc
8227 #define xrealloc realloc
/* Writes the decomposition data as C source text.  Two objects are
   produced: gl_uninorm_decomp_chars_table, a byte array of 3-byte entries
   holding the decomposed code points, and gl_uninorm_decomp_index_table, a
   3-level table that maps a code point to a position in that byte array.
   STREAM1 receives the declarations and the decomp_header_N macros,
   STREAM2 the definitions.  */
8231 output_decomposition (FILE *stream1, FILE *stream2)
8233 struct decomp_table t;
8234 unsigned int level1_offset, level2_offset, level3_offset;
8235 unsigned int offset;
8241 decomp_table_init (&t);
8243 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8244 fprintf (stream1, "\n");
8245 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8248 for (ch = 0; ch < 0x110000; ch++)
8250 unsigned int length;
8251 unsigned int decomposed[MAX_DECOMP_LENGTH];
8252 int type = get_decomposition (ch, &length, decomposed);
/* The index table element keeps the offset in its low 15 bits, hence the
   range check; bit 15 is 0 for a canonical decomposition and 1 for any
   other type.  */
8256 if (!(offset < (1 << 15)))
8258 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
8260 /* Produce length 3-bytes entries. */
8262 /* We would need a special representation of zero-length entries. */
8264 for (i = 0; i < length; i++)
8267 fprintf (stream2, ",");
/* Start a new output line every four entries (presumably offset counts
   3-byte entries and is advanced once per entry; not visible here).  */
8268 if ((offset % 4) == 0)
8269 fprintf (stream2, "\n ");
/* The code point has to fit into the low 18 bits of the 24-bit entry.  */
8270 if (!(decomposed[i] < (1 << 18)))
/* Layout of one 24-bit entry, written as three bytes, high byte first:
   bit 23 is set when another entry of the same decomposition follows,
   bits 18..22 carry the decomposition type on the first entry only, and
   bits 0..17 are the code point.  */
8272 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
8273 (((i+1 < length ? (1 << 23) : 0)
8274 | (i == 0 ? (type << 18) : 0)
8275 | decomposed[i]) >> 16) & 0xff,
8276 (decomposed[i] >> 8) & 0xff,
8277 decomposed[i] & 0xff);
8283 fprintf (stream2, "\n};\n");
8284 fprintf (stream2, "\n");
8286 decomp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five uint32_t
   header words, then level1_size uint32_t entries, then
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset in that order.  */
8289 5 * sizeof (uint32_t);
8291 5 * sizeof (uint32_t)
8292 + t.level1_size * sizeof (uint32_t);
8294 5 * sizeof (uint32_t)
8295 + t.level1_size * sizeof (uint32_t)
8296 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words of t.result as preprocessor constants.  */
8298 for (i = 0; i < 5; i++)
8299 fprintf (stream1, "#define decomp_header_%d %d\n", i,
8300 ((uint32_t *) t.result)[i]);
8301 fprintf (stream1, "\n");
8302 fprintf (stream1, "typedef struct\n");
8303 fprintf (stream1, " {\n");
8304 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8305 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8306 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
8307 fprintf (stream1, " }\n");
8308 fprintf (stream1, "decomp_index_table_t;\n");
8309 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8310 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8311 fprintf (stream2, "{\n");
/* Level 1: eight entries per output line.  */
8312 fprintf (stream2, " {");
8313 if (t.level1_size > 8)
8314 fprintf (stream2, "\n ");
8315 for (i = 0; i < t.level1_size; i++)
8318 if (i > 0 && (i % 8) == 0)
8319 fprintf (stream2, "\n ");
8320 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an index relative to the start of
   level 2, counted in uint32_t units.
   NOTE(review): the condition that selects -1 is not visible here.  */
8322 fprintf (stream2, " %5d", -1);
8324 fprintf (stream2, " %5zu",
8325 (offset - level2_offset) / sizeof (uint32_t));
8326 if (i+1 < t.level1_size)
8327 fprintf (stream2, ",");
8329 if (t.level1_size > 8)
8330 fprintf (stream2, "\n ");
8331 fprintf (stream2, " },\n");
/* Level 2: indices are relative to the start of level 3 and counted in
   uint16_t units.  */
8332 fprintf (stream2, " {");
8333 if (t.level2_size << t.q > 8)
8334 fprintf (stream2, "\n ");
8335 for (i = 0; i < t.level2_size << t.q; i++)
8338 if (i > 0 && (i % 8) == 0)
8339 fprintf (stream2, "\n ");
8340 offset = ((uint32_t *) (t.result + level2_offset))[i];
8342 fprintf (stream2, " %5d", -1);
8344 fprintf (stream2, " %5zu",
8345 (offset - level3_offset) / sizeof (uint16_t));
8346 if (i+1 < t.level2_size << t.q)
8347 fprintf (stream2, ",");
8349 if (t.level2_size << t.q > 8)
8350 fprintf (stream2, "\n ");
8351 fprintf (stream2, " },\n");
/* Level 3: the 16-bit elements; the default value (uint16_t)(-1) is
   written as -1, every other value as a decimal number.  */
8352 fprintf (stream2, " {");
8353 if (t.level3_size << t.p > 8)
8354 fprintf (stream2, "\n ");
8355 for (i = 0; i < t.level3_size << t.p; i++)
8357 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
8358 if (i > 0 && (i % 8) == 0)
8359 fprintf (stream2, "\n ");
8360 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
8361 if (i+1 < t.level3_size << t.p)
8362 fprintf (stream2, ",");
8364 if (t.level3_size << t.p > 8)
8365 fprintf (stream2, "\n ");
8366 fprintf (stream2, " }\n");
8367 fprintf (stream2, "};\n");
/* Writes the decomposition tables for Unicode VERSION into two files.
   Both files are opened first and receive the same generated-file banner;
   output_decomposition is then called with the stream of FILENAME1 as its
   first and the stream of FILENAME2 as its second argument.
   Exits with status 1 if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t k;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open every output before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      FILE *fp = fopen (filenames[k], "w");

      streams[k] = fp;
      if (fp == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  /* Identical banner at the top of both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Decomposition of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);
    }

  output_decomposition (streams[0], streams[1]);

  /* A pending stream error is reported without closing; otherwise the
     result of fclose decides.  */
  for (k = 0; k < 2; k++)
    {
      if (ferror (streams[k]) != 0 || fclose (streams[k]) != 0)
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[k]);
          exit (1);
        }
    }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file. */
char unicode_composition_exclusions[0x110000];

/* Reads COMPOSITIONEXCLUSIONS_FILENAME and sets
   unicode_composition_exclusions[c] to 1 for every code point c listed in
   it, after resetting the whole table to 0.
   A data line starts with a hexadecimal code point.  Empty lines and lines
   whose first non-blank character is '#' are ignored, wherever they occur
   in the file.  Only the first 200 characters of a line are examined; the
   rest of an overlong line is discarded instead of being parsed as a new
   record.
   Exits with status 1 on I/O or parse errors; aborts if a code point is
   not below 0x110000.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  unsigned int i;

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  for (i = 0; i < 0x110000; i++)
    unicode_composition_exclusions[i] = 0;

  for (;;)
    {
      char buf[200+1];
      const char *p;
      size_t len;
      unsigned int code;

      if (fgets (buf, sizeof (buf), stream) == NULL)
        break;

      len = strlen (buf);
      if (len > 0 && buf[len - 1] == '\n')
        buf[len - 1] = '\0';
      else
        {
          /* No newline was stored: either the line is longer than the
             buffer or it is the last line of the file.  Skip whatever is
             left of it so that the tail is never taken for a record.  */
          int c;

          do
            c = getc (stream);
          while (c != EOF && c != '\n');
        }

      /* Leading blanks never carried meaning in this format.  */
      p = buf;
      while (*p == ' ' || *p == '\t' || *p == '\r')
        p++;
      if (*p == '\0' || *p == '#')
        continue;

      if (sscanf (p, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      if (!(code < 0x110000))
        abort ();

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
8462 debug_output_composition_tables (const char *filename)
8467 stream = fopen (filename, "w");
8470 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8474 for (ch = 0; ch < 0x110000; ch++)
8476 unsigned int length;
8477 unsigned int decomposed[MAX_DECOMP_LENGTH];
8478 int type = get_decomposition (ch, &length, decomposed);
8480 if (type == UC_DECOMP_CANONICAL
8481 /* Consider only binary decompositions.
8482 Exclude singleton decompositions. */
8485 unsigned int code1 = decomposed[0];
8486 unsigned int code2 = decomposed[1];
8487 unsigned int combined = ch;
8489 /* Exclude decompositions where the first part is not a starter,
8490 i.e. is not of canonical combining class 0. */
8491 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8492 /* Exclude characters listed in CompositionExclusions.txt. */
8493 && !unicode_composition_exclusions[combined])
8495 /* The combined character must now also be a starter.
8497 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
8500 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
8504 unicode_attributes[code2].combining);
8509 if (ferror (stream) || fclose (stream))
8511 fprintf (stderr, "error writing to '%s'\n", filename);
8517 output_composition_tables (const char *filename, const char *version)
8522 stream = fopen (filename, "w");
8525 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8529 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8530 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
8531 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8533 fprintf (stream, "\n");
8535 /* Put a GPL header on it. The gnulib module is under LGPL (although it
8536 still carries the GPL header), and it's gnulib-tool which replaces the
8537 GPL header with an LGPL header. */
8538 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
8539 fprintf (stream, "\n");
8540 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8541 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8542 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8543 fprintf (stream, " (at your option) any later version.\n");
8544 fprintf (stream, "\n");
8545 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8546 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8547 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8548 fprintf (stream, " GNU General Public License for more details.\n");
8549 fprintf (stream, "\n");
8550 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8551 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8552 fprintf (stream, "\n");
8554 /* The composition table is a set of mappings (code1, code2) -> combined,
8556 367 values for code1 (from 0x003C to 0x30FD),
8557 54 values for code2 (from 0x0300 to 0x309A).
8558 For a fixed code1, there are from 1 to 19 possible values for code2.
8559 For a fixed code2, there are from 1 to 117 possible values for code1.
8560 This is a very sparse matrix.
8562 We want an O(1) hash lookup.
8564 We could implement the hash lookup by mapping (code1, code2) to a linear
8565 combination mul1*code1 + mul2*code2, which is then used as an index into
8566 a 3-level table. But this leads to a table of size 37 KB.
8568 We use gperf to implement the hash lookup, giving it the 928 sets of
8569 4 bytes (code1, code2) as input. gperf generates a hash table of size
8570 1527, which is quite good (60% filled). It requires an auxiliary table
8571 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
8573 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
8574 fprintf (stream, "%%struct-type\n");
8575 fprintf (stream, "%%language=ANSI-C\n");
8576 fprintf (stream, "%%define slot-name codes\n");
8577 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
8578 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
8579 fprintf (stream, "%%compare-lengths\n");
8580 fprintf (stream, "%%compare-strncmp\n");
8581 fprintf (stream, "%%readonly-tables\n");
8582 fprintf (stream, "%%omit-struct-type\n");
8583 fprintf (stream, "%%%%\n");
8585 for (ch = 0; ch < 0x110000; ch++)
8587 unsigned int length;
8588 unsigned int decomposed[MAX_DECOMP_LENGTH];
8589 int type = get_decomposition (ch, &length, decomposed);
8591 if (type == UC_DECOMP_CANONICAL
8592 /* Consider only binary decompositions.
8593 Exclude singleton decompositions. */
8596 unsigned int code1 = decomposed[0];
8597 unsigned int code2 = decomposed[1];
8598 unsigned int combined = ch;
8600 /* Exclude decompositions where the first part is not a starter,
8601 i.e. is not of canonical combining class 0. */
8602 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8603 /* Exclude characters listed in CompositionExclusions.txt. */
8604 && !unicode_composition_exclusions[combined])
8606 /* The combined character must now also be a starter.
8608 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
8611 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
8612 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
8613 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
8619 if (ferror (stream) || fclose (stream))
8621 fprintf (stderr, "error writing to '%s'\n", filename);
8626 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FUNC is evaluated for every code point below 0x110000; each code point
   that FUNC does not map to itself yields one "{ from, to }" entry.
   FUNCTION_NAME is the function the generated MAP macro expands to and
   VERSION the Unicode version named in the banner.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  static const char * const preamble[] =
    {
      "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
      "/* Test the Unicode character mapping functions.\n",
      " Copyright (C) 2009 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  FILE *stream;
  unsigned int ch;
  unsigned long emitted = 0;
  size_t n;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (n = 0; n < sizeof (preamble) / sizeof (preamble[0]); n++)
    fputs (preamble[n], stream);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", stream);
  fputs ("#include \"test-mapping-part1.h\"\n", stream);
  fputs ("\n", stream);

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      /* Entries are separated by ",\n"; the last one gets a bare "\n".  */
      if (emitted > 0)
        fputs (",\n", stream);
      fprintf (stream, " { 0x%04X, 0x%04X }", ch, mapped);
      emitted++;
    }
  if (emitted > 0)
    fputs ("\n", stream);

  fputs ("\n", stream);
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fputs ("#include \"test-mapping-part2.h\"\n", stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8697 /* Construction of sparse 3-level tables. */
8698 #define TABLE mapping_table
8699 #define ELEMENT int32_t
8701 #define xmalloc malloc
8702 #define xrealloc realloc
8705 /* Output a simple character mapping table to the given file. */
8708 output_simple_mapping (const char *filename,
8709 unsigned int (*func) (unsigned int),
8710 const char *version)
8714 struct mapping_table t;
8715 unsigned int level1_offset, level2_offset, level3_offset;
8717 stream = fopen (filename, "w");
8720 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8724 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8725 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
8726 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8731 mapping_table_init (&t);
8733 for (ch = 0; ch < 0x110000; ch++)
8735 int value = (int) func (ch) - (int) ch;
8737 mapping_table_add (&t, ch, value);
8740 mapping_table_finalize (&t);
8742 /* Offsets in t.result, in memory of this process. */
8744 5 * sizeof (uint32_t);
8746 5 * sizeof (uint32_t)
8747 + t.level1_size * sizeof (uint32_t);
8749 5 * sizeof (uint32_t)
8750 + t.level1_size * sizeof (uint32_t)
8751 + (t.level2_size << t.q) * sizeof (uint32_t);
8753 for (i = 0; i < 5; i++)
8754 fprintf (stream, "#define mapping_header_%d %d\n", i,
8755 ((uint32_t *) t.result)[i]);
8756 fprintf (stream, "static const\n");
8757 fprintf (stream, "struct\n");
8758 fprintf (stream, " {\n");
8759 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8760 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8761 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
8762 fprintf (stream, " }\n");
8763 fprintf (stream, "u_mapping =\n");
8764 fprintf (stream, "{\n");
8765 fprintf (stream, " {");
8766 if (t.level1_size > 8)
8767 fprintf (stream, "\n ");
8768 for (i = 0; i < t.level1_size; i++)
8771 if (i > 0 && (i % 8) == 0)
8772 fprintf (stream, "\n ");
8773 offset = ((uint32_t *) (t.result + level1_offset))[i];
8775 fprintf (stream, " %5d", -1);
8777 fprintf (stream, " %5zu",
8778 (offset - level2_offset) / sizeof (uint32_t));
8779 if (i+1 < t.level1_size)
8780 fprintf (stream, ",");
8782 if (t.level1_size > 8)
8783 fprintf (stream, "\n ");
8784 fprintf (stream, " },\n");
8785 fprintf (stream, " {");
8786 if (t.level2_size << t.q > 8)
8787 fprintf (stream, "\n ");
8788 for (i = 0; i < t.level2_size << t.q; i++)
8791 if (i > 0 && (i % 8) == 0)
8792 fprintf (stream, "\n ");
8793 offset = ((uint32_t *) (t.result + level2_offset))[i];
8795 fprintf (stream, " %5d", -1);
8797 fprintf (stream, " %5zu",
8798 (offset - level3_offset) / sizeof (int32_t));
8799 if (i+1 < t.level2_size << t.q)
8800 fprintf (stream, ",");
8802 if (t.level2_size << t.q > 8)
8803 fprintf (stream, "\n ");
8804 fprintf (stream, " },\n");
8805 fprintf (stream, " {");
8806 if (t.level3_size << t.p > 8)
8807 fprintf (stream, "\n ");
8808 for (i = 0; i < t.level3_size << t.p; i++)
8810 if (i > 0 && (i % 8) == 0)
8811 fprintf (stream, "\n ");
8812 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
8813 if (i+1 < t.level3_size << t.p)
8814 fprintf (stream, ",");
8816 if (t.level3_size << t.p > 8)
8817 fprintf (stream, "\n ");
8818 fprintf (stream, " }\n");
8819 fprintf (stream, "};\n");
8821 if (ferror (stream) || fclose (stream))
8823 fprintf (stderr, "error writing to '%s'\n", filename);
8828 /* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x.  */
enum
{
  SCC_ALWAYS            = 0,
  SCC_FINAL_SIGMA       = 1,
  SCC_AFTER_SOFT_DOTTED = 2,
  SCC_MORE_ABOVE        = 3,
  SCC_BEFORE_DOT        = 4,
  SCC_AFTER_I           = 5
};
/* A special casing rule.  */
struct special_casing_rule
{
  unsigned int code;                /* Code point the rule applies to */
  unsigned int lower_mapping[3];    /* Up to 3 code points, 0-padded */
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  const char *language;             /* 2-letter language code, or NULL */
  int context;                      /* An SCC_* value, negative if negated */
};

/* The special casing rules. */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules.  The pointer is stored as is; the rule
   is not copied.  The table doubles in size when it is full, starting at
   16 slots.  Exits with status 1 if the table cannot be enlarged.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_table;

      if (new_allocated < 16)
        new_allocated = 16;

      /* Keep the old table pointer until the enlargement has succeeded.  */
      new_table =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_table == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_table;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
8874 /* Stores in casing_rules the special casing rules found in
8875 specialcasing_filename. */
8877 fill_casing_rules (const char *specialcasing_filename)
8881 stream = fopen (specialcasing_filename, "r");
8884 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
8888 casing_rules = NULL;
8889 num_casing_rules = 0;
8890 allocated_casing_rules = 0;
8900 unsigned int lower_mapping[3];
8901 unsigned int title_mapping[3];
8902 unsigned int upper_mapping[3];
8906 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8909 if (buf[0] == '\0' || buf[0] == '#')
8914 code = strtoul (scanptr, &endptr, 16);
8915 if (endptr == scanptr)
8917 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8921 if (*scanptr != ';')
8923 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8928 /* Scan lower mapping. */
8929 for (i = 0; i < 3; i++)
8930 lower_mapping[i] = 0;
8931 for (i = 0; i < 3; i++)
8933 while (*scanptr == ' ')
8935 if (*scanptr == ';')
8937 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
8938 if (endptr == scanptr)
8940 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8945 if (*scanptr != ';')
8947 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8952 /* Scan title mapping. */
8953 for (i = 0; i < 3; i++)
8954 title_mapping[i] = 0;
8955 for (i = 0; i < 3; i++)
8957 while (*scanptr == ' ')
8959 if (*scanptr == ';')
8961 title_mapping[i] = strtoul (scanptr, &endptr, 16);
8962 if (endptr == scanptr)
8964 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8969 if (*scanptr != ';')
8971 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8976 /* Scan upper mapping. */
8977 for (i = 0; i < 3; i++)
8978 upper_mapping[i] = 0;
8979 for (i = 0; i < 3; i++)
8981 while (*scanptr == ' ')
8983 if (*scanptr == ';')
8985 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
8986 if (endptr == scanptr)
8988 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8993 if (*scanptr != ';')
8995 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9000 /* Scan language and context. */
9002 context = SCC_ALWAYS;
9003 while (*scanptr == ' ')
9005 if (*scanptr != '\0' && *scanptr != '#')
9007 const char *word_begin = scanptr;
9008 const char *word_end;
9010 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9014 while (*scanptr == ' ')
9017 if (word_end - word_begin == 2)
9019 language = (char *) malloc ((word_end - word_begin) + 1);
9020 memcpy (language, word_begin, 2);
9021 language[word_end - word_begin] = '\0';
9022 word_begin = word_end = NULL;
9024 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9026 word_begin = scanptr;
9027 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9033 if (word_end > word_begin)
9035 bool negate = false;
9037 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9042 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9043 context = SCC_FINAL_SIGMA;
9044 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9045 context = SCC_AFTER_SOFT_DOTTED;
9046 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9047 context = SCC_MORE_ABOVE;
9048 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9049 context = SCC_BEFORE_DOT;
9050 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9051 context = SCC_AFTER_I;
9054 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9058 context = - context;
9061 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9063 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9068 /* Store the rule. */
9070 struct special_casing_rule *new_rule =
9071 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9072 new_rule->code = code;
9073 new_rule->language = language;
9074 new_rule->context = context;
9075 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9076 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9077 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9079 add_casing_rule (new_rule);
9083 if (ferror (stream) || fclose (stream))
9085 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.  */
struct casefold_rule
{
  unsigned int code;        /* Code point the rule applies to */
  unsigned int mapping[3];  /* Up to 3 code points, 0-padded */
  const char *language;     /* 2-letter language code, or NULL */
};

/* The casefolding rules. */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.
   Each data line has the fields "code; type; mapping;" where type is one
   of C, F, S, T.  Lines of type 'S' are skipped.  A line of type 'T'
   yields two rules, for the languages "tr" and "az"; any other line yields
   one rule with a NULL language.
   Exits with status 1 on I/O errors, parse errors or memory exhaustion.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr)
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr = endptr;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      switch (*scanptr)
        {
        case 'C': case 'F': case 'S': case 'T':
          type = *scanptr;
          break;
        default:
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan casefold mapping. */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            {
              fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
              exit (1);
            }
          scanptr = endptr;
        }
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
      if (type != 'S')
        {
          const char * const *languages;
          unsigned int languages_count;
          unsigned int l;

          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          if (type == 'T')
            {
              static const char * const turkish_languages[] = { "tr", "az" };
              languages = turkish_languages;
              languages_count = 2;
            }
          else
            {
              static const char * const all_languages[] = { NULL };
              languages = all_languages;
              languages_count = 1;
            }

          for (l = 0; l < languages_count; l++)
            {
              /* Store a new rule. */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
              if (new_rule == NULL)
                {
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[l];

              if (num_casefolding_rules == allocated_casefolding_rules)
                {
                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
                  struct casefold_rule **new_table;

                  if (new_allocated < 16)
                    new_allocated = 16;
                  /* Keep the old table pointer until the enlargement has
                     succeeded.  */
                  new_table =
                    (struct casefold_rule **)
                    realloc (casefolding_rules,
                             new_allocated * sizeof (struct casefold_rule *));
                  if (new_table == NULL)
                    {
                      fprintf (stderr, "out of memory\n");
                      exit (1);
                    }
                  casefolding_rules = new_table;
                  allocated_casefolding_rules = new_allocated;
                }
              casefolding_rules[num_casefolding_rules++] = new_rule;
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
/* Casefold mapping, when it maps to a single character. */
unsigned int unicode_casefold[0x110000];

/* Returns the entry of unicode_casefold for CH.
   A value that is not below 0x110000 has no entry in the table and is
   returned unchanged instead of being used as an index.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < 0x110000))
    return ch;
  return unicode_casefold[ch];
}
9260 /* Redistribute the casefolding_rules:
9261 - Rules that map to a single character, language independently, are stored
9262 in unicode_casefold.
9263 - Other rules are merged into casing_rules. */
9265 redistribute_casefolding_rules (void)
9267 unsigned int ch, i, j;
9269 /* Fill unicode_casefold[]. */
9270 for (ch = 0; ch < 0x110000; ch++)
9271 unicode_casefold[ch] = ch;
9272 for (i = 0; i < num_casefolding_rules; i++)
9274 struct casefold_rule *cfrule = casefolding_rules[i];
9276 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
9279 if (!(ch < 0x110000))
9281 unicode_casefold[ch] = cfrule->mapping[0];
9285 /* Extend the special casing rules by filling in their casefold_mapping[]
9287 for (j = 0; j < num_casing_rules; j++)
9289 struct special_casing_rule *rule = casing_rules[j];
9292 rule->casefold_mapping[0] = to_casefold (rule->code);
9293 for (k = 1; k < 3; k++)
9294 rule->casefold_mapping[k] = 0;
9297 /* Now merge the other casefolding rules into casing_rules. */
9298 for (i = 0; i < num_casefolding_rules; i++)
9300 struct casefold_rule *cfrule = casefolding_rules[i];
9302 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
9304 /* Find a rule that applies to the same code, same language, and it
9305 has context SCC_ALWAYS. At the same time, update all rules that
9306 have the same code and same or more specific language. */
9307 struct special_casing_rule *found_rule = NULL;
9309 for (j = 0; j < num_casing_rules; j++)
9311 struct special_casing_rule *rule = casing_rules[j];
9313 if (rule->code == cfrule->code
9314 && (cfrule->language == NULL
9315 || (rule->language != NULL
9316 && strcmp (rule->language, cfrule->language) == 0)))
9318 memcpy (rule->casefold_mapping, cfrule->mapping,
9319 sizeof (rule->casefold_mapping));
9321 if ((cfrule->language == NULL
9322 ? rule->language == NULL
9323 : rule->language != NULL
9324 && strcmp (rule->language, cfrule->language) == 0)
9325 && rule->context == SCC_ALWAYS)
9333 if (found_rule == NULL)
9335 /* Create a new rule. */
9336 struct special_casing_rule *new_rule =
9337 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9339 /* Try to find a rule that applies to the same code, no language
9340 restriction, and with context SCC_ALWAYS. */
9341 for (j = 0; j < num_casing_rules; j++)
9343 struct special_casing_rule *rule = casing_rules[j];
9345 if (rule->code == cfrule->code
9346 && rule->context == SCC_ALWAYS
9347 && rule->language == NULL)
9355 new_rule->code = cfrule->code;
9356 new_rule->language = cfrule->language;
9357 new_rule->context = SCC_ALWAYS;
9358 if (found_rule != NULL)
9360 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
9361 sizeof (new_rule->lower_mapping));
9362 memcpy (new_rule->title_mapping, found_rule->title_mapping,
9363 sizeof (new_rule->title_mapping));
9364 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
9365 sizeof (new_rule->upper_mapping));
9371 new_rule->lower_mapping[0] = to_lower (cfrule->code);
9372 for (k = 1; k < 3; k++)
9373 new_rule->lower_mapping[k] = 0;
9374 new_rule->title_mapping[0] = to_title (cfrule->code);
9375 for (k = 1; k < 3; k++)
9376 new_rule->title_mapping[k] = 0;
9377 new_rule->upper_mapping[0] = to_upper (cfrule->code);
9378 for (k = 1; k < 3; k++)
9379 new_rule->upper_mapping[k] = 0;
9381 memcpy (new_rule->casefold_mapping, cfrule->mapping,
9382 sizeof (new_rule->casefold_mapping));
9384 add_casing_rule (new_rule);
9391 compare_casing_rules (const void *a, const void *b)
9393 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
9394 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
9395 unsigned int a_code = a_rule->code;
9396 unsigned int b_code = b_rule->code;
9398 if (a_code < b_code)
9400 if (a_code > b_code)
9403 /* Sort the more specific rules before the more general ones. */
9404 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
9405 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
9409 sort_casing_rules (void)
9411 /* Sort the rules 1. by code, 2. by specificity. */
9412 if (num_casing_rules > 1)
9413 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
9414 compare_casing_rules);
9417 /* Output the special casing rules. */
9419 output_casing_rules (const char *filename, const char *version)
9425 stream = fopen (filename, "w");
9428 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9432 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9433 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
9434 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9436 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
9437 fprintf (stream, "%%struct-type\n");
9438 fprintf (stream, "%%language=ANSI-C\n");
9439 fprintf (stream, "%%define slot-name code\n");
9440 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
9441 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
9442 fprintf (stream, "%%compare-lengths\n");
9443 fprintf (stream, "%%compare-strncmp\n");
9444 fprintf (stream, "%%readonly-tables\n");
9445 fprintf (stream, "%%omit-struct-type\n");
9446 fprintf (stream, "%%%%\n");
9449 for (i = 0; i < num_casing_rules; i++)
9451 struct special_casing_rule *rule = casing_rules[i];
9454 if (i > 0 && rule->code == casing_rules[i - 1]->code)
9459 if (!(rule->code < 0x10000))
9461 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
9465 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
9466 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
9468 fprintf (stream, "%d, ",
9469 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
9471 context = rule->context;
9474 fprintf (stream, "-");
9475 context = - context;
9478 fprintf (stream, " ");
9482 fprintf (stream, "SCC_ALWAYS ");
9484 case SCC_FINAL_SIGMA:
9485 fprintf (stream, "SCC_FINAL_SIGMA ");
9487 case SCC_AFTER_SOFT_DOTTED:
9488 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
9490 case SCC_MORE_ABOVE:
9491 fprintf (stream, "SCC_MORE_ABOVE ");
9493 case SCC_BEFORE_DOT:
9494 fprintf (stream, "SCC_BEFORE_DOT ");
9497 fprintf (stream, "SCC_AFTER_I ");
9502 fprintf (stream, ", ");
9504 if (rule->language != NULL)
9506 if (strlen (rule->language) != 2)
9508 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
9511 fprintf (stream, "{ '\\0', '\\0' }, ");
9513 fprintf (stream, "{ ");
9514 for (j = 0; j < 3; j++)
9517 fprintf (stream, ", ");
9518 if (!(rule->upper_mapping[j] < 0x10000))
9520 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
9523 if (rule->upper_mapping[j] != 0)
9524 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
9526 fprintf (stream, " 0");
9528 fprintf (stream, " }, { ");
9529 for (j = 0; j < 3; j++)
9532 fprintf (stream, ", ");
9533 if (!(rule->lower_mapping[j] < 0x10000))
9535 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
9538 if (rule->lower_mapping[j] != 0)
9539 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
9541 fprintf (stream, " 0");
9543 fprintf (stream, " }, { ");
9544 for (j = 0; j < 3; j++)
9547 fprintf (stream, ", ");
9548 if (!(rule->title_mapping[j] < 0x10000))
9550 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
9553 if (rule->title_mapping[j] != 0)
9554 fprintf (stream, "0x%04X", rule->title_mapping[j]);
9556 fprintf (stream, " 0");
9558 fprintf (stream, " }, { ");
9559 for (j = 0; j < 3; j++)
9562 fprintf (stream, ", ");
9563 if (!(rule->casefold_mapping[j] < 0x10000))
9565 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
9568 if (rule->casefold_mapping[j] != 0)
9569 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
9571 fprintf (stream, " 0");
9573 fprintf (stream, " }\n");
9576 if (ferror (stream) || fclose (stream))
9578 fprintf (stderr, "error writing to '%s'\n", filename);
9583 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
9597 /* Quoting the Unicode standard:
9598 Definition: A character is defined to be "case-ignorable" if it has the
9599 value MidLetter {or the value MidNumLet} for the Word_Break property or
9600 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
9601 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
9602 The text marked in braces was added in Unicode 5.1.0, see
9603 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
9604 Definition of case-ignorable". */
9605 /* Since this predicate is only used for the "Before C" and "After C"
9606 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
9607 This simplifies the evaluation of the regular expressions
9608 \p{cased} (\p{case-ignorable})* C
9610 C (\p{case-ignorable})* \p{cased}
9613 is_case_ignorable (unsigned int ch)
9615 return (unicode_org_wbp[ch] == WBP_MIDLETTER
9616 || unicode_org_wbp[ch] == WBP_MIDNUMLET
9617 || is_category_Mn (ch)
9618 || is_category_Me (ch)
9619 || is_category_Cf (ch)
9620 || is_category_Lm (ch)
9621 || is_category_Sk (ch))
9625 /* ------------------------------------------------------------------------- */
9627 /* Output all case related properties. */
9629 output_casing_properties (const char *version)
9631 #define PROPERTY(FN,P) \
9632 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
9633 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
9634 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
9635 PROPERTY(cased, cased)
9636 PROPERTY(ignorable, case_ignorable)
9640 /* ========================================================================= */
9643 main (int argc, char * argv[])
9645 const char *unicodedata_filename;
9646 const char *proplist_filename;
9647 const char *derivedproplist_filename;
9648 const char *arabicshaping_filename;
9649 const char *scripts_filename;
9650 const char *blocks_filename;
9651 const char *proplist30_filename;
9652 const char *eastasianwidth_filename;
9653 const char *linebreak_filename;
9654 const char *wordbreakproperty_filename;
9655 const char *graphemebreakproperty_filename;
9656 const char *compositionexclusions_filename;
9657 const char *specialcasing_filename;
9658 const char *casefolding_filename;
9659 const char *version;
9663 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
9668 unicodedata_filename = argv[1];
9669 proplist_filename = argv[2];
9670 derivedproplist_filename = argv[3];
9671 arabicshaping_filename = argv[4];
9672 scripts_filename = argv[5];
9673 blocks_filename = argv[6];
9674 proplist30_filename = argv[7];
9675 eastasianwidth_filename = argv[8];
9676 linebreak_filename = argv[9];
9677 wordbreakproperty_filename = argv[10];
9678 graphemebreakproperty_filename = argv[11];
9679 compositionexclusions_filename = argv[12];
9680 specialcasing_filename = argv[13];
9681 casefolding_filename = argv[14];
9684 fill_attributes (unicodedata_filename);
9685 clear_properties ();
9686 fill_properties (proplist_filename);
9687 fill_properties (derivedproplist_filename);
9688 fill_properties30 (proplist30_filename);
9689 fill_arabicshaping (arabicshaping_filename);
9690 fill_scripts (scripts_filename);
9691 fill_blocks (blocks_filename);
9692 fill_width (eastasianwidth_filename);
9693 fill_org_lbp (linebreak_filename);
9694 fill_org_wbp (wordbreakproperty_filename);
9695 fill_org_gbp (graphemebreakproperty_filename);
9696 fill_composition_exclusions (compositionexclusions_filename);
9697 fill_casing_rules (specialcasing_filename);
9698 fill_casefolding_rules (casefolding_filename);
9699 redistribute_casefolding_rules ();
9700 sort_casing_rules ();
9702 output_categories (version);
9703 output_category ("unictype/categ_of.h", version);
9704 output_combclass ("unictype/combiningclass.h", version);
9705 output_bidi_category ("unictype/bidi_of.h", version);
9706 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
9707 output_decimal_digit ("unictype/decdigit.h", version);
9708 output_digit_test ("../tests/unictype/test-digit.h", version);
9709 output_digit ("unictype/digit.h", version);
9710 output_numeric_test ("../tests/unictype/test-numeric.h", version);
9711 output_numeric ("unictype/numeric.h", version);
9712 output_mirror ("unictype/mirror.h", version);
9713 output_properties (version);
9714 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
9715 output_joining_type ("unictype/joiningtype_of.h", version);
9716 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
9717 output_joining_group ("unictype/joininggroup_of.h", version);
9719 output_scripts (version);
9720 output_scripts_byname (version);
9721 output_blocks (version);
9722 output_ident_properties (version);
9723 output_nonspacing_property ("uniwidth/width.c.part");
9724 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
9725 output_old_ctype (version);
9727 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
9728 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
9729 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
9731 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
9732 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
9733 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
9735 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
9736 output_gbp_table ("unigbrk/gbrkprop.h", version);
9738 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
9739 debug_output_composition_tables ("uninorm/composition.txt");
9740 output_composition_tables ("uninorm/composition-table.gperf", version);
9742 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
9743 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
9744 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
9745 output_simple_mapping ("unicase/toupper.h", to_upper, version);
9746 output_simple_mapping ("unicase/tolower.h", to_lower, version);
9747 output_simple_mapping ("unicase/totitle.h", to_title, version);
9748 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
9749 output_casing_rules ("unicase/special-casing-table.gperf", version);
9750 output_casing_properties (version);
9756 * For Emacs M-x compile
9758 * compile-command: "
9759 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
9761 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
9762 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
9763 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
9764 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \
9765 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
9766 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
9767 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
9768 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
9769 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
9770 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
9771 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
9772 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
9773 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
9774 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
9776 && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
9777 && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt