/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      VERSION
   NOTE(review): the argument(s) following CaseFolding.txt are not visible
   in this excerpt; VERSION stands for the Unicode version string that the
   output routines take -- confirm against the argument parsing code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ========================================================================= */

/* Reading UnicodeData.txt.  */
/* This structure represents one line in the UnicodeData.txt file.
   String members hold the corresponding field text; the three case
   mapping members hold code points parsed from hexadecimal.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   integers.  NONE is the all-ones unsigned int, which is larger than any
   index accepted by fill_attribute.  */
#define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the defining line is not visible in this excerpt; 120 is
   an assumed value, used only when FIELDLEN is not defined elsewhere --
   confirm.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   Returns 1 when a field was successfully read (BUFFER is then
   NUL-terminated), otherwise 0, i.e. when end of file is reached before
   DELIM.  Exits with status 1 if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators. Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* ========================================================================= */

/* General category.  */
/* See Unicode 3.0 book, section 4.5.  */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the range 0xD800 .. 0xDFFF.  This is decided
   by range alone, without consulting unicode_attributes[], because
   fill_attribute never stores entries of category "Cs".  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points in
   0 .. 0x10FFFF for which PREDICATE returns true: "0xFIRST..0xLAST" for a
   run of at least two code points, "0xCODE" for an isolated one.
   Exits with status 1 if FILENAME cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Printing one line per code point yields huge text output, therefore
     consecutive code points are collapsed into ranges.  */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;

        /* Advance ch to the last code point of the current run.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        if (first < ch)
          fprintf (stream, "0x%04X..0x%04X\n", first, ch);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment consisting of a fixed preamble,
   one "{ first, last }" initializer per maximal run of consecutive code
   points in 0 .. 0x10FFFF for which PREDICATE returns true, and a definition
   of the macro PREDICATE(c) as EXPRESSION.
   Exits with status 1 if FILENAME cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Fixed preamble: generated-file marker, license, first include.  */
  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
         "/* Test the Unicode character type functions.\n"
         " Copyright (C) 2007 Free Software Foundation, Inc.\n"
         "\n"
         " This program is free software: you can redistribute it and/or modify\n"
         " it under the terms of the GNU General Public License as published by\n"
         " the Free Software Foundation; either version 3 of the License, or\n"
         " (at your option) any later version.\n"
         "\n"
         " This program is distributed in the hope that it will be useful,\n"
         " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
         " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
         " GNU General Public License for more details.\n"
         "\n"
         " You should have received a copy of the GNU General Public License\n"
         " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"
         "\n"
         "#include \"test-predicate-part1.h\"\n"
         "\n",
         stream);

  /* One initializer per run; entries are separated by ",\n" and the last
     one is followed by a plain newline.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, ch);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
669 /* Construction of sparse 3-level tables. */
/* Parameters for the inclusion of "3levelbit.h" below: TABLE names the
   table type (struct predicate_table and the predicate_table_* functions
   are used by output_predicate), and the xmalloc / xrealloc names are
   mapped straight onto malloc / realloc.
   NOTE(review): with this mapping an allocation failure is not diagnosed
   here; whether "3levelbit.h" checks the results cannot be seen from this
   file -- confirm.  */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C fragment that defines a static const struct
   object called NAME holding a predicate_table built over the code points
   0 .. 0x10FFFF.  COMMENT and VERSION only appear in the leading comments
   of the generated file.
   NOTE(review): this excerpt lacks several short lines of the function
   (return type, braces, local declarations, some conditions and assignment
   targets).  The notes below mark the places where the visible text depends
   on such a line.  */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
/* Byte offsets of the three levels inside t.result.  */
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the guarding test is not visible here.  */
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
/* NOTE(review): the argument line of the next call is not visible;
   presumably it passes VERSION for the %s -- confirm.  */
693 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are used as shift counts further down, but
   where they are set is not visible in this excerpt.  */
698 predicate_table_init (&t);
/* NOTE(review): presumably the add below is guarded by predicate (ch); the
   guard line is not visible -- confirm.  */
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are: the size of 5 uint32_t words; that plus
   level1_size uint32_t entries; that plus (level2_size << q) uint32_t
   entries.  NOTE(review): the assignment targets are not visible;
   presumably level1_offset, level2_offset and level3_offset in this order.  */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the leading uint32_t words of t.result as header_<i> macros.
   NOTE(review): a condition between the for line and the fprintf may be
   missing from this excerpt.  */
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
/* Declaration of the generated object: one header word followed by the
   level1, level2 and level3 arrays.  */
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
/* The single header initializer is word 1 of t.result.  */
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
/* Level 1: one initializer per entry.  (i % 1) == 0 always holds, so every
   entry after the first starts on a new line.  */
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as an expression built from the
   index (offset - level2_offset) / sizeof (uint32_t).  NOTE(review): the
   test selecting between the two forms is not visible.  */
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
/* Level 2: same layout as level 1, with level2_size << q entries and
   indices taken relative to level3_offset.  */
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
/* Level 3: the uint32_t words starting at level3_offset, printed in
   hexadecimal, four per line.  */
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
/* A write error on the stream or a failing fclose is reported below.  */
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
799 output_categories (const char *version)
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Each two-letter category has a
   single bit; each one-letter category is the union of the bits of the
   two-letter categories that start with the same letter.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
887 general_category_byname (const char *category_name)
889 if (category_name[0] != '\0'
890 && (category_name[1] == '\0' || category_name[2] == '\0'))
891 switch (category_name[0])
894 switch (category_name[1])
896 case '\0': return UC_CATEGORY_MASK_L;
897 case 'u': return UC_CATEGORY_MASK_Lu;
898 case 'l': return UC_CATEGORY_MASK_Ll;
899 case 't': return UC_CATEGORY_MASK_Lt;
900 case 'm': return UC_CATEGORY_MASK_Lm;
901 case 'o': return UC_CATEGORY_MASK_Lo;
905 switch (category_name[1])
907 case '\0': return UC_CATEGORY_MASK_M;
908 case 'n': return UC_CATEGORY_MASK_Mn;
909 case 'c': return UC_CATEGORY_MASK_Mc;
910 case 'e': return UC_CATEGORY_MASK_Me;
914 switch (category_name[1])
916 case '\0': return UC_CATEGORY_MASK_N;
917 case 'd': return UC_CATEGORY_MASK_Nd;
918 case 'l': return UC_CATEGORY_MASK_Nl;
919 case 'o': return UC_CATEGORY_MASK_No;
923 switch (category_name[1])
925 case '\0': return UC_CATEGORY_MASK_P;
926 case 'c': return UC_CATEGORY_MASK_Pc;
927 case 'd': return UC_CATEGORY_MASK_Pd;
928 case 's': return UC_CATEGORY_MASK_Ps;
929 case 'e': return UC_CATEGORY_MASK_Pe;
930 case 'i': return UC_CATEGORY_MASK_Pi;
931 case 'f': return UC_CATEGORY_MASK_Pf;
932 case 'o': return UC_CATEGORY_MASK_Po;
936 switch (category_name[1])
938 case '\0': return UC_CATEGORY_MASK_S;
939 case 'm': return UC_CATEGORY_MASK_Sm;
940 case 'c': return UC_CATEGORY_MASK_Sc;
941 case 'k': return UC_CATEGORY_MASK_Sk;
942 case 'o': return UC_CATEGORY_MASK_So;
946 switch (category_name[1])
948 case '\0': return UC_CATEGORY_MASK_Z;
949 case 's': return UC_CATEGORY_MASK_Zs;
950 case 'l': return UC_CATEGORY_MASK_Zl;
951 case 'p': return UC_CATEGORY_MASK_Zp;
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_C;
958 case 'c': return UC_CATEGORY_MASK_Cc;
959 case 'f': return UC_CATEGORY_MASK_Cf;
960 case 's': return UC_CATEGORY_MASK_Cs;
961 case 'o': return UC_CATEGORY_MASK_Co;
962 case 'n': return UC_CATEGORY_MASK_Cn;
966 /* Invalid category name. */
970 /* Construction of sparse 3-level tables. */
/* Parameters for a table template: TABLE names the table type (struct
   category_table and the category_table_* functions are used by
   output_category), ELEMENT is the per-character element type, and DEFAULT
   is 29, the bit index of UC_CATEGORY_MASK_Cn (0x20000000).
   NOTE(review): the #include that consumes these macros is not visible in
   this excerpt; the exact role of DEFAULT is defined by that header --
   confirm.  */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
/* Writes to FILENAME a C fragment that defines the static const object
   u_category: a three-level table mapping each code point to the bit index
   (log2) of its UC_CATEGORY_MASK_* value, with the level-3 values packed
   5 bits apiece into 16-bit units.  VERSION only appears in a leading
   comment of the generated file.
   NOTE(review): this excerpt lacks several short lines of the function
   (return type, braces, local declarations, some conditions and assignment
   targets).  The notes below mark the places where the visible text depends
   on such a line.  */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
/* Byte offsets of the three levels inside t.result.  */
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the guarding test is not visible here.  */
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
/* NOTE(review): the argument line of the next call is not visible;
   presumably it passes VERSION for the %s -- confirm.  */
997 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are used as shift counts further down, but
   where they are set is not visible in this excerpt.  */
1002 category_table_init (&t);
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
/* Surrogates are recognized by range; other code points with an entry get
   their mask from the category string.  NOTE(review): what happens to code
   points without an entry is on a line not visible here.  */
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
/* value & (value - 1) is nonzero exactly when more than one bit is set.
   The action taken on failure is not visible in this excerpt.  */
1017 if (value == 0 || ((value & (value - 1)) != 0))
/* Count the shifts needed to reduce value to 1, i.e. the bit index.  */
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are: the size of 5 uint32_t words; that plus
   level1_size uint32_t entries; that plus (level2_size << q) uint32_t
   entries.  NOTE(review): the assignment targets are not visible;
   presumably level1_offset, level2_offset and level3_offset in this order.  */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five leading uint32_t words of t.result as macros.  */
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length, level3_size * ((1 << p) * 5 / 16) + 1, is the
   number of 16-bit units needed for 5 bits per entry, plus one.  */
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
/* Level 1: eight entries per line.  */
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as the index
   (offset - level2_offset) / sizeof (uint32_t).  NOTE(review): the test
   selecting between the two forms is not visible.  */
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
/* Level 2: same layout, with indices counted in uint8_t elements relative
   to level3_offset.  */
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the assignment target of the calloc below is not visible
   (presumably level3_packed), and no NULL check of its result is visible
   either -- confirm.  The zero fill matters because entries are OR-ed in.  */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1096 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies the 5 bits starting at bit k of unit j and may spill
   into unit j+1; units j and j+1 are therefore combined into one 32-bit
   value, updated, and split again.  */
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
/* Level 3: the packed 16-bit units in hexadecimal, eight per line.  */
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
/* A write error on the stream or a failing fclose is reported below.  */
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* Canonical combining class.  */
/* See Unicode 3.0 book, section 4.2.  */
1135 /* Construction of sparse 3-level tables. */
/* Parameters for a table template: TABLE names the table type (struct
   combclass_table and the combclass_table_* functions are used by
   output_combclass) and ELEMENT is the per-character element type.
   NOTE(review): neither a DEFAULT definition nor the #include that consumes
   these macros is visible in this excerpt -- confirm against the full
   file.  */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
/* Writes to FILENAME a C fragment that defines the static const object
   u_combclass: a three-level table mapping each code point that has an
   entry in unicode_attributes[] to its canonical combining class.  VERSION
   only appears in a leading comment of the generated file.
   NOTE(review): this excerpt lacks several short lines of the function
   (return type, braces, local declarations, some conditions and assignment
   targets).  The notes below mark the places where the visible text depends
   on such a line.  */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
/* Byte offsets of the three levels inside t.result.  */
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the guarding test is not visible here.  */
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
/* NOTE(review): the argument line of the next call is not visible;
   presumably it passes VERSION for the %s -- confirm.  */
1161 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are used as shift counts further down, but
   where they are set is not visible in this excerpt.  */
1166 combclass_table_init (&t);
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
/* The combining class field is decimal text; atoi yields 0 for an empty
   field.  The action taken when the value is outside 0 .. 255 is not
   visible in this excerpt.  */
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are: the size of 5 uint32_t words; that plus
   level1_size uint32_t entries; that plus (level2_size << q) uint32_t
   entries.  NOTE(review): the assignment targets are not visible;
   presumably level1_offset, level2_offset and level3_offset in this order.  */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five leading uint32_t words of t.result as macros.  */
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
/* Level 1: eight entries per line.  */
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* An entry is printed either as -1 or as the index
   (offset - level2_offset) / sizeof (uint32_t).  NOTE(review): the test
   selecting between the two forms is not visible.  */
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
/* Level 2: same layout, with indices counted in uint8_t elements relative
   to level3_offset.  */
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
/* Level 3: the uint8_t values starting at level3_offset in decimal, eight
   per line.  */
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
/* A write error on the stream or a failing fclose is reported below.  */
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
1267 /* Bidirectional category. */
/* Numeric codes for the bidirectional categories.  UC_BIDI_L is the DEFAULT
   element of the bidi category table, and output_bidi_category stores each
   value in 5 bits, so this list must stay below 32 entries.  */
1268 /* See Unicode 3.0 book, section 4.3,
1273 UC_BIDI_L, /* Left-to-Right */
1274 UC_BIDI_LRE, /* Left-to-Right Embedding */
1275 UC_BIDI_LRO, /* Left-to-Right Override */
1276 UC_BIDI_R, /* Right-to-Left */
1277 UC_BIDI_AL, /* Right-to-Left Arabic */
1278 UC_BIDI_RLE, /* Right-to-Left Embedding */
1279 UC_BIDI_RLO, /* Right-to-Left Override */
1280 UC_BIDI_PDF, /* Pop Directional Format */
1281 UC_BIDI_EN, /* European Number */
1282 UC_BIDI_ES, /* European Number Separator */
1283 UC_BIDI_ET, /* European Number Terminator */
1284 UC_BIDI_AN, /* Arabic Number */
1285 UC_BIDI_CS, /* Common Number Separator */
1286 UC_BIDI_NSM, /* Non-Spacing Mark */
1287 UC_BIDI_BN, /* Boundary Neutral */
1288 UC_BIDI_B, /* Paragraph Separator */
1289 UC_BIDI_S, /* Segment Separator */
1290 UC_BIDI_WS, /* Whitespace */
1291 UC_BIDI_ON /* Other Neutral */
/* Decodes the bidi category abbreviation CATEGORY_NAME.
   The string is examined one character at a time through nested switches,
   and a candidate is accepted only when the string ends right after the
   matched characters (the comparisons against the NUL terminator).
   get_bidi_category returns the result of this function as the category of
   a character.
   NOTE(review): presumably each accepted abbreviation yields the matching
   UC_BIDI_* value - confirm against the case labels.  */
1295 bidi_category_byname (const char *category_name)
1297 switch (category_name[0])
1300 switch (category_name[1])
1303 if (category_name[2] == '\0')
1307 if (category_name[2] == '\0')
1313 switch (category_name[1])
1318 if (category_name[2] == '\0')
1324 switch (category_name[1])
1327 if (category_name[2] == '\0')
1333 switch (category_name[1])
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1344 if (category_name[2] == '\0')
1350 switch (category_name[1])
/* Three-letter abbreviations need one more switch level, and the
   terminator is then expected at index 3.  */
1355 switch (category_name[2])
1358 if (category_name[3] == '\0')
1362 if (category_name[3] == '\0')
1370 switch (category_name[1])
1373 switch (category_name[2])
1376 if (category_name[3] == '\0')
1384 switch (category_name[1])
1387 if (category_name[2] == '\0')
1393 switch (category_name[1])
1396 switch (category_name[2])
1399 if (category_name[3] == '\0')
1407 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1419 if (category_name[3] == '\0')
/* A one-letter abbreviation: the terminator is expected at index 1.  */
1427 if (category_name[1] == '\0')
1431 switch (category_name[1])
1434 if (category_name[2] == '\0')
/* Reached when no abbreviation matched.  */
1440 /* Invalid bidi category name. */
/* Returns the bidi category of the character CH.
   A character that has a name in unicode_attributes gets the category named
   by its bidi field, decoded by bidi_category_byname.  A character without
   a name is classified by the code point ranges tested below.
   NOTE(review): the value chosen for each group of ranges should be checked
   against DerivedBidiClass.txt whenever the Unicode data is updated.  */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* The two tests on (ch & 0xFFFF) below match the last two code points of
   every plane, whatever the plane number is.  */
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1474 /* Construction of sparse 3-level tables. */
/* Parameters of the table type used by output_bidi_category: the type is
   struct bidi_category_table, handled through bidi_category_table_init,
   bidi_category_table_add and bidi_category_table_finalize.  Elements are
   uint8_t and default to UC_BIDI_L.  xmalloc and xrealloc are mapped to
   plain malloc and realloc.
   NOTE(review): the template that consumes these macros is presumably
   included right after them - confirm.  */
1475 #define TABLE bidi_category_table
1476 #define ELEMENT uint8_t
1477 #define DEFAULT UC_BIDI_L
1478 #define xmalloc malloc
1479 #define xrealloc realloc
1482 /* Output the per-character bidi category table. */
/* Writes to FILENAME a generated C fragment that defines u_bidi_category:
   five bidi_category_header_N macros, the level1 and level2 index arrays,
   and a level3 array in which the 5-bit category values are packed into
   16-bit units.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
/* Record the category of every code point from 0 to 0x10FFFF.  */
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
1517 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 area, the level2 area and the level3 area.  The three sums below
   are the byte offsets of these areas.  */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
/* level1: eight entries per output line.  A stored entry is a byte offset
   into t.result; it is printed as an element index into level2.
   NOTE(review): the -1 is presumably printed for an unused (zero) offset -
   confirm.  */
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
/* level2: same layout; entries are printed as indices of unpacked
   one-byte level3 elements.  */
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-initialized by calloc and has one unit more than
   the packed bits need; the loop below relies on it when it touches
   unit j+1.  */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1586 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies the 5 bits starting at bit 5*i of the packed stream:
   j is the 16-bit unit holding its first bit, k the bit position inside
   that unit.  An entry may straddle two units, so units j and j+1 are
   combined into one 32-bit word, updated, and split again.  */
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
/* Print the packed units as hexadecimal constants, eight per line.  */
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the character CH, parsed with atoi
   from the decdigit field of unicode_attributes, when CH has a name and
   that field is not empty.
   NOTE(review): the result for all other characters is presumably -1, since
   the callers accept values from -1 to 9 - confirm.  */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
1633 /* Construction of sparse 3-level tables. */
/* Parameters of the table type struct decdigit_table with uint8_t
   elements, used by both output_decimal_digit and output_digit through
   decdigit_table_init, decdigit_table_add and decdigit_table_finalize.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the template that consumes these macros is presumably
   included right after them - confirm.  */
1634 #define TABLE decdigit_table
1635 #define ELEMENT uint8_t
1637 #define xmalloc malloc
1638 #define xrealloc realloc
1641 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a list of entries of the form { code point, value },
   separated by a comma and a newline, built from get_decdigit_value for the
   code points 0 to 0x10FFFF.
   NOTE(review): presumably only characters that have a digit value are
   listed, and VERSION is the Unicode version printed in the header comment -
   confirm.  */
1643 output_decimal_digit_test (const char *filename, const char *version)
1649 stream = fopen (filename, "w");
1652 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1656 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1657 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1658 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1662 for (ch = 0; ch < 0x110000; ch++)
1664 int value = get_decdigit_value (ch);
/* Sanity check: anything outside -1 .. 9 is not a valid result.  */
1666 if (!(value >= -1 && value < 10))
1672 fprintf (stream, ",\n");
1673 fprintf (stream, " { 0x%04X, %d }", ch, value);
1678 fprintf (stream, "\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
1680 if (ferror (stream) || fclose (stream))
1682 fprintf (stderr, "error writing to '%s'\n", filename);
1687 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME a generated C fragment that defines u_decdigit: five
   decdigit_header_N macros, the level1 and level2 index arrays, and a
   level3 array holding two 4-bit entries per byte.  The stored entry is
   1 + get_decdigit_value (ch), so that it is never negative.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
1712 for (ch = 0; ch < 0x110000; ch++)
/* Shift the value by one so that the table holds 0 .. 10, which fits in
   the 4 bits used per entry below.  */
1714 int value = 1 + get_decdigit_value (ch);
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
1724 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 area, the level2 area and the level3 area.  The three sums below
   are the byte offsets of these areas.  */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
/* level1: eight entries per output line.  A stored entry is a byte offset
   into t.result; it is printed as an element index into level2.
   NOTE(review): the -1 is presumably printed for an unused (zero) offset -
   confirm.  */
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
/* level2: same layout; entries are printed as indices of unpacked
   one-byte level3 elements.  */
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i goes into the
   low nibble and entry 2*i+1 into the high nibble.  The number of output
   bytes is therefore half the number of entries, hence the shift by
   t.p - 1 instead of t.p.  */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the character CH, parsed with atoi from the
   digit field of unicode_attributes, when CH has a name and that field is
   not empty.
   NOTE(review): the result for all other characters is presumably -1, since
   the callers accept values from -1 to 9 - confirm.  */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
1828 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a list of entries of the form { code point, value },
   separated by a comma and a newline, built from get_digit_value for the
   code points 0 to 0x10FFFF.
   NOTE(review): presumably only characters that have a digit value are
   listed, and VERSION is the Unicode version printed in the header comment -
   confirm.  */
1830 output_digit_test (const char *filename, const char *version)
1836 stream = fopen (filename, "w");
1839 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1843 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1844 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1845 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1849 for (ch = 0; ch < 0x110000; ch++)
1851 int value = get_digit_value (ch);
/* Sanity check: anything outside -1 .. 9 is not a valid result.  */
1853 if (!(value >= -1 && value < 10))
1859 fprintf (stream, ",\n");
1860 fprintf (stream, " { 0x%04X, %d }", ch, value);
1865 fprintf (stream, "\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
1867 if (ferror (stream) || fclose (stream))
1869 fprintf (stderr, "error writing to '%s'\n", filename);
1874 /* Output the per-character digit value table. */
/* Writes to FILENAME a generated C fragment that defines u_digit: five
   digit_header_N macros, the level1 and level2 index arrays, and a level3
   array holding two 4-bit entries per byte.  The stored entry is
   1 + get_digit_value (ch), so that it is never negative.  The table type
   struct decdigit_table is shared with output_decimal_digit.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
1899 for (ch = 0; ch < 0x110000; ch++)
/* Shift the value by one so that the table holds 0 .. 10, which fits in
   the 4 bits used per entry below.  */
1901 int value = 1 + get_digit_value (ch);
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
1911 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 area, the level2 area and the level3 area.  The three sums below
   are the byte offsets of these areas.  */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
/* level1: eight entries per output line.  A stored entry is a byte offset
   into t.result; it is printed as an element index into level2.
   NOTE(review): the -1 is presumably printed for an unused (zero) offset -
   confirm.  */
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
/* level2: same layout; entries are printed as indices of unpacked
   one-byte level3 elements.  */
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i goes into the
   low nibble and entry 2*i+1 into the high nibble.  The number of output
   bytes is therefore half the number of entries, hence the shift by
   t.p - 1 instead of t.p.  */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   sets both members to 0 in one of its branches, and output_numeric_test
   tests for a value whose members are not both 0.
   NOTE(review): 0/0 presumably means that the character has no numeric
   value - confirm.  */
2006 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the character CH as a fraction, parsed from
   the numeric field of unicode_attributes when CH has a name and that field
   is not empty.
   NOTE(review): for all other characters the result is presumably the
   0/0 value assigned at the end of the function - confirm.  */
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first character that is not part of the number, so it
   yields the numerator even when a slash follows.  */
2018 value.numerator = atoi (str);
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
/* Without a slash the value is an integer, that is, a denominator of 1.  */
2022 value.denominator = 1;
2026 value.numerator = 0;
2027 value.denominator = 0;
2032 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a list of entries of the form
   { code point, numerator, denominator }, separated by a comma and a
   newline, built from get_numeric_value for the code points 0 to 0x10FFFF.
   Only values whose numerator and denominator are not both 0 are considered.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
2057 if (value.numerator != 0 || value.denominator != 0)
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
2076 /* Construction of sparse 3-level tables. */
/* Parameters of the table type struct numeric_table with uint8_t elements,
   used by output_numeric through numeric_table_init, numeric_table_add and
   numeric_table_finalize.  The elements stored there are indices into the
   list of occurring fractions.  xmalloc and xrealloc are mapped to plain
   malloc and realloc.
   NOTE(review): the template that consumes these macros is presumably
   included right after them - confirm.  */
2077 #define TABLE numeric_table
2078 #define ELEMENT uint8_t
2080 #define xmalloc malloc
2081 #define xrealloc realloc
2084 /* Output the per-character numeric value table. */
/* Writes to FILENAME a generated C fragment with two parts:
   - u_numeric_values, the sorted list of all distinct fractions returned by
     get_numeric_value;
   - u_numeric, a 3-level table that maps each code point to the index of
     its fraction in that list, with the level3 indices packed 7 bits each
     into 16-bit units.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
2086 output_numeric (const char *filename, const char *version)
/* 128 slots: an index into this array must fit in the 7 bits used per
   level3 entry.  */
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for VALUE among the fractions collected so far.  */
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
2118 if (i == nfractions)
/* VALUE is new.  The array is kept sorted by denominator, then by
   numerator: find the first element that compares greater, shift the tail
   up by one slot and store VALUE there.  */
2120 if (nfractions == 128)
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
/* Second pass: store for each code point the index of its value in the
   now complete fractions array.  */
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
2167 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 area, the level2 area and the level3 area.  The three sums below
   are the byte offsets of these areas.  */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
/* level1: eight entries per output line.  A stored entry is a byte offset
   into t.result; it is printed as an element index into level2.
   NOTE(review): the -1 is presumably printed for an unused (zero) offset -
   confirm.  */
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
/* level2: same layout; entries are printed as indices of unpacked
   one-byte level3 elements.  */
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-initialized by calloc and has one unit more than
   the packed bits need; the loop below relies on it when it touches
   unit j+1.  */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2236 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies the 7 bits starting at bit 7*i of the packed stream:
   j is the 16-bit unit holding its first bit, k the bit position inside
   that unit.  An entry may straddle two units, so units j and j+1 are
   combined into one 32-bit word, updated, and split again.  */
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
/* Print the packed units as hexadecimal constants, eight per line.  */
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
2272 /* See Unicode 3.0 book, section 4.7,
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
/* Each row holds the two members of one pair.  get_mirror_value compares a
   character against both columns, so a pair needs to be listed only once,
   in either order.  */
2277 static unsigned int mirror_pairs[][2] =
/* Computes the mirror table entry of the character CH.
   The partner of CH is looked up in mirror_pairs, in either column.  For a
   character with a partner the visible result is the signed difference
   mirror_char - ch, that is, the offset to add to CH to obtain its mirror
   image.
   NOTE(review): the conditions under which this difference is returned, and
   the result for the other characters, should be confirmed against the
   complete function body.  */
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
/* A character counts as mirrored only if it has a name and its mirrored
   attribute is set.  */
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
/* 0xfffd serves as the marker for: no partner found in mirror_pairs.  */
2342 mirror_char = 0xfffd;
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
2355 return (int) mirror_char - (int) ch;
/* True when a partner was found in mirror_pairs.  */
2358 if (mirror_char != 0xfffd)
2364 /* Construction of sparse 3-level tables. */
/* Parameters of the table type struct mirror_table, used by output_mirror
   through mirror_table_init, mirror_table_add and mirror_table_finalize.
   Elements are int32_t because they hold the signed values returned by
   get_mirror_value.  xmalloc and xrealloc are mapped to plain malloc and
   realloc.
   NOTE(review): the template that consumes these macros is presumably
   included right after them - confirm.  */
2365 #define TABLE mirror_table
2366 #define ELEMENT int32_t
2368 #define xmalloc malloc
2369 #define xrealloc realloc
2372 /* Output the per-character mirror table. */
/* Writes to FILENAME a generated C fragment that defines u_mirror: five
   mirror_header_N macros, the level1 and level2 index arrays, and a level3
   array of int holding the values of get_mirror_value, not packed.
   NOTE(review): VERSION is presumably the Unicode version string printed in
   the generated header comment - confirm.  */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
/* Record the mirror value of every code point from 0 to 0x10FFFF.  */
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
2406 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 area, the level2 area and the level3 area.  The three sums below
   are the byte offsets of these areas.  */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
/* level1: eight entries per output line.  A stored entry is a byte offset
   into t.result; it is printed as an element index into level2.
   NOTE(review): the -1 is presumably printed for an unused (zero) offset -
   confirm.  */
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
/* level2: same layout; the divisor is the size of a level3 element, which
   is int32_t for this table.  */
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
/* level3: the signed values themselves, eight per output line.  */
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
/* ferror reports a failure of any earlier write; fclose flushes the
   remaining output and can fail as well.  */
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2494 /* Particular values of the word break property. */
/* Tests whether CH is one of the eight code points listed below.
   NOTE(review): presumably these are the characters whose word break
   property is MidNumLet; the list is hard-coded and should be checked
   against WordBreakProperty.txt when the Unicode data is updated.  */
2497 is_WBP_MIDNUMLET (unsigned int ch)
2499 return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
2500 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
/* Tests whether CH is one of the eight code points listed below.
   NOTE(review): presumably these are the characters whose word break
   property is MidLetter; the list is hard-coded and should be checked
   against WordBreakProperty.txt when the Unicode data is updated.  */
2504 is_WBP_MIDLETTER (unsigned int ch)
2506 return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
2507 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
2510 /* ========================================================================= */
2514 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties uses an identifier as a bit
   number (1ULL << propvalue) in an unsigned long long element of
   unicode_properties, so the identifiers must all stay below the bit width
   of that type.  */
2523 PROP_QUOTATION_MARK,
2524 PROP_TERMINAL_PUNCTUATION,
2527 PROP_ASCII_HEX_DIGIT,
2528 PROP_OTHER_ALPHABETIC,
2532 PROP_OTHER_LOWERCASE,
2533 PROP_OTHER_UPPERCASE,
2534 PROP_NONCHARACTER_CODE_POINT,
2535 PROP_OTHER_GRAPHEME_EXTEND,
2536 PROP_IDS_BINARY_OPERATOR,
2537 PROP_IDS_TRINARY_OPERATOR,
2539 PROP_UNIFIED_IDEOGRAPH,
2540 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2543 PROP_LOGICAL_ORDER_EXCEPTION,
2544 PROP_OTHER_ID_START,
2545 PROP_OTHER_ID_CONTINUE,
2547 PROP_VARIATION_SELECTOR,
2548 PROP_PATTERN_WHITE_SPACE,
2549 PROP_PATTERN_SYNTAX,
2550 /* DerivedCoreProperties.txt */
2556 PROP_CASE_IGNORABLE,
2557 PROP_CHANGES_WHEN_LOWERCASED,
2558 PROP_CHANGES_WHEN_UPPERCASED,
2559 PROP_CHANGES_WHEN_TITLECASED,
2560 PROP_CHANGES_WHEN_CASEFOLDED,
2561 PROP_CHANGES_WHEN_CASEMAPPED,
2566 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2567 PROP_GRAPHEME_EXTEND,
/* One bit set per code point, indexed by code point (0 to 0x10FFFF).
   Bit number N is set by fill_properties when the property with identifier
   N applies to the code point.  */
2571 unsigned long long unicode_properties[0x110000];
/* Resets unicode_properties so that no property is set for any code
   point.  */
2574 clear_properties (void)
2578 for (i = 0; i < 0x110000; i++)
2579 unicode_properties[i] = 0;
2582 /* Stores in unicode_properties[] the properties from the
2583 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is read line by line.  A data line has one of the forms
     XXXX..YYYY ; Property_Name
     XXXX ; Property_Name
   with hexadecimal code points.  For every code point of the range, the bit
   of the named property is set in unicode_properties; bits that are already
   set are kept, so the function can be called for several files in turn.
   Problems with the file are reported on stderr.  */
2585 fill_properties (const char *proplist_filename)
2590 stream = fopen (proplist_filename, "r");
2593 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2600 unsigned int i1, i2;
/* The two unbounded scanset conversions below write into these buffers.
   Their input comes from buf, which receives at most 200 characters, so
   200+1 bytes are enough.  */
2601 char padding[200+1];
2602 char propname[200+1];
2603 unsigned int propvalue;
/* Read one line, at most 200 characters; the trailing whitespace directive
   also consumes the newline and any following blank lines.  */
2605 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and lines that start with a hash sign carry no data.  */
2608 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form, then the single code point form.  */
2611 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2613 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2615 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP use expands to an if statement that ends in a dangling else, so
   the uses below form one long chain; the statement that follows the last
   PROP use is its final else branch and handles unknown names.  */
2620 #define PROP(name,value) \
2621 if (strcmp (propname, name) == 0) propvalue = value; else
2623 PROP ("White_Space", PROP_WHITE_SPACE)
2624 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2625 PROP ("Join_Control", PROP_JOIN_CONTROL)
2626 PROP ("Dash", PROP_DASH)
2627 PROP ("Hyphen", PROP_HYPHEN)
2628 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2629 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2630 PROP ("Other_Math", PROP_OTHER_MATH)
2631 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2632 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2633 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2634 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2635 PROP ("Diacritic", PROP_DIACRITIC)
2636 PROP ("Extender", PROP_EXTENDER)
2637 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2638 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2639 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2640 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2641 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2642 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2643 PROP ("Radical", PROP_RADICAL)
2644 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2645 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2646 PROP ("Deprecated", PROP_DEPRECATED)
2647 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2648 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2649 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2650 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2651 PROP ("STerm", PROP_STERM)
2652 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2653 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2654 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2655 /* DerivedCoreProperties.txt */
2656 PROP ("Math", PROP_MATH)
2657 PROP ("Alphabetic", PROP_ALPHABETIC)
2658 PROP ("Lowercase", PROP_LOWERCASE)
2659 PROP ("Uppercase", PROP_UPPERCASE)
2660 PROP ("Cased", PROP_CASED)
2661 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2662 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2663 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2664 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2665 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2666 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2667 PROP ("ID_Start", PROP_ID_START)
2668 PROP ("ID_Continue", PROP_ID_CONTINUE)
2669 PROP ("XID_Start", PROP_XID_START)
2670 PROP ("XID_Continue", PROP_XID_CONTINUE)
2671 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2672 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2673 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2674 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2677 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be well ordered and inside the Unicode code space, since
   it is used to index unicode_properties.  */
2681 if (!(i1 <= i2 && i2 < 0x110000))
2684 for (i = i1; i <= i2; i++)
2685 unicode_properties[i] |= 1ULL << propvalue;
/* ferror reports a failure of any earlier read; fclose can fail too.  */
2688 if (ferror (stream) || fclose (stream))
2690 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* Stores in ARRAY the given property from the Unicode 3.0 PropList.txt
   file.  Every element of ARRAY is first cleared; then ARRAY[c] is set to 1
   for each code point c listed after the first line that contains
   PROPERTY_NAME.  Exits with status 1 if the file cannot be opened, read or
   parsed; aborts if a listed range is reversed or exceeds 0x10FFFF.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  for (i = 0; i < 0x110000; i++)
    array[i] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.
     The trailing "\n" in the format also swallows following blank lines.  */
  do
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
    }
  while (strstr (buf, property_name) == NULL);

  /* Read the code points and code point ranges of this section.  */
  for (;;)
    {
      unsigned int i1, i2;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      /* NOTE(review): the end-of-section test was lost from this listing;
         a line of asterisks is assumed to separate sections - confirm
         against the PropList-3.0.1.txt layout.  */
      if (buf[0] == '*')
        break;
      if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          /* A range "XXXX..YYYY".  */
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
        }
      else if (strlen (buf) >= 4)
        {
          /* A single code point "XXXX".  */
          if (sscanf (buf, "%4X", &i1) < 1)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
          i2 = i1;
        }
      else
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (!(i1 <= i2 && i2 < 0x110000))
        abort ();
      for (i = i1; i <= i2; i++)
        array[i] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fills unicode_pairedpunctuation and unicode_leftofpair from the sections
   "(Paired Punctuation)" and "(Left of Pair)" of PROPLIST30_FILENAME.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
  fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
}
2785 /* ------------------------------------------------------------------------- */
2787 /* See PropList.txt, UCD.html. */
2789 is_property_white_space (unsigned int ch)
2791 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2794 /* See Unicode 3.0 book, section 4.10,
2795 PropList.txt, UCD.html,
2796 DerivedCoreProperties.txt, UCD.html. */
2798 is_property_alphabetic (unsigned int ch)
2802 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2803 /* For some reason, the following are listed as having property
2804 Alphabetic but not as having property Other_Alphabetic. */
2805 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2806 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2807 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2808 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2809 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2810 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2811 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2812 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2813 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2814 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2815 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2816 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2817 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2819 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_alphabetic (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_not_a_character (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2840 /* See PropList.txt, UCD.html,
2841 DerivedCoreProperties.txt, UCD.html. */
2843 is_property_default_ignorable_code_point (unsigned int ch)
2846 (is_category_Cf (ch)
2847 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2848 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
2849 /* For some reason, the following are not listed as having property
2850 Default_Ignorable_Code_Point. */
2851 && !(ch == 0x110BD))
2852 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2853 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2855 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2857 if (result1 != result2)
2862 /* See PropList.txt, UCD.html. */
2864 is_property_other_default_ignorable_code_point (unsigned int ch)
2866 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2869 /* See PropList.txt, UCD.html. */
2871 is_property_deprecated (unsigned int ch)
2873 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2876 /* See PropList.txt, UCD.html. */
2878 is_property_logical_order_exception (unsigned int ch)
2880 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2883 /* See PropList.txt, UCD.html. */
2885 is_property_variation_selector (unsigned int ch)
2887 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Tells whether CH lies in one of the three private use ranges
   U+E000..U+F8FF, U+F0000..U+FFFFD, U+100000..U+10FFFD.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Tells whether CH is in general category Cn without being a noncharacter
   (as reported by is_property_not_a_character).
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  return is_category_Cn (ch) && !is_property_not_a_character (ch);
}
2907 /* See PropList.txt, UCD.html,
2908 DerivedCoreProperties.txt, UCD.html. */
2910 is_property_uppercase (unsigned int ch)
2914 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2916 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2918 if (result1 != result2)
2923 /* See PropList.txt, UCD.html. */
2925 is_property_other_uppercase (unsigned int ch)
2927 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2930 /* See PropList.txt, UCD.html,
2931 DerivedCoreProperties.txt, UCD.html. */
2933 is_property_lowercase (unsigned int ch)
2937 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2939 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2941 if (result1 != result2)
2946 /* See PropList.txt, UCD.html. */
2948 is_property_other_lowercase (unsigned int ch)
2950 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Tells whether CH is a titlecase letter; delegates to is_category_Lt.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  return is_category_Lt (ch);
}
2960 /* See DerivedCoreProperties.txt. */
2962 is_property_cased (unsigned int ch)
2964 bool result1 = (is_property_lowercase (ch)
2965 || is_property_uppercase (ch)
2966 || is_category_Lt (ch));
2967 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
2969 if (result1 != result2)
2974 /* See DerivedCoreProperties.txt. */
2976 is_property_case_ignorable (unsigned int ch)
2978 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
2979 || is_category_Mn (ch)
2980 || is_category_Me (ch)
2981 || is_category_Cf (ch)
2982 || is_category_Lm (ch)
2983 || is_category_Sk (ch));
2984 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
2986 if (result1 != result2)
2991 /* See DerivedCoreProperties.txt. */
2993 is_property_changes_when_lowercased (unsigned int ch)
2995 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
2996 bool result2 = (unicode_attributes[ch].name != NULL
2997 && unicode_attributes[ch].lower != NONE
2998 && unicode_attributes[ch].lower != ch);
3000 if (result1 != result2)
3005 /* See DerivedCoreProperties.txt. */
3007 is_property_changes_when_uppercased (unsigned int ch)
3009 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3012 /* See DerivedCoreProperties.txt. */
3014 is_property_changes_when_titlecased (unsigned int ch)
3016 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3019 /* See DerivedCoreProperties.txt. */
3021 is_property_changes_when_casefolded (unsigned int ch)
3023 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3026 /* See DerivedCoreProperties.txt. */
3028 is_property_changes_when_casemapped (unsigned int ch)
3030 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3033 /* See PropList.txt, UCD.html. */
3035 is_property_soft_dotted (unsigned int ch)
3037 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3040 /* See DerivedCoreProperties.txt, UCD.html. */
3042 is_property_id_start (unsigned int ch)
3044 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3047 /* See PropList.txt, UCD.html. */
3049 is_property_other_id_start (unsigned int ch)
3051 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3054 /* See DerivedCoreProperties.txt, UCD.html. */
3056 is_property_id_continue (unsigned int ch)
3058 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3061 /* See PropList.txt, UCD.html. */
3063 is_property_other_id_continue (unsigned int ch)
3065 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3068 /* See DerivedCoreProperties.txt, UCD.html. */
3070 is_property_xid_start (unsigned int ch)
3072 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3075 /* See DerivedCoreProperties.txt, UCD.html. */
3077 is_property_xid_continue (unsigned int ch)
3079 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3082 /* See PropList.txt, UCD.html. */
3084 is_property_pattern_white_space (unsigned int ch)
3086 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3089 /* See PropList.txt, UCD.html. */
3091 is_property_pattern_syntax (unsigned int ch)
3093 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3096 /* See PropList.txt, UCD.html. */
3098 is_property_join_control (unsigned int ch)
3100 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3103 /* See DerivedCoreProperties.txt, UCD.html. */
3105 is_property_grapheme_base (unsigned int ch)
3107 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3110 /* See DerivedCoreProperties.txt, UCD.html. */
3112 is_property_grapheme_extend (unsigned int ch)
3114 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3117 /* See PropList.txt, UCD.html. */
3119 is_property_other_grapheme_extend (unsigned int ch)
3121 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3124 /* See DerivedCoreProperties.txt, UCD.html. */
3126 is_property_grapheme_link (unsigned int ch)
3128 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3131 /* See PropList.txt, UCD.html. */
3133 is_property_bidi_control (unsigned int ch)
3135 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3138 /* See PropList-3.0.1.txt. */
3140 is_property_bidi_left_to_right (unsigned int ch)
3142 return (get_bidi_category (ch) == UC_BIDI_L);
3145 /* See PropList-3.0.1.txt. */
3147 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3149 return (get_bidi_category (ch) == UC_BIDI_R);
3152 /* See PropList-3.0.1.txt. */
3154 is_property_bidi_arabic_right_to_left (unsigned int ch)
3156 return (get_bidi_category (ch) == UC_BIDI_AL);
3159 /* See PropList-3.0.1.txt. */
3161 is_property_bidi_european_digit (unsigned int ch)
3163 return (get_bidi_category (ch) == UC_BIDI_EN);
3166 /* See PropList-3.0.1.txt. */
3168 is_property_bidi_eur_num_separator (unsigned int ch)
3170 return (get_bidi_category (ch) == UC_BIDI_ES);
3173 /* See PropList-3.0.1.txt. */
3175 is_property_bidi_eur_num_terminator (unsigned int ch)
3177 return (get_bidi_category (ch) == UC_BIDI_ET);
3180 /* See PropList-3.0.1.txt. */
3182 is_property_bidi_arabic_digit (unsigned int ch)
3184 return (get_bidi_category (ch) == UC_BIDI_AN);
3187 /* See PropList-3.0.1.txt. */
3189 is_property_bidi_common_separator (unsigned int ch)
3191 return (get_bidi_category (ch) == UC_BIDI_CS);
3194 /* See PropList-3.0.1.txt. */
3196 is_property_bidi_block_separator (unsigned int ch)
3198 return (get_bidi_category (ch) == UC_BIDI_B);
3201 /* See PropList-3.0.1.txt. */
3203 is_property_bidi_segment_separator (unsigned int ch)
3205 return (get_bidi_category (ch) == UC_BIDI_S);
3208 /* See PropList-3.0.1.txt. */
3210 is_property_bidi_whitespace (unsigned int ch)
3212 return (get_bidi_category (ch) == UC_BIDI_WS);
3215 /* See PropList-3.0.1.txt. */
3217 is_property_bidi_non_spacing_mark (unsigned int ch)
3219 return (get_bidi_category (ch) == UC_BIDI_NSM);
3222 /* See PropList-3.0.1.txt. */
3224 is_property_bidi_boundary_neutral (unsigned int ch)
3226 return (get_bidi_category (ch) == UC_BIDI_BN);
3229 /* See PropList-3.0.1.txt. */
3231 is_property_bidi_pdf (unsigned int ch)
3233 return (get_bidi_category (ch) == UC_BIDI_PDF);
3236 /* See PropList-3.0.1.txt. */
3238 is_property_bidi_embedding_or_override (unsigned int ch)
3240 int category = get_bidi_category (ch);
3241 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3242 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3245 /* See PropList-3.0.1.txt. */
3247 is_property_bidi_other_neutral (unsigned int ch)
3249 return (get_bidi_category (ch) == UC_BIDI_ON);
3252 /* See PropList.txt, UCD.html. */
3254 is_property_hex_digit (unsigned int ch)
3256 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3259 /* See PropList.txt, UCD.html. */
3261 is_property_ascii_hex_digit (unsigned int ch)
3263 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3266 /* See Unicode 3.0 book, section 4.10,
3267 PropList.txt, UCD.html. */
3269 is_property_ideographic (unsigned int ch)
3271 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3274 /* See PropList.txt, UCD.html. */
3276 is_property_unified_ideograph (unsigned int ch)
3278 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3281 /* See PropList.txt, UCD.html. */
3283 is_property_radical (unsigned int ch)
3285 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3288 /* See PropList.txt, UCD.html. */
3290 is_property_ids_binary_operator (unsigned int ch)
3292 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3295 /* See PropList.txt, UCD.html. */
3297 is_property_ids_trinary_operator (unsigned int ch)
3299 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3302 /* See PropList-3.0.1.txt. */
3304 is_property_zero_width (unsigned int ch)
3306 return is_category_Cf (ch)
3307 || (unicode_attributes[ch].name != NULL
3308 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Tells whether CH is a space separator; delegates to is_category_Zs.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  return is_category_Zs (ch);
}
/* Tells whether CH is one of a fixed list of non-breaking characters.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* NOTE(review): the original comment, truncated in this listing, relates
     this set to a line breaking property - confirm which one.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3342 /* See PropList-3.0.1.txt. */
3344 is_property_iso_control (unsigned int ch)
3347 (unicode_attributes[ch].name != NULL
3348 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3350 is_category_Cc (ch);
3352 if (result1 != result2)
3357 /* See PropList-3.0.1.txt. */
3359 is_property_format_control (unsigned int ch)
3361 return (is_category_Cf (ch)
3362 && get_bidi_category (ch) == UC_BIDI_BN
3363 && !is_property_join_control (ch)
3367 /* See PropList.txt, UCD.html. */
3369 is_property_dash (unsigned int ch)
3371 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3374 /* See PropList.txt, UCD.html. */
3376 is_property_hyphen (unsigned int ch)
3378 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Tells whether CH is punctuation; delegates to is_category_P.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  return is_category_P (ch);
}
/* Tells whether CH is a line separator; delegates to is_category_Zl.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  return is_category_Zl (ch);
}
/* Tells whether CH is a paragraph separator; delegates to is_category_Zp.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  return is_category_Zp (ch);
}
3402 /* See PropList.txt, UCD.html. */
3404 is_property_quotation_mark (unsigned int ch)
3406 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3409 /* See PropList.txt, UCD.html. */
3411 is_property_sentence_terminal (unsigned int ch)
3413 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3416 /* See PropList.txt, UCD.html. */
3418 is_property_terminal_punctuation (unsigned int ch)
3420 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Tells whether CH is a currency symbol; delegates to is_category_Sc.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  return is_category_Sc (ch);
}
3430 /* See Unicode 3.0 book, section 4.9,
3431 PropList.txt, UCD.html,
3432 DerivedCoreProperties.txt, UCD.html. */
3434 is_property_math (unsigned int ch)
3438 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3440 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3442 if (result1 != result2)
3447 /* See PropList.txt, UCD.html. */
3449 is_property_other_math (unsigned int ch)
3451 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3454 /* See PropList-3.0.1.txt. */
3456 is_property_paired_punctuation (unsigned int ch)
3458 return unicode_pairedpunctuation[ch];
3461 /* See PropList-3.0.1.txt. */
3463 is_property_left_of_pair (unsigned int ch)
3465 return unicode_leftofpair[ch];
3468 /* See PropList-3.0.1.txt. */
3470 is_property_combining (unsigned int ch)
3472 return (unicode_attributes[ch].name != NULL
3473 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3474 || is_category_Mc (ch)
3475 || is_category_Me (ch)
3476 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* Tells whether CH is an assigned character whose bidi category is NSM.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  return (unicode_attributes[ch].name != NULL
          && get_bidi_category (ch) == UC_BIDI_NSM);
}
#endif
3489 /* See PropList-3.0.1.txt. */
3491 is_property_composite (unsigned int ch)
3493 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3494 logical in some sense. */
3495 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3497 if (unicode_attributes[ch].name != NULL
3498 && unicode_attributes[ch].decomposition != NULL)
3500 /* Test whether the decomposition contains more than one character,
3501 and the first is not a space. */
3502 const char *decomp = unicode_attributes[ch].decomposition;
3503 if (decomp[0] == '<')
3505 decomp = strchr (decomp, '>') + 1;
3506 if (decomp[0] == ' ')
3509 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Tells whether CH is a decimal digit; delegates to is_category_Nd.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  return is_category_Nd (ch);
}
3521 /* See PropList-3.0.1.txt. */
3523 is_property_numeric (unsigned int ch)
3525 return ((get_numeric_value (ch)).denominator > 0)
3526 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3527 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3530 /* See PropList.txt, UCD.html. */
3532 is_property_diacritic (unsigned int ch)
3534 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3537 /* See PropList.txt, UCD.html. */
3539 is_property_extender (unsigned int ch)
3541 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3544 /* See PropList-3.0.1.txt. */
3546 is_property_ignorable_control (unsigned int ch)
3548 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3549 || is_category_Cf (ch))
3553 /* ------------------------------------------------------------------------- */
3555 /* Output all properties. */
3557 output_properties (const char *version)
3559 #define PROPERTY(P) \
3560 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3561 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3562 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3563 PROPERTY(white_space)
3564 PROPERTY(alphabetic)
3565 PROPERTY(other_alphabetic)
3566 PROPERTY(not_a_character)
3567 PROPERTY(default_ignorable_code_point)
3568 PROPERTY(other_default_ignorable_code_point)
3569 PROPERTY(deprecated)
3570 PROPERTY(logical_order_exception)
3571 PROPERTY(variation_selector)
3572 PROPERTY(private_use)
3573 PROPERTY(unassigned_code_value)
3575 PROPERTY(other_uppercase)
3577 PROPERTY(other_lowercase)
3580 PROPERTY(case_ignorable)
3581 PROPERTY(changes_when_lowercased)
3582 PROPERTY(changes_when_uppercased)
3583 PROPERTY(changes_when_titlecased)
3584 PROPERTY(changes_when_casefolded)
3585 PROPERTY(changes_when_casemapped)
3586 PROPERTY(soft_dotted)
3588 PROPERTY(other_id_start)
3589 PROPERTY(id_continue)
3590 PROPERTY(other_id_continue)
3592 PROPERTY(xid_continue)
3593 PROPERTY(pattern_white_space)
3594 PROPERTY(pattern_syntax)
3595 PROPERTY(join_control)
3596 PROPERTY(grapheme_base)
3597 PROPERTY(grapheme_extend)
3598 PROPERTY(other_grapheme_extend)
3599 PROPERTY(grapheme_link)
3600 PROPERTY(bidi_control)
3601 PROPERTY(bidi_left_to_right)
3602 PROPERTY(bidi_hebrew_right_to_left)
3603 PROPERTY(bidi_arabic_right_to_left)
3604 PROPERTY(bidi_european_digit)
3605 PROPERTY(bidi_eur_num_separator)
3606 PROPERTY(bidi_eur_num_terminator)
3607 PROPERTY(bidi_arabic_digit)
3608 PROPERTY(bidi_common_separator)
3609 PROPERTY(bidi_block_separator)
3610 PROPERTY(bidi_segment_separator)
3611 PROPERTY(bidi_whitespace)
3612 PROPERTY(bidi_non_spacing_mark)
3613 PROPERTY(bidi_boundary_neutral)
3615 PROPERTY(bidi_embedding_or_override)
3616 PROPERTY(bidi_other_neutral)
3618 PROPERTY(ascii_hex_digit)
3619 PROPERTY(ideographic)
3620 PROPERTY(unified_ideograph)
3622 PROPERTY(ids_binary_operator)
3623 PROPERTY(ids_trinary_operator)
3624 PROPERTY(zero_width)
3627 PROPERTY(iso_control)
3628 PROPERTY(format_control)
3631 PROPERTY(punctuation)
3632 PROPERTY(line_separator)
3633 PROPERTY(paragraph_separator)
3634 PROPERTY(quotation_mark)
3635 PROPERTY(sentence_terminal)
3636 PROPERTY(terminal_punctuation)
3637 PROPERTY(currency_symbol)
3639 PROPERTY(other_math)
3640 PROPERTY(paired_punctuation)
3641 PROPERTY(left_of_pair)
3644 PROPERTY(decimal_digit)
3648 PROPERTY(ignorable_control)
3652 /* ========================================================================= */
/* Names of the scripts seen so far, in order of first appearance.  */
static const char *scripts[256];
static unsigned int numscripts;

/* Script index of every code point; (uint8_t)~(uint8_t)0 means "none".  */
static uint8_t unicode_scripts[0x110000];

/* Reads SCRIPTS_FILENAME and fills scripts[], numscripts and
   unicode_scripts[].  Each data line has the form "XXXX..YYYY ; Name" or
   "XXXX ; Name"; lines starting with '#' are skipped.  Exits with status 1
   on I/O or parse errors; aborts on an invalid range, on memory exhaustion,
   or when the number of scripts reaches 256.  */
static void
fill_scripts (const char *scripts_filename)
{
  FILE *stream;
  unsigned int i;

  stream = fopen (scripts_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
      exit (1);
    }

  numscripts = 0;

  for (i = 0; i < 0x110000; i++)
    unicode_scripts[i] = (uint8_t)~(uint8_t)0;

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char scriptname[200+1];
      unsigned int script;

      /* The trailing "\n" in the format also swallows blank lines.  */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
        {
          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
            {
              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
              exit (1);
            }
          i2 = i1;
        }
      if (i2 < i1)
        abort ();
      if (i2 >= 0x110000)
        abort ();

      /* Look the name up among the known scripts; names are unique, so the
         search direction does not matter.  */
      for (script = 0; script < numscripts; script++)
        if (strcmp (scripts[script], scriptname) == 0)
          break;
      if (script == numscripts)
        {
          /* A new script.  Index 255 would collide with the "none" marker,
             hence the limit below.  */
          char *name_copy = strdup (scriptname);
          if (name_copy == NULL)
            abort ();
          scripts[numscripts] = name_copy;
          numscripts++;
          if (numscripts == 256)
            abort ();
        }

      for (i = i1; i <= i2; i++)
        {
          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
          unicode_scripts[i] = (uint8_t) script;
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
      exit (1);
    }
}
3734 /* Construction of sparse 3-level tables. */
3735 #define TABLE script_table
3736 #define ELEMENT uint8_t
3737 #define DEFAULT (uint8_t)~(uint8_t)0
3738 #define xmalloc malloc
3739 #define xrealloc realloc
/* Writes the generated file "unictype/scripts.h".  The visible statements
   emit, in this order: a header comment, one uc_interval_t array per script,
   the uc_script_t scripts[] table, five script_header_N defines, and the
   three-level lookup structure u_script built from unicode_scripts[].
   NOTE(review): this listing has lost its short lines (local declarations,
   braces, branch conditions, table parameters); the comments below describe
   only what is visible and flag what would need to be confirmed.  */
3743 output_scripts (const char *version)
3745 const char *filename = "unictype/scripts.h";
3747 unsigned int ch, s, i;
3748 struct script_table t;
3749 unsigned int level1_offset, level2_offset, level3_offset;
/* Per-script scratch data; presumably a field of the scriptinfo_t type used
   just below - its declaration is not visible here.  */
3753 const char *lowercase_name;
3756 scriptinfo_t scriptinfo[256];
3758 stream = fopen (filename, "w");
3761 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3765 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3766 fprintf (stream, "/* Unicode scripts. */\n");
/* Presumably VERSION fills the %s below; the argument line is not visible.  */
3767 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* Build a copy of each script name for use in generated identifiers.  Only
   the test for ASCII uppercase letters is visible; the conversion statement
   itself is missing from the listing.  The copies are not freed in the
   visible code.  */
3770 for (s = 0; s < numscripts; s++)
3772 char *lcp = strdup (scripts[s]);
3775 for (cp = lcp; *cp != '\0'; cp++)
3776 if (*cp >= 'A' && *cp <= 'Z')
3779 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script.  */
3782 for (s = 0; s < numscripts; s++)
3784 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3785 scriptinfo[s].lowercase_name);
3786 fprintf (stream, "{\n");
/* Scan all code points; a run of consecutive code points belonging to
   script S is collapsed by advancing CH to the end of the run.  */
3788 for (ch = 0; ch < 0x110000; ch++)
3789 if (unicode_scripts[ch] == s)
3795 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3800 fprintf (stream, ",\n");
/* Two output shapes: a single entry, or a pair of entries carrying two
   values.  The condition choosing between them is not visible.  */
3802 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3804 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3808 fprintf (stream, "\n");
3809 fprintf (stream, "};\n");
/* Emit the scripts[] table: interval count, interval array, script name.
   NOTE(review): numscripts is unsigned but is printed with %d.  */
3812 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3813 fprintf (stream, "{\n");
3814 for (s = 0; s < numscripts; s++)
3816 fprintf (stream, " {\n");
3817 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3818 scriptinfo[s].lowercase_name);
3819 fprintf (stream, " script_%s_intervals,\n",
3820 scriptinfo[s].lowercase_name);
3821 fprintf (stream, " \"%s\"\n", scripts[s]);
3822 fprintf (stream, " }");
3823 if (s+1 < numscripts)
3824 fprintf (stream, ",");
3825 fprintf (stream, "\n");
3827 fprintf (stream, "};\n");
/* Build the lookup table.  NOTE(review): any assignments to t.p and t.q
   before the init call are missing from this listing.  */
3831 script_table_init (&t);
/* Add every code point that has a script; the value (uint8_t)~(uint8_t)0
   marks "no script" and is skipped.  */
3833 for (ch = 0; ch < 0x110000; ch++)
3835 unsigned int s = unicode_scripts[ch];
3836 if (s != (uint8_t)~(uint8_t)0)
3837 script_table_add (&t, ch, s);
3840 script_table_finalize (&t);
3842 /* Offsets in t.result, in memory of this process. */
/* The three expressions below (their assignment targets are not visible)
   place level 1 after a header of 5 uint32_t words, level 2 after the
   level 1 entries, and level 3 after the level 2 entries.  */
3844 5 * sizeof (uint32_t);
3846 5 * sizeof (uint32_t)
3847 + t.level1_size * sizeof (uint32_t);
3849 5 * sizeof (uint32_t)
3850 + t.level1_size * sizeof (uint32_t)
3851 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the 5 header words as macros.
   NOTE(review): unsigned values are printed with %d here.  */
3853 for (i = 0; i < 5; i++)
3854 fprintf (stream, "#define script_header_%d %d\n", i,
3855 ((uint32_t *) t.result)[i]);
3856 fprintf (stream, "static const\n");
3857 fprintf (stream, "struct\n");
3858 fprintf (stream, " {\n");
3859 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3860 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3861 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3862 fprintf (stream, " }\n");
3863 fprintf (stream, "u_script =\n");
3864 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry is printed either as -1
   or as an index relative to the start of level 2; the condition choosing
   between the two is not visible.  */
3865 fprintf (stream, " {");
3866 if (t.level1_size > 8)
3867 fprintf (stream, "\n ");
3868 for (i = 0; i < t.level1_size; i++)
3871 if (i > 0 && (i % 8) == 0)
3872 fprintf (stream, "\n ");
3873 offset = ((uint32_t *) (t.result + level1_offset))[i];
3875 fprintf (stream, " %5d", -1);
3877 fprintf (stream, " %5zu",
3878 (offset - level2_offset) / sizeof (uint32_t));
3879 if (i+1 < t.level1_size)
3880 fprintf (stream, ",");
3882 if (t.level1_size > 8)
3883 fprintf (stream, "\n ");
3884 fprintf (stream, " },\n");
/* Level 2: same layout, with indices relative to the start of level 3.  */
3885 fprintf (stream, " {");
3886 if (t.level2_size << t.q > 8)
3887 fprintf (stream, "\n ");
3888 for (i = 0; i < t.level2_size << t.q; i++)
3891 if (i > 0 && (i % 8) == 0)
3892 fprintf (stream, "\n ");
3893 offset = ((uint32_t *) (t.result + level2_offset))[i];
3895 fprintf (stream, " %5d", -1);
3897 fprintf (stream, " %5zu",
3898 (offset - level3_offset) / sizeof (uint8_t));
3899 if (i+1 < t.level2_size << t.q)
3900 fprintf (stream, ",");
3902 if (t.level2_size << t.q > 8)
3903 fprintf (stream, "\n ");
3904 fprintf (stream, " },\n");
/* Level 3: the raw uint8_t elements, eight per output line.  */
3905 fprintf (stream, " {");
3906 if (t.level3_size << t.p > 8)
3907 fprintf (stream, "\n ");
3908 for (i = 0; i < t.level3_size << t.p; i++)
3910 if (i > 0 && (i % 8) == 0)
3911 fprintf (stream, "\n ");
3912 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3913 if (i+1 < t.level3_size << t.p)
3914 fprintf (stream, ",");
3916 if (t.level3_size << t.p > 8)
3917 fprintf (stream, "\n ");
3918 fprintf (stream, " }\n");
3919 fprintf (stream, "};\n");
/* A write error on the stream or a failing fclose is reported below.  */
3921 if (ferror (stream) || fclose (stream))
3923 fprintf (stderr, "error writing to '%s'\n", filename);
3929 output_scripts_byname (const char *version)
3931 const char *filename = "unictype/scripts_byname.gperf";
3935 stream = fopen (filename, "w");
3938 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3942 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3943 fprintf (stream, "/* Unicode scripts. */\n");
3944 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
3946 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3947 fprintf (stream, "%%struct-type\n");
3948 fprintf (stream, "%%language=ANSI-C\n");
3949 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3950 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3951 fprintf (stream, "%%readonly-tables\n");
3952 fprintf (stream, "%%global-table\n");
3953 fprintf (stream, "%%define word-array-name script_names\n");
3954 fprintf (stream, "%%%%\n");
3955 for (s = 0; s < numscripts; s++)
3956 fprintf (stream, "%s, %u\n", scripts[s], s);
3958 if (ferror (stream) || fclose (stream))
3960 fprintf (stderr, "error writing to '%s'\n", filename);
3965 /* ========================================================================= */
/* A block: an inclusive code point range START..END with its NAME.  */
typedef struct { unsigned int start; unsigned int end; const char *name; }
  block_t;
/* The blocks read so far, sorted by increasing code point.  */
static block_t blocks[256];
static unsigned int numblocks;

/* Reads BLOCKS_FILENAME and appends its entries to blocks[].  Each data
   line has the form "XXXX..YYYY; Name"; lines starting with '#' are
   skipped.  Exits with status 1 on I/O or parse errors; aborts if the
   blocks are not sorted and disjoint, on memory exhaustion, or when the
   number of blocks reaches 256.  */
static void
fill_blocks (const char *blocks_filename)
{
  FILE *stream;

  stream = fopen (blocks_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
      exit (1);
    }

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char blockname[200+1];
      char *name_copy;

      /* The trailing "\n" in the format also swallows blank lines.  */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* The name extends to the end of the line, minus a carriage return.  */
      if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
        {
          fprintf (stderr, "parse error in '%s'\n", blocks_filename);
          exit (1);
        }
      name_copy = strdup (blockname);
      if (name_copy == NULL)
        abort ();
      blocks[numblocks].start = i1;
      blocks[numblocks].end = i2;
      blocks[numblocks].name = name_copy;
      /* It must be sorted.  */
      if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
        abort ();
      numblocks++;
      if (numblocks == 256)
        abort ();
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", blocks_filename);
      exit (1);
    }
}
4022 /* Return the smallest block index among the blocks for characters >= ch. */
4024 block_first_index (unsigned int ch)
4026 /* Binary search. */
4027 unsigned int lo = 0;
4028 unsigned int hi = numblocks;
4030 All blocks[i], i < lo, have blocks[i].end < ch,
4031 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4034 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4035 if (blocks[mid].end < ch)
4043 /* Return the largest block index among the blocks for characters <= ch,
4046 block_last_index (unsigned int ch)
4048 /* Binary search. */
4049 unsigned int lo = 0;
4050 unsigned int hi = numblocks;
4052 All blocks[i], i < lo, have blocks[i].start <= ch,
4053 all blocks[i], i >= hi, have blocks[i].start > ch. */
4056 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4057 if (blocks[mid].start <= ch)
/* Write the file "unictype/blocks.h".  It contains:
     - the array blocks[] with one { start, end, name } entry per block,
     - the array blocks_level1[], which holds for each chunk of
       (1 << shift) code points below threshold the results of
       block_first_index and block_last_index for that chunk,
     - the macros blocks_upper_first_index and blocks_upper_last_index,
       covering the code points from threshold up to 0x10FFFF.
   NOTE(review): the embedded line numbers jump (e.g. 4070 -> 4075), so short
   lines such as local declarations, braces and the error exits appear to be
   missing from this copy, and runs of spaces inside the string literals may
   have been collapsed -- confirm against the pristine file.  */
4066 output_blocks (const char *version)
4068 const char *filename = "unictype/blocks.h";
4069 const unsigned int shift = 8; /* bits to shift away for array access */
4070 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
4075 stream = fopen (filename, "w");
4078 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4082 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4083 fprintf (stream, "/* Unicode blocks. */\n");
4084 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4087 fprintf (stream, "static const uc_block_t blocks[] =\n");
4088 fprintf (stream, "{\n");
4089 for (i = 0; i < numblocks; i++)
4091 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4092 blocks[i].end, blocks[i].name);
4093 if (i+1 < numblocks)
4094 fprintf (stream, ",");
4095 fprintf (stream, "\n");
4097 fprintf (stream, "};\n");
4098 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4099 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4100 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4101 threshold >> shift);
4102 fprintf (stream, "{\n");
/* One pair of indices per chunk: the chunk i1 covers the code points
   i1 << shift up to ((i1 + 1) << shift) - 1.  */
4103 for (i1 = 0; i1 < (threshold >> shift); i1++)
4105 unsigned int first_index = block_first_index (i1 << shift);
4106 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4107 fprintf (stream, " %3d, %3d", first_index, last_index);
4108 if (i1+1 < (threshold >> shift))
4109 fprintf (stream, ",");
4110 fprintf (stream, "\n");
4112 fprintf (stream, "};\n");
4113 fprintf (stream, "#define blocks_upper_first_index %d\n",
4114 block_first_index (threshold));
4115 fprintf (stream, "#define blocks_upper_last_index %d\n",
4116 block_last_index (0x10FFFF));
4118 if (ferror (stream) || fclose (stream))
4120 fprintf (stderr, "error writing to '%s'\n", filename);
4125 /* ========================================================================= */
/* C and Java syntax.  */

/* Identifier syntax categories, as returned by c_ident_category and
   java_ident_category.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is a white-space character of the C language.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4148 /* ISO C 99 section 6.4.2.1 and appendix D. */
4150 c_ident_category (unsigned int ch)
4152 /* Section 6.4.2.1. */
4153 if (ch >= '0' && ch <= '9')
4154 return UC_IDENTIFIER_VALID;
4155 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4156 return UC_IDENTIFIER_START;
4162 || (ch >= 0x00C0 && ch <= 0x00D6)
4163 || (ch >= 0x00D8 && ch <= 0x00F6)
4164 || (ch >= 0x00F8 && ch <= 0x01F5)
4165 || (ch >= 0x01FA && ch <= 0x0217)
4166 || (ch >= 0x0250 && ch <= 0x02A8)
4167 || (ch >= 0x1E00 && ch <= 0x1E9B)
4168 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4172 || (ch >= 0x0388 && ch <= 0x038A)
4174 || (ch >= 0x038E && ch <= 0x03A1)
4175 || (ch >= 0x03A3 && ch <= 0x03CE)
4176 || (ch >= 0x03D0 && ch <= 0x03D6)
4181 || (ch >= 0x03E2 && ch <= 0x03F3)
4182 || (ch >= 0x1F00 && ch <= 0x1F15)
4183 || (ch >= 0x1F18 && ch <= 0x1F1D)
4184 || (ch >= 0x1F20 && ch <= 0x1F45)
4185 || (ch >= 0x1F48 && ch <= 0x1F4D)
4186 || (ch >= 0x1F50 && ch <= 0x1F57)
4190 || (ch >= 0x1F5F && ch <= 0x1F7D)
4191 || (ch >= 0x1F80 && ch <= 0x1FB4)
4192 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4193 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4194 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4195 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4196 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4197 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4198 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4199 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4201 || (ch >= 0x0401 && ch <= 0x040C)
4202 || (ch >= 0x040E && ch <= 0x044F)
4203 || (ch >= 0x0451 && ch <= 0x045C)
4204 || (ch >= 0x045E && ch <= 0x0481)
4205 || (ch >= 0x0490 && ch <= 0x04C4)
4206 || (ch >= 0x04C7 && ch <= 0x04C8)
4207 || (ch >= 0x04CB && ch <= 0x04CC)
4208 || (ch >= 0x04D0 && ch <= 0x04EB)
4209 || (ch >= 0x04EE && ch <= 0x04F5)
4210 || (ch >= 0x04F8 && ch <= 0x04F9)
4212 || (ch >= 0x0531 && ch <= 0x0556)
4213 || (ch >= 0x0561 && ch <= 0x0587)
4215 || (ch >= 0x05B0 && ch <= 0x05B9)
4216 || (ch >= 0x05BB && ch <= 0x05BD)
4218 || (ch >= 0x05C1 && ch <= 0x05C2)
4219 || (ch >= 0x05D0 && ch <= 0x05EA)
4220 || (ch >= 0x05F0 && ch <= 0x05F2)
4222 || (ch >= 0x0621 && ch <= 0x063A)
4223 || (ch >= 0x0640 && ch <= 0x0652)
4224 || (ch >= 0x0670 && ch <= 0x06B7)
4225 || (ch >= 0x06BA && ch <= 0x06BE)
4226 || (ch >= 0x06C0 && ch <= 0x06CE)
4227 || (ch >= 0x06D0 && ch <= 0x06DC)
4228 || (ch >= 0x06E5 && ch <= 0x06E8)
4229 || (ch >= 0x06EA && ch <= 0x06ED)
4231 || (ch >= 0x0901 && ch <= 0x0903)
4232 || (ch >= 0x0905 && ch <= 0x0939)
4233 || (ch >= 0x093E && ch <= 0x094D)
4234 || (ch >= 0x0950 && ch <= 0x0952)
4235 || (ch >= 0x0958 && ch <= 0x0963)
4237 || (ch >= 0x0981 && ch <= 0x0983)
4238 || (ch >= 0x0985 && ch <= 0x098C)
4239 || (ch >= 0x098F && ch <= 0x0990)
4240 || (ch >= 0x0993 && ch <= 0x09A8)
4241 || (ch >= 0x09AA && ch <= 0x09B0)
4243 || (ch >= 0x09B6 && ch <= 0x09B9)
4244 || (ch >= 0x09BE && ch <= 0x09C4)
4245 || (ch >= 0x09C7 && ch <= 0x09C8)
4246 || (ch >= 0x09CB && ch <= 0x09CD)
4247 || (ch >= 0x09DC && ch <= 0x09DD)
4248 || (ch >= 0x09DF && ch <= 0x09E3)
4249 || (ch >= 0x09F0 && ch <= 0x09F1)
4252 || (ch >= 0x0A05 && ch <= 0x0A0A)
4253 || (ch >= 0x0A0F && ch <= 0x0A10)
4254 || (ch >= 0x0A13 && ch <= 0x0A28)
4255 || (ch >= 0x0A2A && ch <= 0x0A30)
4256 || (ch >= 0x0A32 && ch <= 0x0A33)
4257 || (ch >= 0x0A35 && ch <= 0x0A36)
4258 || (ch >= 0x0A38 && ch <= 0x0A39)
4259 || (ch >= 0x0A3E && ch <= 0x0A42)
4260 || (ch >= 0x0A47 && ch <= 0x0A48)
4261 || (ch >= 0x0A4B && ch <= 0x0A4D)
4262 || (ch >= 0x0A59 && ch <= 0x0A5C)
4266 || (ch >= 0x0A81 && ch <= 0x0A83)
4267 || (ch >= 0x0A85 && ch <= 0x0A8B)
4269 || (ch >= 0x0A8F && ch <= 0x0A91)
4270 || (ch >= 0x0A93 && ch <= 0x0AA8)
4271 || (ch >= 0x0AAA && ch <= 0x0AB0)
4272 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4273 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4274 || (ch >= 0x0ABD && ch <= 0x0AC5)
4275 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4276 || (ch >= 0x0ACB && ch <= 0x0ACD)
4280 || (ch >= 0x0B01 && ch <= 0x0B03)
4281 || (ch >= 0x0B05 && ch <= 0x0B0C)
4282 || (ch >= 0x0B0F && ch <= 0x0B10)
4283 || (ch >= 0x0B13 && ch <= 0x0B28)
4284 || (ch >= 0x0B2A && ch <= 0x0B30)
4285 || (ch >= 0x0B32 && ch <= 0x0B33)
4286 || (ch >= 0x0B36 && ch <= 0x0B39)
4287 || (ch >= 0x0B3E && ch <= 0x0B43)
4288 || (ch >= 0x0B47 && ch <= 0x0B48)
4289 || (ch >= 0x0B4B && ch <= 0x0B4D)
4290 || (ch >= 0x0B5C && ch <= 0x0B5D)
4291 || (ch >= 0x0B5F && ch <= 0x0B61)
4293 || (ch >= 0x0B82 && ch <= 0x0B83)
4294 || (ch >= 0x0B85 && ch <= 0x0B8A)
4295 || (ch >= 0x0B8E && ch <= 0x0B90)
4296 || (ch >= 0x0B92 && ch <= 0x0B95)
4297 || (ch >= 0x0B99 && ch <= 0x0B9A)
4299 || (ch >= 0x0B9E && ch <= 0x0B9F)
4300 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4301 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4302 || (ch >= 0x0BAE && ch <= 0x0BB5)
4303 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4304 || (ch >= 0x0BBE && ch <= 0x0BC2)
4305 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4306 || (ch >= 0x0BCA && ch <= 0x0BCD)
4308 || (ch >= 0x0C01 && ch <= 0x0C03)
4309 || (ch >= 0x0C05 && ch <= 0x0C0C)
4310 || (ch >= 0x0C0E && ch <= 0x0C10)
4311 || (ch >= 0x0C12 && ch <= 0x0C28)
4312 || (ch >= 0x0C2A && ch <= 0x0C33)
4313 || (ch >= 0x0C35 && ch <= 0x0C39)
4314 || (ch >= 0x0C3E && ch <= 0x0C44)
4315 || (ch >= 0x0C46 && ch <= 0x0C48)
4316 || (ch >= 0x0C4A && ch <= 0x0C4D)
4317 || (ch >= 0x0C60 && ch <= 0x0C61)
4319 || (ch >= 0x0C82 && ch <= 0x0C83)
4320 || (ch >= 0x0C85 && ch <= 0x0C8C)
4321 || (ch >= 0x0C8E && ch <= 0x0C90)
4322 || (ch >= 0x0C92 && ch <= 0x0CA8)
4323 || (ch >= 0x0CAA && ch <= 0x0CB3)
4324 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4325 || (ch >= 0x0CBE && ch <= 0x0CC4)
4326 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4327 || (ch >= 0x0CCA && ch <= 0x0CCD)
4329 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4331 || (ch >= 0x0D02 && ch <= 0x0D03)
4332 || (ch >= 0x0D05 && ch <= 0x0D0C)
4333 || (ch >= 0x0D0E && ch <= 0x0D10)
4334 || (ch >= 0x0D12 && ch <= 0x0D28)
4335 || (ch >= 0x0D2A && ch <= 0x0D39)
4336 || (ch >= 0x0D3E && ch <= 0x0D43)
4337 || (ch >= 0x0D46 && ch <= 0x0D48)
4338 || (ch >= 0x0D4A && ch <= 0x0D4D)
4339 || (ch >= 0x0D60 && ch <= 0x0D61)
4341 || (ch >= 0x0E01 && ch <= 0x0E3A)
4342 || (ch >= 0x0E40 && ch <= 0x0E5B)
4344 || (ch >= 0x0E81 && ch <= 0x0E82)
4346 || (ch >= 0x0E87 && ch <= 0x0E88)
4349 || (ch >= 0x0E94 && ch <= 0x0E97)
4350 || (ch >= 0x0E99 && ch <= 0x0E9F)
4351 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4354 || (ch >= 0x0EAA && ch <= 0x0EAB)
4355 || (ch >= 0x0EAD && ch <= 0x0EAE)
4356 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4357 || (ch >= 0x0EBB && ch <= 0x0EBD)
4358 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4360 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4361 || (ch >= 0x0EDC && ch <= 0x0EDD)
4364 || (ch >= 0x0F18 && ch <= 0x0F19)
4368 || (ch >= 0x0F3E && ch <= 0x0F47)
4369 || (ch >= 0x0F49 && ch <= 0x0F69)
4370 || (ch >= 0x0F71 && ch <= 0x0F84)
4371 || (ch >= 0x0F86 && ch <= 0x0F8B)
4372 || (ch >= 0x0F90 && ch <= 0x0F95)
4374 || (ch >= 0x0F99 && ch <= 0x0FAD)
4375 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4378 || (ch >= 0x10A0 && ch <= 0x10C5)
4379 || (ch >= 0x10D0 && ch <= 0x10F6)
4381 || (ch >= 0x3041 && ch <= 0x3093)
4382 || (ch >= 0x309B && ch <= 0x309C)
4384 || (ch >= 0x30A1 && ch <= 0x30F6)
4385 || (ch >= 0x30FB && ch <= 0x30FC)
4387 || (ch >= 0x3105 && ch <= 0x312C)
4388 /* CJK Unified Ideographs */
4389 || (ch >= 0x4E00 && ch <= 0x9FA5)
4391 || (ch >= 0xAC00 && ch <= 0xD7A3)
4393 || (ch >= 0x0660 && ch <= 0x0669)
4394 || (ch >= 0x06F0 && ch <= 0x06F9)
4395 || (ch >= 0x0966 && ch <= 0x096F)
4396 || (ch >= 0x09E6 && ch <= 0x09EF)
4397 || (ch >= 0x0A66 && ch <= 0x0A6F)
4398 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4399 || (ch >= 0x0B66 && ch <= 0x0B6F)
4400 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4401 || (ch >= 0x0C66 && ch <= 0x0C6F)
4402 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4403 || (ch >= 0x0D66 && ch <= 0x0D6F)
4404 || (ch >= 0x0E50 && ch <= 0x0E59)
4405 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4406 || (ch >= 0x0F20 && ch <= 0x0F33)
4407 /* Special characters */
4410 || (ch >= 0x02B0 && ch <= 0x02B8)
4412 || (ch >= 0x02BD && ch <= 0x02C1)
4413 || (ch >= 0x02D0 && ch <= 0x02D1)
4414 || (ch >= 0x02E0 && ch <= 0x02E4)
4420 || (ch >= 0x203F && ch <= 0x2040)
4423 || (ch >= 0x210A && ch <= 0x2113)
4425 || (ch >= 0x2118 && ch <= 0x211D)
4429 || (ch >= 0x212A && ch <= 0x2131)
4430 || (ch >= 0x2133 && ch <= 0x2138)
4431 || (ch >= 0x2160 && ch <= 0x2182)
4432 || (ch >= 0x3005 && ch <= 0x3007)
4433 || (ch >= 0x3021 && ch <= 0x3029)
4435 return UC_IDENTIFIER_START;
4436 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is a white-space character of the Java language.
   Unlike in C, the vertical tab is not among them.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4448 /* The Java Language Specification, 3rd edition, §3.8.
4449 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4450 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4452 java_ident_category (unsigned int ch)
4454 /* FIXME: Check this against Sun's JDK implementation. */
4455 if (is_category_L (ch) /* = Character.isLetter(ch) */
4456 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4457 || is_category_Sc (ch) /* currency symbol */
4458 || is_category_Pc (ch) /* connector punctuation */
4460 return UC_IDENTIFIER_START;
4461 if (is_category_Nd (ch) /* digit */
4462 || is_category_Mc (ch) /* combining mark */
4463 || is_category_Mn (ch) /* non-spacing mark */
4465 return UC_IDENTIFIER_VALID;
4466 if ((ch >= 0x0000 && ch <= 0x0008)
4467 || (ch >= 0x000E && ch <= 0x001B)
4468 || (ch >= 0x007F && ch <= 0x009F)
4469 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4471 return UC_IDENTIFIER_IGNORABLE;
4472 return UC_IDENTIFIER_INVALID;
/* Parameters for a table named identsyntax_table whose elements are of type
   uint8_t and default to UC_IDENTIFIER_INVALID; memory is obtained through
   plain malloc and realloc.
   NOTE(review): presumably these macros are consumed by an #include of the
   generic 3-level table template right after them; that line is not visible
   in this copy (the embedded line numbers skip 4481) -- confirm.  */
4475 /* Construction of sparse 3-level tables. */
4476 #define TABLE identsyntax_table
4477 #define ELEMENT uint8_t
4478 #define DEFAULT UC_IDENTIFIER_INVALID
4479 #define xmalloc malloc
4480 #define xrealloc realloc
4483 /* Output an identifier syntax categorization in a three-level bitmap. */
/* FILENAME is the file to write, PREDICATE maps a code point to one of the
   UC_IDENTIFIER_* values, NAME is the name of the generated C variable, and
   VERSION is mentioned in the generated header comment.
   The function evaluates PREDICATE for every code point below 0x110000,
   stores the values other than UC_IDENTIFIER_INVALID in a 3-level table,
   and then emits five identsyntax_header_N macros followed by a struct with
   the arrays level1, level2 and level3.
   NOTE(review): the embedded line numbers jump (e.g. 4485 -> 4489), so short
   lines such as local declarations, braces, 'else' branches and the error
   exits appear to be missing from this copy, and runs of spaces inside the
   string literals may have been collapsed -- confirm against the pristine
   file.  */
4485 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4489 struct identsyntax_table t;
4490 unsigned int level1_offset, level2_offset, level3_offset;
4492 stream = fopen (filename, "w");
4495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4500 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4501 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4506 identsyntax_table_init (&t);
4508 for (ch = 0; ch < 0x110000; ch++)
4510 int syntaxcode = predicate (ch);
4511 if (syntaxcode != UC_IDENTIFIER_INVALID)
4512 identsyntax_table_add (&t, ch, syntaxcode);
4515 identsyntax_table_finalize (&t);
4517 /* Offsets in t.result, in memory of this process. */
4519 5 * sizeof (uint32_t);
4521 5 * sizeof (uint32_t)
4522 + t.level1_size * sizeof (uint32_t);
4524 5 * sizeof (uint32_t)
4525 + t.level1_size * sizeof (uint32_t)
4526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five 32-bit words of t.result are emitted as macros.  */
4528 for (i = 0; i < 5; i++)
4529 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4530 ((uint32_t *) t.result)[i]);
4531 fprintf (stream, "static const\n");
4532 fprintf (stream, "struct\n");
4533 fprintf (stream, " {\n");
4534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4536 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4537 (1 << t.p) * 2 / 16);
4538 fprintf (stream, " }\n");
4539 fprintf (stream, "%s =\n", name);
4540 fprintf (stream, "{\n");
4541 fprintf (stream, " {");
4542 if (t.level1_size > 8)
4543 fprintf (stream, "\n ");
/* Level 1: eight entries per output line; each entry is either -1 or an
   offset converted into an index of the level2 array.  */
4544 for (i = 0; i < t.level1_size; i++)
4547 if (i > 0 && (i % 8) == 0)
4548 fprintf (stream, "\n ");
4549 offset = ((uint32_t *) (t.result + level1_offset))[i];
4551 fprintf (stream, " %5d", -1);
4553 fprintf (stream, " %5zu",
4554 (offset - level2_offset) / sizeof (uint32_t));
4555 if (i+1 < t.level1_size)
4556 fprintf (stream, ",");
4558 if (t.level1_size > 8)
4559 fprintf (stream, "\n ");
4560 fprintf (stream, " },\n");
4561 fprintf (stream, " {");
4562 if (t.level2_size << t.q > 8)
4563 fprintf (stream, "\n ");
/* Level 2: likewise, with offsets converted into indices of the unpacked
   level3 array.  */
4564 for (i = 0; i < t.level2_size << t.q; i++)
4567 if (i > 0 && (i % 8) == 0)
4568 fprintf (stream, "\n ");
4569 offset = ((uint32_t *) (t.result + level2_offset))[i];
4571 fprintf (stream, " %5d", -1);
4573 fprintf (stream, " %5zu",
4574 (offset - level3_offset) / sizeof (uint8_t));
4575 if (i+1 < t.level2_size << t.q)
4576 fprintf (stream, ",");
4578 if (t.level2_size << t.q > 8)
4579 fprintf (stream, "\n ");
4580 fprintf (stream, " },\n");
4581 /* Pack the level3 array. Each entry needs 2 bits only. */
4582 fprintf (stream, " {");
4583 if ((t.level3_size << t.p) * 2 / 16 > 8)
4584 fprintf (stream, "\n ");
/* Eight consecutive 2-bit entries are combined into one 16-bit word, the
   first entry in the lowest bits.  */
4585 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4587 if (i > 0 && (i % 8) == 0)
4588 fprintf (stream, "\n ");
4589 fprintf (stream, " 0x%04x",
4590 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4591 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4592 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4593 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4594 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4595 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4596 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4597 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4598 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4599 fprintf (stream, ",");
4601 if ((t.level3_size << t.p) * 2 / 16 > 8)
4602 fprintf (stream, "\n ");
4603 fprintf (stream, " }\n");
4604 fprintf (stream, "};\n");
4606 if (ferror (stream) || fclose (stream))
4608 fprintf (stderr, "error writing to '%s'\n", filename);
4614 output_ident_properties (const char *version)
4616 #define PROPERTY(P) \
4617 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4618 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4619 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4620 PROPERTY(c_whitespace)
4621 PROPERTY(java_whitespace)
4624 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4625 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4628 /* ========================================================================= */
4630 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
4631 glibc/localedata/locales/i18n file, generated by
4632 glibc/localedata/gen-unicode-ctype.c. */
4634 /* Character mappings. */
4637 to_upper (unsigned int ch)
4639 if (unicode_attributes[ch].name != NULL
4640 && unicode_attributes[ch].upper != NONE)
4641 return unicode_attributes[ch].upper;
4647 to_lower (unsigned int ch)
4649 if (unicode_attributes[ch].name != NULL
4650 && unicode_attributes[ch].lower != NONE)
4651 return unicode_attributes[ch].lower;
4657 to_title (unsigned int ch)
4659 if (unicode_attributes[ch].name != NULL
4660 && unicode_attributes[ch].title != NONE)
4661 return unicode_attributes[ch].title;
4666 /* Character class properties. */
/* Return true if CH is an uppercase character, that is, if it has a
   lowercase mapping different from itself.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Return true if CH is a lowercase character, that is, if it has an
   uppercase mapping different from itself or is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4683 is_alpha (unsigned int ch)
4685 return (unicode_attributes[ch].name != NULL
4686 && ((unicode_attributes[ch].category[0] == 'L'
4687 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4688 <U0E2F>, <U0E46> should belong to is_punct. */
4689 && (ch != 0x0E2F) && (ch != 0x0E46))
4690 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4691 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4693 || (ch >= 0x0E34 && ch <= 0x0E3A)
4694 || (ch >= 0x0E47 && ch <= 0x0E4E)
4695 /* Avoid warning for <U0345>. */
4697 /* Avoid warnings for <U2160>..<U217F>. */
4698 || (unicode_attributes[ch].category[0] == 'N'
4699 && unicode_attributes[ch].category[1] == 'l')
4700 /* Avoid warnings for <U24B6>..<U24E9>. */
4701 || (unicode_attributes[ch].category[0] == 'S'
4702 && unicode_attributes[ch].category[1] == 'o'
4703 && strstr (unicode_attributes[ch].name, " LETTER ")
4705 /* Consider all the non-ASCII digits as alphabetic.
4706 ISO C 99 forbids us to have them in category "digit",
4707 but we want iswalnum to return true on them. */
4708 || (unicode_attributes[ch].category[0] == 'N'
4709 && unicode_attributes[ch].category[1] == 'd'
4710 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is a digit in the ISO C sense, that is, one of the ten
   ASCII decimal digits.  The disabled alternative would accept every
   character of general category Nd.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH is one of the ten ASCII decimal digits.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH is alphabetic or a digit.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4748 is_blank (unsigned int ch)
4750 return (ch == 0x0009 /* '\t' */
4751 /* Category Zs without mention of "<noBreak>" */
4752 || (unicode_attributes[ch].name != NULL
4753 && unicode_attributes[ch].category[0] == 'Z'
4754 && unicode_attributes[ch].category[1] == 's'
4755 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4759 is_space (unsigned int ch)
4761 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4762 should treat it like a punctuation character, not like a space. */
4763 return (ch == 0x0020 /* ' ' */
4764 || ch == 0x000C /* '\f' */
4765 || ch == 0x000A /* '\n' */
4766 || ch == 0x000D /* '\r' */
4767 || ch == 0x0009 /* '\t' */
4768 || ch == 0x000B /* '\v' */
4769 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4770 || (unicode_attributes[ch].name != NULL
4771 && unicode_attributes[ch].category[0] == 'Z'
4772 && (unicode_attributes[ch].category[1] == 'l'
4773 || unicode_attributes[ch].category[1] == 'p'
4774 || (unicode_attributes[ch].category[1] == 's'
4775 && !strstr (unicode_attributes[ch].decomposition,
4780 is_cntrl (unsigned int ch)
4782 return (unicode_attributes[ch].name != NULL
4783 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4784 /* Categories Zl and Zp */
4785 || (unicode_attributes[ch].category[0] == 'Z'
4786 && (unicode_attributes[ch].category[1] == 'l'
4787 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is a hexadecimal digit in the ISO C sense, that is, one
   of 0-9, A-F, a-f.  The disabled alternative would build on is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4813 is_graph (unsigned int ch)
4815 return (unicode_attributes[ch].name != NULL
4816 && strcmp (unicode_attributes[ch].name, "<control>")
4821 is_print (unsigned int ch)
4823 return (unicode_attributes[ch].name != NULL
4824 && strcmp (unicode_attributes[ch].name, "<control>")
4825 /* Categories Zl and Zp */
4826 && !(unicode_attributes[ch].name != NULL
4827 && unicode_attributes[ch].category[0] == 'Z'
4828 && (unicode_attributes[ch].category[1] == 'l'
4829 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH is a punctuation character in the traditional POSIX
   sense.  The disabled alternative would accept exactly the characters of
   the general categories P*.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4845 /* Output all properties. */
/* The PROPERTY macro emits, for one predicate is_P, a debugging listing
   "unictype/ctype_P.txt", a unit test "../tests/unictype/test-ctype_P.c"
   and a table "unictype/ctype_P.h".
   NOTE(review): the embedded line numbers jump from 4852 to 4871, so the
   PROPERTY(...) invocations and the end of this function appear to be
   missing from this copy -- confirm against the pristine file.  */
4847 output_old_ctype (const char *version)
4849 #define PROPERTY(P) \
4850 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4851 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4852 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4871 is_combining (unsigned int ch)
4873 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4874 file. In 3.0.1 it was identical to the union of the general categories
4875 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4876 PropList.txt file, so we take the latter definition. */
4877 return (unicode_attributes[ch].name != NULL
4878 && unicode_attributes[ch].category[0] == 'M'
4879 && (unicode_attributes[ch].category[1] == 'n'
4880 || unicode_attributes[ch].category[1] == 'c'
4881 || unicode_attributes[ch].category[1] == 'e'));
4885 is_combining_level3 (unsigned int ch)
4887 return is_combining (ch)
4888 && !(unicode_attributes[ch].combining[0] != '\0'
4889 && unicode_attributes[ch].combining[0] != '0'
4890 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   hexadecimal digits for I < 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* The longest result, "<U%08X>", needs 11 characters plus the NUL.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", and the symbol of HIGH.
   The result lives in a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Room for two symbols of at most 11 characters, "..", and the NUL.  */
  static char buf[24+1];

  /* ucs_symbol reuses one static buffer, so each result must be copied
     before the next call.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
4915 /* Output a character class (= property) table. */
/* Writes CLASSNAME to STREAM, followed by the code points below 0x110000
   for which FUNC returns true.  Consecutive code points are combined into
   intervals; the items are separated by ';', and a line that would exceed
   max_column is continued after a '/'.
   NOTE(review): the embedded line numbers jump (e.g. 4933 -> 4939), so short
   lines such as local declarations, braces and the interval scanning loop
   appear to be missing from this copy -- confirm against the pristine
   file.  */
4918 output_charclass (FILE *stream, const char *classname,
4919 bool (*func) (unsigned int))
4921 char table[0x110000];
4923 bool need_semicolon;
4924 const int max_column = 75;
/* Evaluate the predicate once per code point.  */
4927 for (i = 0; i < 0x110000; i++)
4928 table[i] = (int) func (i);
4930 fprintf (stream, "%s ", classname);
4931 need_semicolon = false;
4933 for (i = 0; i < 0x110000; )
4939 unsigned int low, high;
4945 while (i < 0x110000 && table[i]);
4949 strcpy (buf, ucs_symbol (low));
4951 strcpy (buf, ucs_symbol_range (low, high));
4955 fprintf (stream, ";");
4959 if (column + strlen (buf) > max_column)
4961 fprintf (stream, "/\n ");
4965 fprintf (stream, "%s", buf);
4966 column += strlen (buf);
4967 need_semicolon = true;
4970 fprintf (stream, "\n");
4973 /* Output a character mapping table. */
/* Writes MAPNAME to STREAM, followed by one item for each code point below
   0x110000 that FUNC maps to a different code point.  Each item is built
   from the symbols of the code point and of its image; the items are
   separated by ';', and a line that would exceed max_column is continued
   after a '/'.
   NOTE(review): the embedded line numbers jump (e.g. 4991 -> 4997), so short
   lines such as local declarations, braces and parts of the item formatting
   appear to be missing from this copy -- confirm against the pristine
   file.  */
4976 output_charmap (FILE *stream, const char *mapname,
4977 unsigned int (*func) (unsigned int))
4979 char table[0x110000];
4981 bool need_semicolon;
4982 const int max_column = 75;
/* Remember which code points are changed by the mapping.  */
4985 for (i = 0; i < 0x110000; i++)
4986 table[i] = (func (i) != i);
4988 fprintf (stream, "%s ", mapname);
4989 need_semicolon = false;
4991 for (i = 0; i < 0x110000; i++)
4997 strcat (buf, ucs_symbol (i));
4999 strcat (buf, ucs_symbol (func (i)));
5004 fprintf (stream, ";");
5008 if (column + strlen (buf) > max_column)
5010 fprintf (stream, "/\n ");
5014 fprintf (stream, "%s", buf);
5015 column += strlen (buf);
5016 need_semicolon = true;
5018 fprintf (stream, "\n");
/* Output the width table.
   This is a placeholder: it writes nothing to STREAM.  */
static void
output_widthmap (FILE *stream)
{
}
5028 /* Output the tables to the given file. */
/* Writes to FILENAME a header, an LC_IDENTIFICATION section that mentions
   VERSION and the current date, and an LC_CTYPE section with the character
   classes and the case mappings defined by the is_* and to_* functions.
   Before the LC_CTYPE section is written, every code point below 0x110000
   is checked against the restrictions quoted in the comments below; each
   violation is reported on stderr.
   NOTE(review): the embedded line numbers jump (e.g. 5031 -> 5036), so short
   lines such as local declarations, braces, 'fprintf (stderr,' openers and
   the error exits appear to be missing from this copy, and runs of spaces
   inside the string literals may have been collapsed -- confirm against the
   pristine file.  */
5031 output_tables (const char *filename, const char *version)
5036 stream = fopen (filename, "w");
5039 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5043 fprintf (stream, "escape_char /\n");
5044 fprintf (stream, "comment_char %%\n");
5045 fprintf (stream, "\n");
5046 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
5048 fprintf (stream, "\n");
5050 fprintf (stream, "LC_IDENTIFICATION\n");
5051 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5052 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5053 fprintf (stream, "address \"\"\n");
5054 fprintf (stream, "contact \"\"\n");
5055 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5056 fprintf (stream, "tel \"\"\n");
5057 fprintf (stream, "fax \"\"\n");
5058 fprintf (stream, "language \"\"\n");
5059 fprintf (stream, "territory \"Earth\"\n");
5060 fprintf (stream, "revision \"%s\"\n", version);
5065 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5066 fprintf (stream, "date \"%s\"\n", date);
5068 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5069 fprintf (stream, "END LC_IDENTIFICATION\n");
5070 fprintf (stream, "\n");
5072 /* Verifications. */
5073 for (ch = 0; ch < 0x110000; ch++)
5075 /* toupper restriction: "Only characters specified for the keywords
5076 lower and upper shall be specified. */
5077 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5079 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5080 ucs_symbol (ch), ch, to_upper (ch));
5082 /* tolower restriction: "Only characters specified for the keywords
5083 lower and upper shall be specified. */
5084 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5086 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5087 ucs_symbol (ch), ch, to_lower (ch));
5089 /* alpha restriction: "Characters classified as either upper or lower
5090 shall automatically belong to this class. */
5091 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5092 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5094 /* alpha restriction: "No character specified for the keywords cntrl,
5095 digit, punct or space shall be specified." */
5096 if (is_alpha (ch) && is_cntrl (ch))
5097 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5098 if (is_alpha (ch) && is_digit (ch))
5099 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5100 if (is_alpha (ch) && is_punct (ch))
5101 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5102 if (is_alpha (ch) && is_space (ch))
5103 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5105 /* space restriction: "No character specified for the keywords upper,
5106 lower, alpha, digit, graph or xdigit shall be specified."
5107 upper, lower, alpha already checked above. */
5108 if (is_space (ch) && is_digit (ch))
5109 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5110 if (is_space (ch) && is_graph (ch))
5111 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5112 if (is_space (ch) && is_xdigit (ch))
5113 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5115 /* cntrl restriction: "No character specified for the keywords upper,
5116 lower, alpha, digit, punct, graph, print or xdigit shall be
5117 specified." upper, lower, alpha already checked above. */
5118 if (is_cntrl (ch) && is_digit (ch))
5119 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5120 if (is_cntrl (ch) && is_punct (ch))
5121 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5122 if (is_cntrl (ch) && is_graph (ch))
5123 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5124 if (is_cntrl (ch) && is_print (ch))
5125 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5126 if (is_cntrl (ch) && is_xdigit (ch))
5127 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5129 /* punct restriction: "No character specified for the keywords upper,
5130 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5131 be specified." upper, lower, alpha, cntrl already checked above. */
5132 if (is_punct (ch) && is_digit (ch))
5133 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5134 if (is_punct (ch) && is_xdigit (ch))
5135 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5136 if (is_punct (ch) && (ch == 0x0020))
5137 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5139 /* graph restriction: "No character specified for the keyword cntrl
5140 shall be specified." Already checked above. */
5142 /* print restriction: "No character specified for the keyword cntrl
5143 shall be specified." Already checked above. */
5145 /* graph - print relation: differ only in the <space> character.
5146 How is this possible if there are more than one space character?!
5147 I think susv2/xbd/locale.html should speak of "space characters",
5148 not "space character". */
5149 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5151 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5152 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5154 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5157 fprintf (stream, "LC_CTYPE\n");
5158 output_charclass (stream, "upper", is_upper);
5159 output_charclass (stream, "lower", is_lower);
5160 output_charclass (stream, "alpha", is_alpha);
5161 output_charclass (stream, "digit", is_digit);
5162 output_charclass (stream, "outdigit", is_outdigit);
5163 output_charclass (stream, "blank", is_blank);
5164 output_charclass (stream, "space", is_space);
5165 output_charclass (stream, "cntrl", is_cntrl);
5166 output_charclass (stream, "punct", is_punct);
5167 output_charclass (stream, "xdigit", is_xdigit);
5168 output_charclass (stream, "graph", is_graph);
5169 output_charclass (stream, "print", is_print);
5170 output_charclass (stream, "class \"combining\";", is_combining);
5171 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5172 output_charmap (stream, "toupper", to_upper);
5173 output_charmap (stream, "tolower", to_lower);
5174 output_charmap (stream, "map \"totitle\";", to_title);
5175 output_widthmap (stream);
5176 fprintf (stream, "END LC_CTYPE\n");
5178 if (ferror (stream) || fclose (stream))
5180 fprintf (stderr, "error writing to '%s'\n", filename);
5187 /* ========================================================================= */
/* The width property from the EastAsianWidth.txt file.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by Unicode code point (0 .. 0x10FFFF).  */
const char * unicode_width[0x110000];
5193 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5196 fill_width (const char *width_filename)
5200 char field0[FIELDLEN];
5201 char field1[FIELDLEN];
5202 char field2[FIELDLEN];
5205 for (i = 0; i < 0x110000; i++)
5206 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5208 stream = fopen (width_filename, "r");
5211 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5226 do c = getc (stream); while (c != EOF && c != '\n');
5230 n = getfield (stream, field0, ';');
5231 n += getfield (stream, field1, ' ');
5232 n += getfield (stream, field2, '\n');
5237 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5240 i = strtoul (field0, NULL, 16);
5241 if (strstr (field0, "..") != NULL)
5243 /* Deal with a range. */
5244 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5246 unicode_width[i] = strdup (field1);
5250 /* Single character line. */
5251 unicode_width[i] = strdup (field1);
5254 if (ferror (stream) || fclose (stream))
5256 fprintf (stderr, "error reading from '%s'\n", width_filename);
5261 /* ========================================================================= */
5263 /* Non-spacing attribute and width. */
5265 /* The non-spacing attribute table consists of:
5266 - Non-spacing characters; generated from PropList.txt or
5267 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
5268 - Format control characters; generated from
5269 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
5270 - Zero width characters; generated from
5271 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
5275 is_nonspacing (unsigned int ch)
5277 return (unicode_attributes[ch].name != NULL
5278 && (get_bidi_category (ch) == UC_BIDI_NSM
5279 || is_category_Cc (ch) || is_category_Cf (ch)
5280 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes to FILENAME a two-level lookup table for is_nonspacing(), as C
   source:
   - nonspacing_table_data: for each block of 0x200 code points that
     contains at least one non-spacing character, 64 bytes of bitmap
     (bit L of a byte corresponds to the code point whose low 3 bits are L);
   - nonspacing_table_ind: for each block, the index of its bitmap in
     nonspacing_table_data, or -1 if the block has no non-spacing character.
   The block starting at 0xe0000 is deliberately left out of the tables.
   Errors are reported on stderr and terminate the program.  */
static void
output_nonspacing_property (const char *filename)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int i_max;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* First pass: assign a data index to every non-trivial block.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Second pass: emit the bitmaps of the non-trivial blocks.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  i_max = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No trailing comma after the very last byte.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* Number of index entries needed to reach this block.  */
          i_max = i + 1;
        }
    }
  fprintf (stream, "};\n");

  /* Emit the index table, padded to a multiple of 8 entries so that it can
     be printed 8 per line.  */
  i_max = ((i_max + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           i_max);
  {
    unsigned int j;

    for (j = 0; j < i_max / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            fprintf (stream, " %2d%c", ind[i],
                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
5388 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
5390 symbolic_width (unsigned int ch)
5392 /* Test for unassigned character. */
5393 if (is_property_unassigned_code_value (ch))
5395 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
5396 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
5398 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
5399 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
5400 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
5401 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
5402 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
5408 /* Test for non-spacing or control character. */
5409 if (is_category_Cc (ch) && ch < 0x00A0)
5411 if (is_nonspacing (ch))
5413 /* Test for double-width character. */
5414 if (unicode_width[ch] != NULL
5415 && (strcmp (unicode_width[ch], "W") == 0
5416 || strcmp (unicode_width[ch], "F") == 0))
5418 /* Test for half-width character. */
5419 if (unicode_width[ch] != NULL
5420 && strcmp (unicode_width[ch], "H") == 0)
5423 /* In ancient CJK encodings, Cyrillic and most other characters are
5424 double-width as well. */
5425 if (ch >= 0x00A1 && ch < 0x10000)
/* Writes to STREAM one line describing the code point interval START..END
   (inclusive), all of whose members have the symbolic width VALUE.  An
   interval of a single code point is written without the ".." range
   syntax.  */
static void
output_width_interval (FILE *stream, unsigned int start, unsigned int end,
                       char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes to FILENAME the symbolic width of all code points, one line per
   maximal run of code points that share the same symbolic_width() value.
   Code points for which symbolic_width() returns 0 produce no output and do
   not interrupt a run.  Errors are reported on stderr and terminate the
   program.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int interval_start, interval_end, ch;
  char interval_value;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  interval_value = 0; /* 0 means: no interval has been started yet */
  interval_start = interval_end = 0; /* avoid GCC warning */
  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);
      if (value != 0) /* skip Cc control characters and unassigned characters */
        {
          if (value == interval_value)
            /* Extend the interval.  */
            interval_end = ch;
          else
            {
              /* Terminate the interval.  */
              if (interval_value != 0)
                output_width_interval (stream, interval_start, interval_end,
                                       interval_value);
              /* Start a new interval.  */
              interval_start = interval_end = ch;
              interval_value = value;
            }
        }
    }
  /* Terminate the last interval.  */
  if (interval_value != 0)
    output_width_interval (stream, interval_start, interval_end,
                           interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
5486 /* ========================================================================= */
/* Line breaking classification.
   Updated for Unicode TR #14 revision 26.  */

/* Each constant is used as a bit position in the attribute mask built by
   get_lbp(), so all values must stay distinct and below 64.  */
enum
{
  /* Values >= 25 are resolved at run time. */
  LBP_BK = 25, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 26, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 27, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 28, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 29, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_CP =  7, /* closing parenthesis */
  LBP_EX =  8, /* exclamation/interrogation */
  LBP_IN =  9, /* inseparable */
  LBP_NS = 10, /* non starter */
  LBP_OP = 11, /* opening punctuation */
  LBP_QU = 12, /* ambiguous quotation */
  LBP_IS = 13, /* infix separator (numeric) */
  LBP_NU = 14, /* numeric */
  LBP_PO = 15, /* postfix (numeric) */
  LBP_PR = 16, /* prefix (numeric) */
  LBP_SY = 17, /* symbols allowing breaks */
  LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 18, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 19, /* Hangul LV syllable */
  LBP_H3 = 20, /* Hangul LVT syllable */
  LBP_ID = 21, /* ideographic */
  LBP_JL = 22, /* Hangul L Jamo */
  LBP_JV = 23, /* Hangul V Jamo */
  LBP_JT = 24, /* Hangul T Jamo */
  LBP_SA = 31, /* complex context (South East Asian) */
  LBP_XX = 32  /* unknown */
};
5533 /* Returns the line breaking classification for ch, as a bit mask. */
5535 get_lbp (unsigned int ch)
5539 if (unicode_attributes[ch].name != NULL)
5541 /* mandatory break */
5542 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5543 || ch == 0x000C /* form feed */
5544 || ch == 0x000B /* line tabulation */
5545 || ch == 0x2028 /* LINE SEPARATOR */
5546 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5547 attr |= (int64_t) 1 << LBP_BK;
5549 if (ch == 0x2060 /* WORD JOINER */
5550 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5551 attr |= (int64_t) 1 << LBP_WJ;
5553 /* zero width space */
5554 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5555 attr |= (int64_t) 1 << LBP_ZW;
5557 /* non-breaking (glue) */
5558 if (ch == 0x00A0 /* NO-BREAK SPACE */
5559 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5560 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5561 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5562 || ch == 0x2007 /* FIGURE SPACE */
5563 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5564 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5565 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5566 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5567 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
5568 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5569 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
5570 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
5571 attr |= (int64_t) 1 << LBP_GL;
5574 if (ch == 0x0020 /* SPACE */)
5575 attr |= (int64_t) 1 << LBP_SP;
5577 /* break opportunity before and after */
5578 if (ch == 0x2014 /* EM DASH */)
5579 attr |= (int64_t) 1 << LBP_B2;
5581 /* break opportunity after */
5582 if (/* Breaking Spaces */
5583 ch == 0x1680 /* OGHAM SPACE MARK */
5584 || ch == 0x2000 /* EN QUAD */
5585 || ch == 0x2001 /* EM QUAD */
5586 || ch == 0x2002 /* EN SPACE */
5587 || ch == 0x2003 /* EM SPACE */
5588 || ch == 0x2004 /* THREE-PER-EM SPACE */
5589 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5590 || ch == 0x2006 /* SIX-PER-EM SPACE */
5591 || ch == 0x2008 /* PUNCTUATION SPACE */
5592 || ch == 0x2009 /* THIN SPACE */
5593 || ch == 0x200A /* HAIR SPACE */
5594 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5596 || ch == 0x0009 /* tab */
5597 /* Conditional Hyphens */
5598 || ch == 0x00AD /* SOFT HYPHEN */
5599 /* Breaking Hyphens */
5600 || ch == 0x058A /* ARMENIAN HYPHEN */
5601 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
5602 || ch == 0x2010 /* HYPHEN */
5603 || ch == 0x2012 /* FIGURE DASH */
5604 || ch == 0x2013 /* EN DASH */
5605 /* Visible Word Dividers */
5606 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5607 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5608 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5609 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5610 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5611 || ch == 0x2027 /* HYPHENATION POINT */
5612 || ch == 0x007C /* VERTICAL LINE */
5613 /* Historic Word Separators */
5614 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5615 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5616 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5617 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5618 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5619 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5620 || ch == 0x205A /* TWO DOT PUNCTUATION */
5621 || ch == 0x205B /* FOUR DOT MARK */
5622 || ch == 0x205D /* TRICOLON */
5623 || ch == 0x205E /* VERTICAL FOUR DOTS */
5624 || ch == 0x2E19 /* PALM BRANCH */
5625 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5626 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5627 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5628 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5629 || ch == 0x2E30 /* RING POINT */
5630 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
5631 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5632 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5633 || ch == 0x10102 /* AEGEAN CHECK MARK */
5634 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5635 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5636 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5637 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5639 || ch == 0x0964 /* DEVANAGARI DANDA */
5640 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5641 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5642 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5643 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5644 || ch == 0x104B /* MYANMAR SIGN SECTION */
5645 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5646 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5647 || ch == 0x17D4 /* KHMER SIGN KHAN */
5648 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5649 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5650 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5651 || ch == 0xA8CE /* SAURASHTRA DANDA */
5652 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5653 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5654 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5655 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5656 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5657 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5659 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5660 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5661 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5662 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5663 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5664 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5665 /* Other Terminating Punctuation */
5666 || ch == 0x1804 /* MONGOLIAN COLON */
5667 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5668 || ch == 0x1B5A /* BALINESE PANTI */
5669 || ch == 0x1B5B /* BALINESE PAMADA */
5670 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5671 || ch == 0x1B60 /* BALINESE PAMENENG */
5672 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5673 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5674 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5675 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5676 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5677 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5678 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5679 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5680 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5681 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5682 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5683 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5684 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5685 || ch == 0xA60D /* VAI COMMA */
5686 || ch == 0xA60F /* VAI QUESTION MARK */
5687 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5688 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5689 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5690 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5691 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5692 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5693 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5694 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5695 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5696 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
5697 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
5698 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
5699 || ch == 0xA6F3 /* BAMUM FULL STOP */
5700 || ch == 0xA6F4 /* BAMUM COLON */
5701 || ch == 0xA6F5 /* BAMUM COMMA */
5702 || ch == 0xA6F6 /* BAMUM SEMICOLON */
5703 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
5704 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
5705 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
5706 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
5707 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
5708 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
5709 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
5710 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
5711 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
5712 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
5713 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
5714 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
5715 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
5716 || ch == 0x11047 /* BRAHMI DANDA */
5717 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
5718 || ch == 0x110BE /* KAITHI SECTION MARK */
5719 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
5720 || ch == 0x110C0 /* KAITHI DANDA */
5721 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
5722 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5723 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5724 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5725 attr |= (int64_t) 1 << LBP_BA;
5727 /* break opportunity before */
5728 if (ch == 0x00B4 /* ACUTE ACCENT */
5729 || ch == 0x1FFD /* GREEK OXIA */
5730 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5731 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5732 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5733 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5734 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5735 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5736 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5737 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5738 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5739 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5740 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5741 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5742 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5743 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5744 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5745 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5746 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5747 attr |= (int64_t) 1 << LBP_BB;
5750 if (ch == 0x002D /* HYPHEN-MINUS */)
5751 attr |= (int64_t) 1 << LBP_HY;
5753 /* contingent break opportunity */
5754 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5755 attr |= (int64_t) 1 << LBP_CB;
5757 /* closing parenthesis */
5758 if (ch == 0x0029 /* RIGHT PARENTHESIS */
5759 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
5760 attr |= (int64_t) 1 << LBP_CP;
5762 /* closing punctuation */
5763 if ((unicode_attributes[ch].category[0] == 'P'
5764 && unicode_attributes[ch].category[1] == 'e'
5765 && !(attr & ((int64_t) 1 << LBP_CP)))
5766 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5767 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5768 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5769 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5770 || ch == 0xFE50 /* SMALL COMMA */
5771 || ch == 0xFE52 /* SMALL FULL STOP */
5772 || ch == 0xFF0C /* FULLWIDTH COMMA */
5773 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5774 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5775 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
5776 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5777 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
5778 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
5779 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
5780 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
5781 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
5782 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
5783 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
5784 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
5785 attr |= (int64_t) 1 << LBP_CL;
5787 /* exclamation/interrogation */
5788 if (ch == 0x0021 /* EXCLAMATION MARK */
5789 || ch == 0x003F /* QUESTION MARK */
5790 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5791 || ch == 0x061B /* ARABIC SEMICOLON */
5792 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5793 || ch == 0x061F /* ARABIC QUESTION MARK */
5794 || ch == 0x06D4 /* ARABIC FULL STOP */
5795 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5796 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5797 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5798 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5799 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5800 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5801 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5802 || ch == 0x1802 /* MONGOLIAN COMMA */
5803 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5804 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5805 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5806 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5807 || ch == 0x1945 /* LIMBU QUESTION MARK */
5808 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5809 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5810 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5811 || ch == 0x2CFE /* COPTIC FULL STOP */
5812 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5813 || ch == 0xA60E /* VAI FULL STOP */
5814 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5815 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5816 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5817 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5818 || ch == 0xFE56 /* SMALL QUESTION MARK */
5819 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5820 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5821 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5822 attr |= (int64_t) 1 << LBP_EX;
5825 if (ch == 0x2024 /* ONE DOT LEADER */
5826 || ch == 0x2025 /* TWO DOT LEADER */
5827 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5828 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5829 attr |= (int64_t) 1 << LBP_IN;
5832 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5833 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5834 || ch == 0x203D /* INTERROBANG */
5835 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5836 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5837 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5838 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5839 || ch == 0x301C /* WAVE DASH */
5840 || ch == 0x303C /* MASU MARK */
5841 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5842 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5843 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5844 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5845 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5846 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5847 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5848 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5849 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5850 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5851 || ch == 0xA015 /* YI SYLLABLE WU */
5852 || ch == 0xFE54 /* SMALL SEMICOLON */
5853 || ch == 0xFE55 /* SMALL COLON */
5854 || ch == 0xFF1A /* FULLWIDTH COLON */
5855 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5856 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5857 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5858 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5859 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5860 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5861 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5862 attr |= (int64_t) 1 << LBP_NS;
5864 /* opening punctuation */
5865 if ((unicode_attributes[ch].category[0] == 'P'
5866 && unicode_attributes[ch].category[1] == 's')
5867 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5868 || ch == 0x00BF /* INVERTED QUESTION MARK */
5869 || ch == 0x2E18 /* INVERTED INTERROBANG */
5870 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5871 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
5872 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
5873 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
5874 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
5875 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
5876 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
5877 attr |= (int64_t) 1 << LBP_OP;
5879 /* ambiguous quotation */
5880 if ((unicode_attributes[ch].category[0] == 'P'
5881 && (unicode_attributes[ch].category[1] == 'f'
5882 || unicode_attributes[ch].category[1] == 'i'))
5883 || ch == 0x0022 /* QUOTATION MARK */
5884 || ch == 0x0027 /* APOSTROPHE */
5885 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5886 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5887 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5888 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5889 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5890 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5891 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5892 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5893 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5894 || ch == 0x2E0B /* RAISED SQUARE */)
5895 attr |= (int64_t) 1 << LBP_QU;
5897 /* infix separator (numeric) */
5898 if (ch == 0x002C /* COMMA */
5899 || ch == 0x002E /* FULL STOP */
5900 || ch == 0x003A /* COLON */
5901 || ch == 0x003B /* SEMICOLON */
5902 || ch == 0x037E /* GREEK QUESTION MARK */
5903 || ch == 0x0589 /* ARMENIAN FULL STOP */
5904 || ch == 0x060C /* ARABIC COMMA */
5905 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5906 || ch == 0x07F8 /* NKO COMMA */
5907 || ch == 0x2044 /* FRACTION SLASH */
5908 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5909 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5910 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5911 attr |= (int64_t) 1 << LBP_IS;
5914 if ((unicode_attributes[ch].category[0] == 'N'
5915 && unicode_attributes[ch].category[1] == 'd'
5916 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5917 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5918 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5919 attr |= (int64_t) 1 << LBP_NU;
5921 /* postfix (numeric) */
5922 if (ch == 0x0025 /* PERCENT SIGN */
5923 || ch == 0x00A2 /* CENT SIGN */
5924 || ch == 0x00B0 /* DEGREE SIGN */
5925 || ch == 0x060B /* AFGHANI SIGN */
5926 || ch == 0x066A /* ARABIC PERCENT SIGN */
5927 || ch == 0x2030 /* PER MILLE SIGN */
5928 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5929 || ch == 0x2032 /* PRIME */
5930 || ch == 0x2033 /* DOUBLE PRIME */
5931 || ch == 0x2034 /* TRIPLE PRIME */
5932 || ch == 0x2035 /* REVERSED PRIME */
5933 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5934 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5935 || ch == 0x20A7 /* PESETA SIGN */
5936 || ch == 0x2103 /* DEGREE CELSIUS */
5937 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5938 || ch == 0xFDFC /* RIAL SIGN */
5939 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5940 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5941 || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */
5942 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5943 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5944 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5945 || ch == 0x09F2 /* BENGALI RUPEE MARK */
5946 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
5947 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
5948 || ch == 0x0D79 /* MALAYALAM DATE MARK */
5949 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
5950 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
5951 attr |= (int64_t) 1 << LBP_PO;
5953 /* prefix (numeric) */
5954 if ((unicode_attributes[ch].category[0] == 'S'
5955 && unicode_attributes[ch].category[1] == 'c')
5956 || ch == 0x002B /* PLUS SIGN */
5957 || ch == 0x005C /* REVERSE SOLIDUS */
5958 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5959 || ch == 0x2116 /* NUMERO SIGN */
5960 || ch == 0x2212 /* MINUS SIGN */
5961 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
5962 if (!(attr & ((int64_t) 1 << LBP_PO)))
5963 attr |= (int64_t) 1 << LBP_PR;
5965 /* symbols allowing breaks */
5966 if (ch == 0x002F /* SOLIDUS */)
5967 attr |= (int64_t) 1 << LBP_SY;
5969 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5970 attr |= (int64_t) 1 << LBP_H2;
5972 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5973 attr |= (int64_t) 1 << LBP_H3;
5975 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
5976 attr |= (int64_t) 1 << LBP_JL;
5978 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
5979 attr |= (int64_t) 1 << LBP_JV;
5981 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
5982 attr |= (int64_t) 1 << LBP_JT;
5984 /* complex context (South East Asian) */
5985 if (((unicode_attributes[ch].category[0] == 'C'
5986 && unicode_attributes[ch].category[1] == 'f')
5987 || (unicode_attributes[ch].category[0] == 'L'
5988 && (unicode_attributes[ch].category[1] == 'm'
5989 || unicode_attributes[ch].category[1] == 'o'))
5990 || (unicode_attributes[ch].category[0] == 'M'
5991 && (unicode_attributes[ch].category[1] == 'c'
5992 || unicode_attributes[ch].category[1] == 'n')
5993 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
5994 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5995 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5996 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5997 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
5998 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5999 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6000 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6001 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6002 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
6003 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6004 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6005 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6006 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6007 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6008 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
6009 attr |= (int64_t) 1 << LBP_SA;
6011 /* attached characters and combining marks */
6012 if ((unicode_attributes[ch].category[0] == 'M'
6013 && (unicode_attributes[ch].category[1] == 'c'
6014 || unicode_attributes[ch].category[1] == 'e'
6015 || unicode_attributes[ch].category[1] == 'n'))
6016 || (unicode_attributes[ch].category[0] == 'C'
6017 && (unicode_attributes[ch].category[1] == 'c'
6018 || unicode_attributes[ch].category[1] == 'f')
6019 && ch != 0x110BD /* KAITHI NUMBER SIGN */))
6020 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6021 attr |= (int64_t) 1 << LBP_CM;
6024 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6025 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6026 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6027 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6028 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6029 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6030 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6031 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6032 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6033 || ch == 0xFE62 /* SMALL PLUS SIGN */
6034 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6035 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6036 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6037 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6038 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
6039 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
6040 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
6041 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
6042 || (ch >= 0x3000 && ch <= 0x33FF
6043 && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
6044 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6045 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
6046 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
6047 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
6048 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
6049 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
6050 || ch == 0xFE45 /* SESAME DOT */
6051 || ch == 0xFE46 /* WHITE SESAME DOT */
6052 || ch == 0xFE49 /* DASHED OVERLINE */
6053 || ch == 0xFE4A /* CENTRELINE OVERLINE */
6054 || ch == 0xFE4B /* WAVY OVERLINE */
6055 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
6056 || ch == 0xFE4D /* DASHED LOW LINE */
6057 || ch == 0xFE4E /* CENTRELINE LOW LINE */
6058 || ch == 0xFE4F /* WAVY LOW LINE */
6059 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
6060 || ch == 0xFE58 /* SMALL EM DASH */
6061 || ch == 0xFE5F /* SMALL NUMBER SIGN */
6062 || ch == 0xFE60 /* SMALL AMPERSAND */
6063 || ch == 0xFE61 /* SMALL ASTERISK */
6064 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
6065 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
6066 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
6067 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
6068 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
6069 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
6070 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
6071 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
6072 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
6073 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
6074 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
6075 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
6076 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
6077 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
6078 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
6079 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
6080 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
6081 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
6082 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
6083 || ch == 0xFF5E /* FULLWIDTH TILDE */
6084 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
6085 || ch == 0xFFE3 /* FULLWIDTH MACRON */
6086 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
6087 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6088 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
6089 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
6090 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
6091 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
6092 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
6093 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
6095 /* ambiguous (ideograph) ? */
6096 if ((unicode_width[ch] != NULL
6097 && unicode_width[ch][0] == 'A'
6099 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6100 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
6101 attr |= (int64_t) 1 << LBP_AI;
6103 attr |= (int64_t) 1 << LBP_ID;
6106 /* ordinary alphabetic and symbol characters */
6107 if ((unicode_attributes[ch].category[0] == 'L'
6108 && (unicode_attributes[ch].category[1] == 'u'
6109 || unicode_attributes[ch].category[1] == 'l'
6110 || unicode_attributes[ch].category[1] == 't'
6111 || unicode_attributes[ch].category[1] == 'm'
6112 || unicode_attributes[ch].category[1] == 'o'))
6113 || (unicode_attributes[ch].category[0] == 'S'
6114 && (unicode_attributes[ch].category[1] == 'm'
6115 || unicode_attributes[ch].category[1] == 'k'
6116 || unicode_attributes[ch].category[1] == 'o'))
6117 || (unicode_attributes[ch].category[0] == 'N'
6118 && (unicode_attributes[ch].category[1] == 'l'
6119 || unicode_attributes[ch].category[1] == 'o'))
6120 || (unicode_attributes[ch].category[0] == 'P'
6121 && (unicode_attributes[ch].category[1] == 'c'
6122 || unicode_attributes[ch].category[1] == 'd'
6123 || unicode_attributes[ch].category[1] == 'o'))
6124 || ch == 0x0600 /* ARABIC NUMBER SIGN */
6125 || ch == 0x0601 /* ARABIC SIGN SANAH */
6126 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
6127 || ch == 0x0603 /* ARABIC SIGN SAFHA */
6128 || ch == 0x06DD /* ARABIC END OF AYAH */
6129 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
6130 || ch == 0x2061 /* FUNCTION APPLICATION */
6131 || ch == 0x2062 /* INVISIBLE TIMES */
6132 || ch == 0x2063 /* INVISIBLE SEPARATOR */
6133 || ch == 0x2064 /* INVISIBLE PLUS */
6134 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6135 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
6136 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
6138 /* ambiguous (alphabetic) ? */
6139 if ((unicode_width[ch] != NULL
6140 && unicode_width[ch][0] == 'A'
6142 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
6143 && ch != 0x2022 /* BULLET */
6144 && ch != 0x203E /* OVERLINE */
6145 && ch != 0x2126 /* OHM SIGN */
6146 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
6147 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
6148 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
6149 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
6150 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
6151 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
6152 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
6153 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
6154 || ch == 0x00A7 /* SECTION SIGN */
6155 || ch == 0x00A8 /* DIAERESIS */
6156 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
6157 || ch == 0x00B2 /* SUPERSCRIPT TWO */
6158 || ch == 0x00B3 /* SUPERSCRIPT THREE */
6159 || ch == 0x00B6 /* PILCROW SIGN */
6160 || ch == 0x00B7 /* MIDDLE DOT */
6161 || ch == 0x00B8 /* CEDILLA */
6162 || ch == 0x00B9 /* SUPERSCRIPT ONE */
6163 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
6164 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
6165 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
6166 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
6167 || ch == 0x00D7 /* MULTIPLICATION SIGN */
6168 || ch == 0x00F7 /* DIVISION SIGN */
6169 || ch == 0x02C7 /* CARON */
6170 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
6171 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
6172 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
6173 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
6174 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
6175 || ch == 0x02D8 /* BREVE */
6176 || ch == 0x02D9 /* DOT ABOVE */
6177 || ch == 0x02DA /* RING ABOVE */
6178 || ch == 0x02DB /* OGONEK */
6179 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
6180 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6181 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
6182 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6183 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
6184 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
6185 || ch == 0x2616 /* WHITE SHOGI PIECE */
6186 || ch == 0x2617 /* BLACK SHOGI PIECE */)
6187 attr |= (int64_t) 1 << LBP_AI;
6189 attr |= (int64_t) 1 << LBP_AL;
6190 attr &= ~((int64_t) 1 << LBP_CM);
6195 /* Unassigned character. */
6196 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
6197 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
6198 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
6199 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
6200 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
6201 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6202 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
6203 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6204 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
6205 attr |= (int64_t) 1 << LBP_ID;
6210 attr |= (int64_t) 1 << LBP_XX;
6215 /* Output the line breaking properties computed by get_lbp() in a human
   readable format.  The loop visits every code point i below 0x110000; for
   a code point whose property mask is not just the single bit LBP_XX it
   writes "0x" plus i in hexadecimal to STREAM, then the name of each LBP_*
   bit that is set in the mask, then a newline.  */
6217 debug_output_lbp (FILE *stream)
6221 for (i = 0; i < 0x110000; i++)
6223 int64_t attr = get_lbp (i);
/* A mask equal to (1 << LBP_XX) carries no other property bit.  */
6224 if (attr != (int64_t) 1 << LBP_XX)
6226 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints " " followed by the stringified enumerator name when the
   corresponding bit is set in attr.  Each test is independent, so several
   names can be printed for one code point.  */
6227 #define PRINT_BIT(attr,bit) \
6228 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
6229 PRINT_BIT(attr,LBP_BK);
6230 PRINT_BIT(attr,LBP_CM);
6231 PRINT_BIT(attr,LBP_WJ);
6232 PRINT_BIT(attr,LBP_ZW);
6233 PRINT_BIT(attr,LBP_GL);
6234 PRINT_BIT(attr,LBP_SP);
6235 PRINT_BIT(attr,LBP_B2);
6236 PRINT_BIT(attr,LBP_BA);
6237 PRINT_BIT(attr,LBP_BB);
6238 PRINT_BIT(attr,LBP_HY);
6239 PRINT_BIT(attr,LBP_CB);
6240 PRINT_BIT(attr,LBP_CL);
6241 PRINT_BIT(attr,LBP_CP);
6242 PRINT_BIT(attr,LBP_EX);
6243 PRINT_BIT(attr,LBP_IN);
6244 PRINT_BIT(attr,LBP_NS);
6245 PRINT_BIT(attr,LBP_OP);
6246 PRINT_BIT(attr,LBP_QU);
6247 PRINT_BIT(attr,LBP_IS);
6248 PRINT_BIT(attr,LBP_NU);
6249 PRINT_BIT(attr,LBP_PO);
6250 PRINT_BIT(attr,LBP_PR);
6251 PRINT_BIT(attr,LBP_SY);
6252 PRINT_BIT(attr,LBP_AI);
6253 PRINT_BIT(attr,LBP_AL);
6254 PRINT_BIT(attr,LBP_H2);
6255 PRINT_BIT(attr,LBP_H3);
6256 PRINT_BIT(attr,LBP_ID);
6257 PRINT_BIT(attr,LBP_JL);
6258 PRINT_BIT(attr,LBP_JV);
6259 PRINT_BIT(attr,LBP_JT);
6260 PRINT_BIT(attr,LBP_SA);
6261 PRINT_BIT(attr,LBP_XX);
6263 fprintf (stream, "\n");
/* Opens FILENAME for writing and writes the debug_output_lbp() dump to it.
   A failure to open the file, or a write/close error, is reported on
   stderr.  NOTE(review): what happens after each message (presumably the
   program exits) is on lines not visible in this listing.  */
6269 debug_output_lbrk_tables (const char *filename)
6273 stream = fopen (filename, "w");
6276 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6280 debug_output_lbp (stream);
/* fclose() also flushes, so its result is part of the write check.  */
6282 if (ferror (stream) || fclose (stream))
6284 fprintf (stderr, "error writing to '%s'\n", filename);
6289 /* The line breaking property from the LineBreak.txt file: one LBP_*
   value (not a bit mask) per code point, indexed by code point.  Filled
   in by fill_org_lbp(), which first sets every entry to LBP_XX.  */
6290 int unicode_org_lbp[0x110000];
6292 /* Stores in unicode_org_lbp[] the line breaking property from the
   LineBreak.txt file named LINEBREAK_FILENAME.  Each data line supplies a
   code point or a "first..last" range in field 0 and a property name in
   field 1.  Open, parse and read errors are reported on stderr.
   NOTE(review): many short lines of this function (declarations, braces,
   the main loop header, most TRY invocations) are not visible in this
   listing; the comments describe only the visible statements.  */
6295 fill_org_lbp (const char *linebreak_filename)
6299 char field0[FIELDLEN];
6300 char field1[FIELDLEN];
6301 char field2[FIELDLEN];
/* Every code point defaults to LBP_XX until the file says otherwise.  */
6304 for (i = 0; i < 0x110000; i++)
6305 unicode_org_lbp[i] = LBP_XX;
6307 stream = fopen (linebreak_filename, "r");
6310 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume the remainder of the current input line.  */
6326 do c = getc (stream); while (c != EOF && c != '\n');
/* Three fields per line, delimited by ';', ' ' and '\n' respectively; n
   accumulates the getfield() results (presumably a count of fields read --
   getfield is defined elsewhere in the file).  */
6330 n = getfield (stream, field0, ';');
6331 n += getfield (stream, field1, ' ');
6332 n += getfield (stream, field2, '\n');
6337 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit + 4 points past the first four characters of the stringified
   enumerator, i.e. past the "LBP_" prefix when bit is an LBP_xx name, so
   field1 is compared against the bare class name.  */
6341 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Classes without an enumerator of their own: LF, CR and NL are stored as
   LBP_BK, and SG as LBP_XX.  */
6377 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
6378 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
6379 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
6380 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
6383 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
6384 field1, linebreak_filename, lineno);
/* Field 0 starts with a hexadecimal code point.  */
6387 i = strtoul (field0, NULL, 16);
6388 if (strstr (field0, "..") != NULL)
6390 /* Deal with a range. */
/* NOTE(review): no check that i and j are below 0x110000 (the size of
   unicode_org_lbp[]) is visible before the stores below, whereas
   fill_org_wbp() validates its range -- confirm the input is trusted.  */
6391 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6393 unicode_org_lbp[i] = value;
6397 /* Single character line. */
6398 unicode_org_lbp[i] = value;
6401 if (ferror (stream) || fclose (stream))
6403 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
6408 /* Output the line breaking properties taken from LineBreak.txt (the
   contents of unicode_org_lbp[]) in a human readable format: the code
   point as "0x" plus hexadecimal digits, the name of its LBP_* value, and
   a newline, written to STREAM.  */
6410 debug_output_org_lbp (FILE *stream)
6414 for (i = 0; i < 0x110000; i++)
6416 int attr = unicode_org_lbp[i];
6419 fprintf (stream, "0x%04X", i);
/* Here attr is a single enumerator value, not a bit mask, so PRINT_BIT
   compares for equality and at most one name is printed.  */
6420 #define PRINT_BIT(attr,bit) \
6421 if (attr == bit) fprintf (stream, " " #bit);
6422 PRINT_BIT(attr,LBP_BK);
6423 PRINT_BIT(attr,LBP_CM);
6424 PRINT_BIT(attr,LBP_WJ);
6425 PRINT_BIT(attr,LBP_ZW);
6426 PRINT_BIT(attr,LBP_GL);
6427 PRINT_BIT(attr,LBP_SP);
6428 PRINT_BIT(attr,LBP_B2);
6429 PRINT_BIT(attr,LBP_BA);
6430 PRINT_BIT(attr,LBP_BB);
6431 PRINT_BIT(attr,LBP_HY);
6432 PRINT_BIT(attr,LBP_CB);
6433 PRINT_BIT(attr,LBP_CL);
6434 PRINT_BIT(attr,LBP_CP);
6435 PRINT_BIT(attr,LBP_EX);
6436 PRINT_BIT(attr,LBP_IN);
6437 PRINT_BIT(attr,LBP_NS);
6438 PRINT_BIT(attr,LBP_OP);
6439 PRINT_BIT(attr,LBP_QU);
6440 PRINT_BIT(attr,LBP_IS);
6441 PRINT_BIT(attr,LBP_NU);
6442 PRINT_BIT(attr,LBP_PO);
6443 PRINT_BIT(attr,LBP_PR);
6444 PRINT_BIT(attr,LBP_SY);
6445 PRINT_BIT(attr,LBP_AI);
6446 PRINT_BIT(attr,LBP_AL);
6447 PRINT_BIT(attr,LBP_H2);
6448 PRINT_BIT(attr,LBP_H3);
6449 PRINT_BIT(attr,LBP_ID);
6450 PRINT_BIT(attr,LBP_JL);
6451 PRINT_BIT(attr,LBP_JV);
6452 PRINT_BIT(attr,LBP_JT);
6453 PRINT_BIT(attr,LBP_SA);
6454 PRINT_BIT(attr,LBP_XX);
6456 fprintf (stream, "\n");
/* Opens FILENAME for writing and writes the debug_output_org_lbp() dump
   (the properties read from LineBreak.txt) to it.  A failure to open the
   file, or a write/close error, is reported on stderr.  */
6462 debug_output_org_lbrk_tables (const char *filename)
6466 stream = fopen (filename, "w");
6469 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6473 debug_output_org_lbp (stream);
/* fclose() also flushes, so its result is part of the write check.  */
6475 if (ferror (stream) || fclose (stream))
6477 fprintf (stderr, "error writing to '%s'\n", filename);
6482 /* Construction of sparse 3-level tables.
   These macros parameterize the table code used by output_lbp(): the
   functions are named lbp_table_init/_add/_finalize, elements are
   unsigned char, and code points that are never added read as LBP_XX.
   Allocation goes through plain malloc/realloc.
   NOTE(review): presumably consumed by a template #include on a line not
   visible in this listing.  */
6483 #define TABLE lbp_table
6484 #define ELEMENT unsigned char
6485 #define DEFAULT LBP_XX
6486 #define xmalloc malloc
6487 #define xrealloc realloc
/* Writes the line breaking property table as C source.  STREAM1 receives
   the lbrkprop_header_N #defines, the lbrkprop_t typedef and the extern
   declaration of unilbrkprop; STREAM2 receives the initializer of
   unilbrkprop (level1, level2 and level3 arrays).  The table maps each
   code point to the index of the single bit set in get_lbp()'s mask.  */
6491 output_lbp (FILE *stream1, FILE *stream2)
6495 unsigned int level1_offset, level2_offset, level3_offset;
6499 lbp_table_init (&t);
6501 for (i = 0; i < 0x110000; i++)
6503 int64_t attr = get_lbp (i);
6505 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit; a nonzero result means more
   than one bit was set.  */
6506 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* LBP_XX is the table default, so such code points are not added.  */
6509 if (attr != (int64_t) 1 << LBP_XX)
6511 unsigned int log2_attr;
/* Convert the single-bit mask to its bit index (the LBP_* value).  */
6512 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6514 lbp_table_add (&t, i, log2_attr);
6518 lbp_table_finalize (&t);
/* Three byte offsets into t.result: past a header of 5 uint32_t words,
   then past level1 (level1_size words), then past level2
   (level2_size << q words).  The assignment targets are on lines not
   visible here; presumably level1_offset, level2_offset, level3_offset.  */
6521 5 * sizeof (uint32_t);
6523 5 * sizeof (uint32_t)
6524 + t.level1_size * sizeof (uint32_t);
6526 5 * sizeof (uint32_t)
6527 + t.level1_size * sizeof (uint32_t)
6528 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first 5 words of t.result are emitted as header #defines.
   NOTE(review): %d is used to print a uint32_t value.  */
6530 for (i = 0; i < 5; i++)
6531 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6532 ((uint32_t *) t.result)[i]);
6533 fprintf (stream1, "\n");
6534 fprintf (stream1, "typedef struct\n");
6535 fprintf (stream1, " {\n");
6536 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6537 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6538 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6539 fprintf (stream1, " }\n");
6540 fprintf (stream1, "lbrkprop_t;\n");
6541 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6543 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6544 fprintf (stream2, "{\n");
6545 fprintf (stream2, " {");
6546 if (t.level1_size > 8)
6547 fprintf (stream2, "\n ");
/* Level 1: eight entries per output line.  An entry is printed either as
   -1 or as its offset relative to the start of level2, in uint32_t units
   (the test choosing between the two is not visible here; presumably
   offset == 0).  */
6548 for (i = 0; i < t.level1_size; i++)
6551 if (i > 0 && (i % 8) == 0)
6552 fprintf (stream2, "\n ");
6553 offset = ((uint32_t *) (t.result + level1_offset))[i];
6555 fprintf (stream2, " %5d", -1);
6557 fprintf (stream2, " %5zu",
6558 (offset - level2_offset) / sizeof (uint32_t));
6559 if (i+1 < t.level1_size)
6560 fprintf (stream2, ",");
6562 if (t.level1_size > 8)
6563 fprintf (stream2, "\n ");
6564 fprintf (stream2, " },\n");
6565 fprintf (stream2, " {");
6566 if (t.level2_size << t.q > 8)
6567 fprintf (stream2, "\n ");
/* Level 2: same layout; offsets are relative to the start of level3, in
   bytes.  */
6568 for (i = 0; i < t.level2_size << t.q; i++)
6571 if (i > 0 && (i % 8) == 0)
6572 fprintf (stream2, "\n ");
6573 offset = ((uint32_t *) (t.result + level2_offset))[i];
6575 fprintf (stream2, " %5d", -1);
6577 fprintf (stream2, " %5zu",
6578 (offset - level3_offset) / sizeof (unsigned char));
6579 if (i+1 < t.level2_size << t.q)
6580 fprintf (stream2, ",");
6582 if (t.level2_size << t.q > 8)
6583 fprintf (stream2, "\n ");
6584 fprintf (stream2, " },\n");
6585 fprintf (stream2, " {");
6586 if (t.level3_size << t.p > 8)
6587 fprintf (stream2, "\n ");
/* Level 3: the property values themselves, printed by enumerator name,
   eight per output line.  */
6588 for (i = 0; i < t.level3_size << t.p; i++)
6590 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6591 const char *value_string;
/* CASE maps an enumerator to its own name as a string.  */
6594 #define CASE(x) case x: value_string = #x; break;
6632 if (i > 0 && (i % 8) == 0)
6633 fprintf (stream2, "\n ");
6634 fprintf (stream2, " %s%s", value_string,
6635 (i+1 < t.level3_size << t.p ? "," : ""));
6637 if (t.level3_size << t.p > 8)
6638 fprintf (stream2, "\n ");
6639 fprintf (stream2, " }\n");
6640 fprintf (stream2, "};\n");
/* Writes the two generated line breaking table files.  FILENAME1 and
   FILENAME2 are opened for writing; each gets the "DO NOT EDIT" banner, a
   line naming the generator and the Unicode version, and the GPL notice.
   output_lbp() then writes its declarations to the first stream and the
   table data to the second.  Open and write/close failures are reported
   on stderr.
   The banner names the generator as gen-uni-tables (it used to say
   "gen-lbrk", which is not this program), matching output_wbrk_tables().  */
6644 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6646 const char *filenames[2];
6650 filenames[0] = filename1;
6651 filenames[1] = filename2;
/* Open both output files before writing anything.  */
6653 for (i = 0; i < 2; i++)
6655 streams[i] = fopen (filenames[i], "w");
6656 if (streams[i] == NULL)
6658 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Both files start with the same banner and license header.  */
6663 for (i = 0; i < 2; i++)
6665 FILE *stream = streams[i];
6667 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6668 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6669 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6671 fprintf (stream, "\n");
6673 /* Put a GPL header on it. The gnulib module is under LGPL (although it
   still carries the GPL header), and it's gnulib-tool which replaces the
   GPL header with an LGPL header. */
6676 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6677 fprintf (stream, "\n");
6678 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6679 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6680 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6681 fprintf (stream, " (at your option) any later version.\n");
6682 fprintf (stream, "\n");
6683 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6684 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6685 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6686 fprintf (stream, " GNU General Public License for more details.\n");
6687 fprintf (stream, "\n");
6688 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6689 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6690 fprintf (stream, "\n");
6693 output_lbp (streams[0], streams[1]);
/* fclose() also flushes, so its result is part of the write check.  */
6695 for (i = 0; i < 2; i++)
6697 if (ferror (streams[i]) || fclose (streams[i]))
6699 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6705 /* ========================================================================= */
6707 /* Word break property.
6708 Updated for Unicode TR #29 revision 17. */
6710 /* Possible values of the Word_Break property. */
6725 WBP_EXTENDNUMLET = 7
6728 /* Returns the word breaking property for ch, as a bit mask: bit
   (1 << WBP_xxx) is set for each Word_Break value whose rule below
   matches ch.
   NOTE(review): several short lines (declarations, braces, the CR/LF
   tests and a few condition fragments) are not visible in this listing;
   the comments describe only the visible conditions.  */
6730 get_wbp (unsigned int ch)
/* Test for characters that have an entry (a name) in unicode_attributes[].  */
6734 if (unicode_attributes[ch].name != NULL)
/* CR and LF; the tests guarding these two assignments are not visible.  */
6737 attr |= 1 << WBP_CR;
6740 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (one more condition line is not
   visible here).  */
6742 if (ch == 0x000B || ch == 0x000C
6744 || ch == 0x2028 || ch == 0x2029)
6745 attr |= 1 << WBP_NEWLINE;
/* Extend: the Grapheme_Extend property, or general category Mc.  */
6747 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6748 || (unicode_attributes[ch].category != NULL
6749 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6750 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200B, U+200C and U+200D.  */
6752 if (unicode_attributes[ch].category != NULL
6753 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6754 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
6755 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana", plus the listed individual code points (the
   last line of this condition is not visible here).  */
6757 if ((unicode_scripts[ch] < numscripts
6758 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6759 || (ch >= 0x3031 && ch <= 0x3035)
6760 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6762 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic (one alternative on a line not visible here), but not
   ideographic, not Katakana, not line break class SA, not script
   "Hiragana", and not Extend.  This test relies on the KATAKANA and EXTEND
   bits computed above, so the statement order matters.  */
6764 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6766 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6767 && (attr & (1 << WBP_KATAKANA)) == 0
6768 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6769 && !(unicode_scripts[ch] < numscripts
6770 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6771 && (attr & (1 << WBP_EXTEND)) == 0)
6772 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter are decided by helper predicates defined
   elsewhere in the file.  */
6774 if (is_WBP_MIDNUMLET (ch))
6775 attr |= 1 << WBP_MIDNUMLET;
6777 if (is_WBP_MIDLETTER (ch))
6778 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or one of the listed code points, excluding
   U+003A, U+FE13 and U+002E (one condition line is not visible here).  */
6780 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6781 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6783 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6784 attr |= 1 << WBP_MIDNUM;
/* Numeric: line break class NU (the rest of the condition is not visible
   here).  */
6786 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6788 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6790 if (unicode_attributes[ch].category != NULL
6791 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6792 attr |= 1 << WBP_EXTENDNUMLET;
/* Fallback value; its guard is not visible here (presumably attr == 0).  */
6797 attr |= 1 << WBP_OTHER;
6802 /* Output the word break property computed by get_wbp() in a human
   readable format.  For each code point i below 0x110000 whose mask is
   not just the WBP_OTHER bit, writes "0x" plus i in hexadecimal to STREAM,
   then the Word_Break name of every bit set in the mask, then a
   newline.  */
6804 debug_output_wbp (FILE *stream)
6808 for (i = 0; i < 0x110000; i++)
6810 int attr = get_wbp (i);
/* A mask equal to (1 << WBP_OTHER) carries no specific property.  */
6811 if (attr != 1 << WBP_OTHER)
6813 fprintf (stream, "0x%04X", i);
/* Independent tests: more than one name may be printed per code point.  */
6814 if (attr & (1 << WBP_CR))
6815 fprintf (stream, " CR");
6816 if (attr & (1 << WBP_LF))
6817 fprintf (stream, " LF");
6818 if (attr & (1 << WBP_NEWLINE))
6819 fprintf (stream, " Newline");
6820 if (attr & (1 << WBP_EXTEND))
6821 fprintf (stream, " Extend");
6822 if (attr & (1 << WBP_FORMAT))
6823 fprintf (stream, " Format");
6824 if (attr & (1 << WBP_KATAKANA))
6825 fprintf (stream, " Katakana");
6826 if (attr & (1 << WBP_ALETTER))
6827 fprintf (stream, " ALetter");
6828 if (attr & (1 << WBP_MIDNUMLET))
6829 fprintf (stream, " MidNumLet");
6830 if (attr & (1 << WBP_MIDLETTER))
6831 fprintf (stream, " MidLetter");
6832 if (attr & (1 << WBP_MIDNUM))
6833 fprintf (stream, " MidNum");
6834 if (attr & (1 << WBP_NUMERIC))
6835 fprintf (stream, " Numeric");
6836 if (attr & (1 << WBP_EXTENDNUMLET))
6837 fprintf (stream, " ExtendNumLet");
6838 fprintf (stream, "\n");
/* Opens FILENAME for writing and writes the debug_output_wbp() dump to it.
   A failure to open the file, or a write/close error, is reported on
   stderr.  */
6844 debug_output_wbrk_tables (const char *filename)
6848 stream = fopen (filename, "w");
6851 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6855 debug_output_wbp (stream);
/* fclose() also flushes, so its result is part of the write check.  */
6857 if (ferror (stream) || fclose (stream))
6859 fprintf (stderr, "error writing to '%s'\n", filename);
6864 /* The word break property from the WordBreakProperty.txt file: one
   WBP_* value (not a bit mask) per code point, indexed by code point.
   Filled in by fill_org_wbp(), which first sets every entry to
   WBP_OTHER.  */
6865 int unicode_org_wbp[0x110000];
6867 /* Stores in unicode_org_wbp[] the word break property from the
   WordBreakProperty.txt file named WORDBREAKPROPERTY_FILENAME.  Data
   lines have the form "XXXX..YYYY ; Name" or "XXXX ; Name" with
   hexadecimal code points.  Open, parse and read errors are reported on
   stderr.
   NOTE(review): short lines (declarations such as buf, braces, loop
   header, branch bodies) are not visible in this listing.  */
6870 fill_org_wbp (const char *wordbreakproperty_filename)
/* Every code point defaults to WBP_OTHER until the file says otherwise.  */
6875 for (i = 0; i < 0x110000; i++)
6876 unicode_org_wbp[i] = WBP_OTHER;
6878 stream = fopen (wordbreakproperty_filename, "r");
6881 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6888 unsigned int i1, i2;
6889 char padding[200+1];
6890 char propname[200+1];
/* Read one line of at most 200 characters into buf.  The trailing "\n" in
   the format is scanf whitespace, so it also consumes any following blank
   lines.  padding and propname are sized to hold anything parsed out of
   such a line (assuming buf holds at most 200 characters plus NUL -- its
   declaration is not visible here).  */
6893 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
6896 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  */
6899 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6901 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6903 fprintf (stderr, "parse error in '%s'\n",
6904 wordbreakproperty_filename);
/* Each PROP invocation expands to "if (...) propvalue = ...; else", so the
   list below forms one if/else chain mapping the property name to its
   WBP_* value.  */
6909 #define PROP(name,value) \
6910 if (strcmp (propname, name) == 0) propvalue = value; else
6913 PROP ("Newline", WBP_NEWLINE)
6914 PROP ("Extend", WBP_EXTEND)
6915 PROP ("Format", WBP_FORMAT)
6916 PROP ("Katakana", WBP_KATAKANA)
6917 PROP ("ALetter", WBP_ALETTER)
6918 PROP ("MidNumLet", WBP_MIDNUMLET)
6919 PROP ("MidLetter", WBP_MIDLETTER)
6920 PROP ("MidNum", WBP_MIDNUM)
6921 PROP ("Numeric", WBP_NUMERIC)
6922 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6925 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6926 wordbreakproperty_filename);
/* Detect a reversed range or one that reaches past the last code point,
   which would take the stores below out of bounds.  */
6929 if (!(i1 <= i2 && i2 < 0x110000))
6932 for (i = i1; i <= i2; i++)
6933 unicode_org_wbp[i] = propvalue;
6936 if (ferror (stream) || fclose (stream))
6938 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6943 /* Output the word break property taken from WordBreakProperty.txt (the
   contents of unicode_org_wbp[]) in a human readable format.  For each
   code point below 0x110000 whose value is not WBP_OTHER, writes "0x"
   plus the code point in hexadecimal to STREAM, then the property name
   (or " ??" if the value matches none of the names tested), then a
   newline.  */
6945 debug_output_org_wbp (FILE *stream)
6949 for (i = 0; i < 0x110000; i++)
6951 int propvalue = unicode_org_wbp[i];
6952 if (propvalue != WBP_OTHER)
6954 fprintf (stream, "0x%04X", i);
/* Each PROP invocation expands to "if (...) fprintf (...); else", so the
   list forms one if/else chain that ends in the " ??" fallback.  */
6955 #define PROP(name,value) \
6956 if (propvalue == value) fprintf (stream, " " name); else
6959 PROP ("Newline", WBP_NEWLINE)
6960 PROP ("Extend", WBP_EXTEND)
6961 PROP ("Format", WBP_FORMAT)
6962 PROP ("Katakana", WBP_KATAKANA)
6963 PROP ("ALetter", WBP_ALETTER)
6964 PROP ("MidNumLet", WBP_MIDNUMLET)
6965 PROP ("MidLetter", WBP_MIDLETTER)
6966 PROP ("MidNum", WBP_MIDNUM)
6967 PROP ("Numeric", WBP_NUMERIC)
6968 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6970 fprintf (stream, " ??");
6971 fprintf (stream, "\n");
/* Opens FILENAME for writing and writes the debug_output_org_wbp() dump
   (the properties read from WordBreakProperty.txt) to it.  A failure to
   open the file, or a write/close error, is reported on stderr.  */
6977 debug_output_org_wbrk_tables (const char *filename)
6981 stream = fopen (filename, "w");
6984 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6988 debug_output_org_wbp (stream);
/* fclose() also flushes, so its result is part of the write check.  */
6990 if (ferror (stream) || fclose (stream))
6992 fprintf (stderr, "error writing to '%s'\n", filename);
6997 /* Construction of sparse 3-level tables.
   These macros parameterize the table code used by output_wbp(): the
   functions are named wbp_table_init/_add/_finalize, elements are
   unsigned char, and code points that are never added read as WBP_OTHER.
   Allocation goes through plain malloc/realloc.
   NOTE(review): presumably consumed by a template #include on a line not
   visible in this listing.  */
6998 #define TABLE wbp_table
6999 #define ELEMENT unsigned char
7000 #define DEFAULT WBP_OTHER
7001 #define xmalloc malloc
7002 #define xrealloc realloc
/* Writes the word break property table as C source to STREAM: the
   wbrkprop_header_N #defines, the wbrkprop_t typedef, and the static
   definition of uniwbrkprop with its level1, level2 and level3 arrays.
   The table maps each code point to the index of the single bit set in
   get_wbp()'s mask.  */
7006 output_wbp (FILE *stream)
7010 unsigned int level1_offset, level2_offset, level3_offset;
7014 wbp_table_init (&t);
7016 for (i = 0; i < 0x110000; i++)
7018 int attr = get_wbp (i);
7020 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit; a nonzero result means more
   than one bit was set.  */
7021 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* WBP_OTHER is the table default, so such code points are not added.  */
7024 if (attr != 1 << WBP_OTHER)
7026 unsigned int log2_attr;
/* Convert the single-bit mask to its bit index (the WBP_* value).  */
7027 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7029 wbp_table_add (&t, i, log2_attr);
7033 wbp_table_finalize (&t);
/* Three byte offsets into t.result: past a header of 5 uint32_t words,
   then past level1 (level1_size words), then past level2
   (level2_size << q words).  The assignment targets are on lines not
   visible here; presumably level1_offset, level2_offset, level3_offset.  */
7036 5 * sizeof (uint32_t);
7038 5 * sizeof (uint32_t)
7039 + t.level1_size * sizeof (uint32_t);
7041 5 * sizeof (uint32_t)
7042 + t.level1_size * sizeof (uint32_t)
7043 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first 5 words of t.result are emitted as header #defines.
   NOTE(review): %d is used to print a uint32_t value.  */
7045 for (i = 0; i < 5; i++)
7046 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
7047 ((uint32_t *) t.result)[i]);
7048 fprintf (stream, "\n");
7049 fprintf (stream, "typedef struct\n");
7050 fprintf (stream, " {\n");
7051 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7052 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
7053 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7054 fprintf (stream, " }\n");
7055 fprintf (stream, "wbrkprop_t;\n");
7056 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
7057 fprintf (stream, "{\n");
7058 fprintf (stream, " {");
7059 if (t.level1_size > 8)
7060 fprintf (stream, "\n ");
/* Level 1: eight entries per output line.  An entry is printed either as
   -1 or as its offset relative to the start of level2, in uint32_t units
   (the test choosing between the two is not visible here; presumably
   offset == 0).  */
7061 for (i = 0; i < t.level1_size; i++)
7064 if (i > 0 && (i % 8) == 0)
7065 fprintf (stream, "\n ");
7066 offset = ((uint32_t *) (t.result + level1_offset))[i];
7068 fprintf (stream, " %5d", -1);
7070 fprintf (stream, " %5zu",
7071 (offset - level2_offset) / sizeof (uint32_t));
7072 if (i+1 < t.level1_size)
7073 fprintf (stream, ",");
7075 if (t.level1_size > 8)
7076 fprintf (stream, "\n ");
7077 fprintf (stream, " },\n");
7078 fprintf (stream, " {");
7079 if (t.level2_size << t.q > 8)
7080 fprintf (stream, "\n ");
/* Level 2: same layout; offsets are relative to the start of level3, in
   bytes.  */
7081 for (i = 0; i < t.level2_size << t.q; i++)
7084 if (i > 0 && (i % 8) == 0)
7085 fprintf (stream, "\n ");
7086 offset = ((uint32_t *) (t.result + level2_offset))[i];
7088 fprintf (stream, " %5d", -1);
7090 fprintf (stream, " %5zu",
7091 (offset - level3_offset) / sizeof (unsigned char));
7092 if (i+1 < t.level2_size << t.q)
7093 fprintf (stream, ",");
7095 if (t.level2_size << t.q > 8)
7096 fprintf (stream, "\n ");
7097 fprintf (stream, " },\n");
7098 fprintf (stream, " {");
7099 if (t.level3_size << t.p > 4)
7100 fprintf (stream, "\n ");
/* Level 3: the property values themselves, printed by enumerator name,
   four per output line (the other two levels use eight).  */
7101 for (i = 0; i < t.level3_size << t.p; i++)
7103 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7104 const char *value_string;
/* CASE maps an enumerator to its own name as a string.  */
7107 #define CASE(x) case x: value_string = #x; break;
7116 CASE(WBP_MIDNUMLET);
7117 CASE(WBP_MIDLETTER);
7120 CASE(WBP_EXTENDNUMLET);
7125 if (i > 0 && (i % 4) == 0)
7126 fprintf (stream, "\n ");
7127 fprintf (stream, " %s%s", value_string,
7128 (i+1 < t.level3_size << t.p ? "," : ""));
7130 if (t.level3_size << t.p > 4)
7131 fprintf (stream, "\n ");
7132 fprintf (stream, " }\n");
7133 fprintf (stream, "};\n");
/* Writes the generated word breaking table file.  FILENAME is opened for
   writing and receives the "DO NOT EDIT" banner, a line naming the
   generator and the Unicode version, the GPL notice, and then the table
   written by output_wbp().  Open and write/close failures are reported on
   stderr.
   The banner describes the contents as word breaking properties; it used
   to say "Line breaking properties", copied from output_lbrk_tables().  */
7137 output_wbrk_tables (const char *filename, const char *version)
7141 stream = fopen (filename, "w");
7144 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7148 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7149 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
7150 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7152 fprintf (stream, "\n");
7154 /* Put a GPL header on it. The gnulib module is under LGPL (although it
   still carries the GPL header), and it's gnulib-tool which replaces the
   GPL header with an LGPL header. */
7157 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
7158 fprintf (stream, "\n");
7159 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7160 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7161 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7162 fprintf (stream, " (at your option) any later version.\n");
7163 fprintf (stream, "\n");
7164 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7165 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7166 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7167 fprintf (stream, " GNU General Public License for more details.\n");
7168 fprintf (stream, "\n");
7169 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7170 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7171 fprintf (stream, "\n");
7173 output_wbp (stream);
/* fclose() also flushes, so its result is part of the write check.  */
7175 if (ferror (stream) || fclose (stream))
7177 fprintf (stderr, "error writing to '%s'\n", filename);
7182 /* ========================================================================= */
7184 /* Grapheme break property.
7185 Updated for Unicode TR #29 revision 17. */
7187 /* Possible values of the Grapheme_Cluster_Break property. */
7196 GBP_SPACINGMARK = 6,
7204 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table template.  NOTE(review): the template
   inclusion itself is not visible in this listing; presumably it derives the
   gbp_table_init / gbp_table_add / gbp_table_finalize functions used by
   output_gbp_table from TABLE, stores ELEMENT values, and treats DEFAULT as
   the value of entries never added -- confirm against the template.
   xmalloc and xrealloc are mapped to plain malloc and realloc, so these
   names add no out-of-memory checking. */
7205 #define TABLE gbp_table
7206 #define ELEMENT unsigned char
7207 #define DEFAULT GBP_OTHER
7208 #define xmalloc malloc
7209 #define xrealloc realloc
7212 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* Indexed by code point (0 .. 0x10FFFF); fill_org_gbp stores GBP_* values
   here, and output_gbp_test / output_gbp_table read them. */
7213 int unicode_org_gbp[0x110000];
7215 /* Output the unit test data for the grapheme break property. */
/* Writes to FILENAME a "generated automatically" banner with a GPL notice,
   followed by a comma-separated list of "{ 0x%04X, NAME }" entries derived
   from unicode_org_gbp[].  NAME is the spelled-out GBP_* enumerator,
   obtained by stringizing in the CASE macro.  The code point printed in an
   entry is ch + 1.
   NOTE(review): the body of the run-skipping while loop is not visible in
   this listing; presumably it advances ch, so that each entry describes one
   run of equal values -- confirm.
   Open and write failures are reported on stderr. */
7217 output_gbp_test (const char *filename)
7223 stream = fopen (filename, "w");
7226 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7230 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7231 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
7232 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
7233 fprintf (stream, "\n");
7234 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7235 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7236 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7237 fprintf (stream, " (at your option) any later version.\n");
7238 fprintf (stream, "\n");
7239 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7240 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7241 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7242 fprintf (stream, " GNU General Public License for more details.\n");
7243 fprintf (stream, "\n");
7244 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7245 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7246 fprintf (stream, "\n");
7249 for (ch = 0; ch < 0x110000; ch++)
7251 int gbp = unicode_org_gbp[ch];
7252 const char *gbp_string;
/* Look ahead while the following code point has the same property value. */
7254 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
/* Translate the numeric property value into the enumerator's source name. */
7259 #define CASE(x) case x: gbp_string = #x; break;
7266 CASE (GBP_SPACINGMARK)
/* Separator between entries, then the entry itself; note that the code
   point written is ch + 1, not ch. */
7278 fprintf (stream, ",\n");
7279 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
7283 fprintf (stream, "\n");
/* A stream error or a failing fclose both count as a write failure. */
7285 if (ferror (stream) || fclose (stream))
7287 fprintf (stderr, "error writing to '%s'\n", filename);
7292 /* Output the per-character grapheme break property table. */
/* Builds a gbp_table holding unicode_org_gbp[ch] for every code point below
   0x110000, finalizes it, and prints it to FILENAME as C source: five
   "#define gbrkprop_header_N" lines taken from the first five uint32_t words
   of t.result, then a static const struct named "unigbrkprop" with the
   arrays level1, level2 and level3.
   Level-3 values are written two per byte (the even-indexed value in the low
   nibble, the odd-indexed one in the high nibble), which is why the level3
   array size and the level-2 indices into it are divided by 2.
   VERSION is presumably interpolated into the banner's %s (the argument line
   is not visible in this listing).
   Open and write failures are reported on stderr. */
7294 output_gbp_table (const char *filename, const char *version)
7299 unsigned int level1_offset, level2_offset, level3_offset;
7301 stream = fopen (filename, "w");
7304 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7308 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7309 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
7310 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7315 gbp_table_init (&t);
7317 for (ch = 0; ch < 0x110000; ch++)
7318 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
7320 gbp_table_finalize (&t);
7322 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets past a 5-word header, past
   the level-1 array, and past the level-2 array respectively; the
   assignment targets are not visible in this listing (presumably
   level1_offset, level2_offset and level3_offset, in that order). */
7324 5 * sizeof (uint32_t);
7326 5 * sizeof (uint32_t)
7327 + t.level1_size * sizeof (uint32_t);
7329 5 * sizeof (uint32_t)
7330 + t.level1_size * sizeof (uint32_t)
7331 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words of the finalized table, exported as preprocessor constants. */
7333 for (i = 0; i < 5; i++)
7334 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
7335 ((uint32_t *) t.result)[i]);
7336 fprintf (stream, "static const\n");
7337 fprintf (stream, "struct\n");
7338 fprintf (stream, " {\n");
7339 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7340 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7341 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
7342 t.level3_size, t.p);
7343 fprintf (stream, " }\n");
7344 fprintf (stream, "unigbrkprop =\n");
7345 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry is either -1 or the
   index of a level-2 block, i.e. the stored byte offset rebased to the start
   of level 2 and divided by the entry size. */
7346 fprintf (stream, " {");
7347 if (t.level1_size > 8)
7348 fprintf (stream, "\n ");
7349 for (i = 0; i < t.level1_size; i++)
7352 if (i > 0 && (i % 8) == 0)
7353 fprintf (stream, "\n ");
7354 offset = ((uint32_t *) (t.result + level1_offset))[i];
7356 fprintf (stream, " %5d", -1);
7358 fprintf (stream, " %5zu",
7359 (offset - level2_offset) / sizeof (uint32_t));
7360 if (i+1 < t.level1_size)
7361 fprintf (stream, ",");
7363 if (t.level1_size > 8)
7364 fprintf (stream, "\n ");
7365 fprintf (stream, " },\n");
/* Level 2: same layout; indices into level 3 are halved because two level-3
   values share one output byte. */
7366 fprintf (stream, " {");
7367 if (t.level2_size << t.q > 8)
7368 fprintf (stream, "\n ");
7369 for (i = 0; i < t.level2_size << t.q; i++)
7372 if (i > 0 && (i % 8) == 0)
7373 fprintf (stream, "\n ");
7374 offset = ((uint32_t *) (t.result + level2_offset))[i];
7376 fprintf (stream, " %5d", -1);
7378 fprintf (stream, " %5zu",
7379 (offset - level3_offset) / sizeof (uint8_t) / 2);
7380 if (i+1 < t.level2_size << t.q)
7381 fprintf (stream, ",");
7383 if (t.level2_size << t.q > 8)
7384 fprintf (stream, "\n ");
7385 fprintf (stream, " },\n");
/* Level 3: pack two consecutive property values into one byte. */
7386 fprintf (stream, " {");
7387 if (t.level3_size << t.p > 8)
7388 fprintf (stream, "\n ");
7389 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
7391 unsigned char *p = (unsigned char *) (t.result + level3_offset);
7392 unsigned char value0 = p[i * 2];
7393 unsigned char value1 = p[i * 2 + 1];
7394 if (i > 0 && (i % 8) == 0)
7395 fprintf (stream, "\n ");
7396 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
7397 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
7399 if (t.level3_size << t.p > 8)
7400 fprintf (stream, "\n ");
7401 fprintf (stream, " }\n");
7402 fprintf (stream, "};\n");
7404 if (ferror (stream) || fclose (stream))
7406 fprintf (stderr, "error writing to '%s'\n", filename);
7411 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
7412 GraphemeBreakProperty.txt file. */
/* Every entry is first reset to GBP_OTHER.  The file is then read one line
   at a time (at most 200 characters per line); empty lines and lines
   starting with '#' are recognized separately.  A data line has the form
   "XXXX..YYYY ; Name" or "XXXX ; Name" with hexadecimal code points; Name is
   compared against the known property value names and the matching GBP_*
   value is stored for every code point from i1 to i2.
   NOTE(review): how i2 is set for the single code point form is not visible
   in this listing -- presumably i2 = i1.
   Open, parse and read failures are reported on stderr. */
7414 fill_org_gbp (const char *graphemebreakproperty_filename)
7420 for (i = 0; i < 0x110000; i++)
7421 unicode_org_gbp[i] = GBP_OTHER;
7423 stream = fopen (graphemebreakproperty_filename, "r");
7426 fprintf (stderr, "error during fopen of '%s'\n",
7427 graphemebreakproperty_filename);
7434 unsigned int i1, i2;
7435 char padding[200+1];
7436 char propname[200+1];
/* Read one line, without its newline, into buf. */
7440 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7443 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then fall back to the single code point form. */
7446 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7448 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7450 fprintf (stderr, "parse error in '%s'\n",
7451 graphemebreakproperty_filename);
/* Each PROP expands to one link of an if / else-if chain over propname. */
7456 #define PROP(name,value) \
7457 if (strcmp (propname, name) == 0) propvalue = value; else
7460 PROP ("Control", GBP_CONTROL)
7461 PROP ("Extend", GBP_EXTEND)
7462 PROP ("Prepend", GBP_PREPEND)
7463 PROP ("SpacingMark", GBP_SPACINGMARK)
7468 PROP ("LVT", GBP_LVT)
7471 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
7472 graphemebreakproperty_filename, lineno);
/* The range must be ordered and lie within the Unicode code space. */
7475 if (!(i1 <= i2 && i2 < 0x110000))
7478 for (i = i1; i <= i2; i++)
7479 unicode_org_gbp[i] = propvalue;
7481 if (ferror (stream) || fclose (stream))
7483 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
7488 /* ========================================================================= */
7490 /* Composition and decomposition.
7491 Updated for Unicode TR #15 revision 33. */
7493 /* Maximum number of characters into which a single Unicode character can be decomposed. */
/* MAX_DECOMP_LENGTH bounds the decomposed[] arrays filled by
   get_decomposition.  The enumerators that follow are the decomposition
   types that get_decomposition returns: UC_DECOMP_CANONICAL for an untagged
   decomposition, and one value for each "<tag>" prefix it recognizes. */
7495 #define MAX_DECOMP_LENGTH 18
7499 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
7500 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
7501 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
7502 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
7503 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
7504 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
7505 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
7506 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
7507 UC_DECOMP_SUPER, /* <super> A superscript form. */
7508 UC_DECOMP_SUB, /* <sub> A subscript form. */
7509 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
7510 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
7511 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
7512 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
7513 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
7514 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
7515 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7518 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7519 decompositions). Return the type, or -1 for none. */
7521 get_decomposition (unsigned int ch,
7522 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7524 const char *decomposition = unicode_attributes[ch].decomposition;
7526 if (decomposition != NULL && decomposition[0] != '\0')
7528 int type = UC_DECOMP_CANONICAL;
7529 unsigned int length;
7532 if (decomposition[0] == '<')
7537 rangle = strchr (decomposition + 1, '>');
7540 typelen = rangle + 1 - decomposition;
7541 #define TYPE(t1,t2) \
7542 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7545 TYPE ("<font>", UC_DECOMP_FONT)
7546 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7547 TYPE ("<initial>", UC_DECOMP_INITIAL)
7548 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7549 TYPE ("<final>", UC_DECOMP_FINAL)
7550 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7551 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7552 TYPE ("<super>", UC_DECOMP_SUPER)
7553 TYPE ("<sub>", UC_DECOMP_SUB)
7554 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7555 TYPE ("<wide>", UC_DECOMP_WIDE)
7556 TYPE ("<narrow>", UC_DECOMP_NARROW)
7557 TYPE ("<small>", UC_DECOMP_SMALL)
7558 TYPE ("<square>", UC_DECOMP_SQUARE)
7559 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7560 TYPE ("<compat>", UC_DECOMP_COMPAT)
7562 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
7566 decomposition = rangle + 1;
7567 if (decomposition[0] == ' ')
7570 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7572 decomposed[length] = strtoul (decomposition, &endptr, 16);
7573 if (endptr == decomposition)
7575 decomposition = endptr;
7576 if (decomposition[0] == ' ')
7579 if (*decomposition != '\0')
7580 /* MAX_DECOMP_LENGTH is too small. */
7590 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table template: the generated type is
   struct decomp_table with uint16_t elements, and (uint16_t)(-1) is the
   DEFAULT value (presumably meaning "no entry" -- confirm against the
   template).  xmalloc and xrealloc are mapped to plain malloc and realloc. */
7591 #define TABLE decomp_table
7592 #define ELEMENT uint16_t
7593 #define DEFAULT (uint16_t)(-1)
7594 #define xmalloc malloc
7595 #define xrealloc realloc
/* Writes the decomposition data as C source.  STREAM1 receives the
   declarations (extern declarations, the decomp_header_N defines and the
   decomp_index_table_t typedef); STREAM2 receives the definitions of
   gl_uninorm_decomp_chars_table and gl_uninorm_decomp_index_table.

   For each code point, get_decomposition is consulted.  An index entry is
   "offset" in the low 15 bits, with bit 15 set when the type is not
   UC_DECOMP_CANONICAL.  Each decomposed character becomes 3 bytes in the
   chars table: bit 23 is set when another element follows, the type is
   stored at bit 18 upwards in the first element only, and the low 18 bits
   are the code point.
   NOTE(review): where "offset" is initialized and advanced is not visible
   in this listing. */
7599 output_decomposition (FILE *stream1, FILE *stream2)
7601 struct decomp_table t;
7602 unsigned int level1_offset, level2_offset, level3_offset;
7603 unsigned int offset;
7609 decomp_table_init (&t);
7611 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7612 fprintf (stream1, "\n");
7613 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
7616 for (ch = 0; ch < 0x110000; ch++)
7618 unsigned int length;
7619 unsigned int decomposed[MAX_DECOMP_LENGTH];
7620 int type = get_decomposition (ch, &length, decomposed);
/* The offset has to fit into the 15 bits reserved for it in an index entry. */
7624 if (!(offset < (1 << 15)))
7626 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7628 /* Produce length 3-bytes entries. */
7630 /* We would need a special representation of zero-length entries. */
7632 for (i = 0; i < length; i++)
7635 fprintf (stream2, ",");
/* Start a new output line every four entries. */
7636 if ((offset % 4) == 0)
7637 fprintf (stream2, "\n ");
/* The code point has to fit into the 18 bits reserved for it. */
7638 if (!(decomposed[i] < (1 << 18)))
7640 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7641 (((i+1 < length ? (1 << 23) : 0)
7642 | (i == 0 ? (type << 18) : 0)
7643 | decomposed[i]) >> 16) & 0xff,
7644 (decomposed[i] >> 8) & 0xff,
7645 decomposed[i] & 0xff);
7651 fprintf (stream2, "\n};\n");
7652 fprintf (stream2, "\n");
7654 decomp_table_finalize (&t);
/* Byte offsets in t.result past the 5-word header, past level 1, and past
   level 2; the assignment targets are not visible in this listing
   (presumably level1_offset, level2_offset and level3_offset). */
7657 5 * sizeof (uint32_t);
7659 5 * sizeof (uint32_t)
7660 + t.level1_size * sizeof (uint32_t);
7662 5 * sizeof (uint32_t)
7663 + t.level1_size * sizeof (uint32_t)
7664 + (t.level2_size << t.q) * sizeof (uint32_t);
7666 for (i = 0; i < 5; i++)
7667 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7668 ((uint32_t *) t.result)[i]);
7669 fprintf (stream1, "\n");
7670 fprintf (stream1, "typedef struct\n");
7671 fprintf (stream1, " {\n");
7672 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7673 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7674 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7675 fprintf (stream1, " }\n");
7676 fprintf (stream1, "decomp_index_table_t;\n");
7677 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
7678 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7679 fprintf (stream2, "{\n");
/* Level 1: eight entries per line, each either -1 or a level-2 index. */
7680 fprintf (stream2, " {");
7681 if (t.level1_size > 8)
7682 fprintf (stream2, "\n ");
7683 for (i = 0; i < t.level1_size; i++)
7686 if (i > 0 && (i % 8) == 0)
7687 fprintf (stream2, "\n ");
7688 offset = ((uint32_t *) (t.result + level1_offset))[i];
7690 fprintf (stream2, " %5d", -1);
7692 fprintf (stream2, " %5zu",
7693 (offset - level2_offset) / sizeof (uint32_t));
7694 if (i+1 < t.level1_size)
7695 fprintf (stream2, ",");
7697 if (t.level1_size > 8)
7698 fprintf (stream2, "\n ");
7699 fprintf (stream2, " },\n");
/* Level 2: each entry is either -1 or a level-3 index. */
7700 fprintf (stream2, " {");
7701 if (t.level2_size << t.q > 8)
7702 fprintf (stream2, "\n ");
7703 for (i = 0; i < t.level2_size << t.q; i++)
7706 if (i > 0 && (i % 8) == 0)
7707 fprintf (stream2, "\n ");
7708 offset = ((uint32_t *) (t.result + level2_offset))[i];
7710 fprintf (stream2, " %5d", -1);
7712 fprintf (stream2, " %5zu",
7713 (offset - level3_offset) / sizeof (uint16_t));
7714 if (i+1 < t.level2_size << t.q)
7715 fprintf (stream2, ",");
7717 if (t.level2_size << t.q > 8)
7718 fprintf (stream2, "\n ");
7719 fprintf (stream2, " },\n");
/* Level 3: the stored entries; the DEFAULT value is printed as -1. */
7720 fprintf (stream2, " {");
7721 if (t.level3_size << t.p > 8)
7722 fprintf (stream2, "\n ");
7723 for (i = 0; i < t.level3_size << t.p; i++)
7725 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7726 if (i > 0 && (i % 8) == 0)
7727 fprintf (stream2, "\n ");
7728 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7729 if (i+1 < t.level3_size << t.p)
7730 fprintf (stream2, ",");
7732 if (t.level3_size << t.p > 8)
7733 fprintf (stream2, "\n ");
7734 fprintf (stream2, " }\n");
7735 fprintf (stream2, "};\n");
/* Creates the two decomposition table files.  Both FILENAME1 and FILENAME2
   are opened for writing and given the same "generated automatically"
   banner; output_decomposition then writes its first stream's content to
   FILENAME1 and its second stream's content to FILENAME2.  Finally both
   streams are checked for errors and closed.
   VERSION is presumably interpolated into the banner's %s (the argument
   line is not visible in this listing).
   Open and write failures are reported on stderr, naming the file. */
7739 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
7741 const char *filenames[2];
7745 filenames[0] = filename1;
7746 filenames[1] = filename2;
7748 for (i = 0; i < 2; i++)
7750 streams[i] = fopen (filenames[i], "w");
7751 if (streams[i] == NULL)
7753 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
7758 for (i = 0; i < 2; i++)
7760 FILE *stream = streams[i];
7762 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7763 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
7764 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7766 fprintf (stream, "\n");
7769 output_decomposition (streams[0], streams[1]);
7771 for (i = 0; i < 2; i++)
7773 if (ferror (streams[i]) || fclose (streams[i]))
7775 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7781 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
/* Indexed by code point; 1 if the code point is listed in the file, else 0. */
7782 char unicode_composition_exclusions[0x110000];
/* Fills unicode_composition_exclusions[] from COMPOSITIONEXCLUSIONS_FILENAME.
   All entries are cleared first.  The file is then read one line at a time
   (at most 200 characters per line); empty lines and lines starting with '#'
   are recognized separately.  A data line must start with a hexadecimal code
   point below 0x110000, and that code point's entry is set to 1.
   Open, parse and read failures are reported on stderr. */
7785 fill_composition_exclusions (const char *compositionexclusions_filename)
7790 stream = fopen (compositionexclusions_filename, "r");
7793 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
7797 for (i = 0; i < 0x110000; i++)
7798 unicode_composition_exclusions[i] = 0;
7805 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7808 if (buf[0] == '\0' || buf[0] == '#')
7811 if (sscanf (buf, "%X", &i) != 1)
7813 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* The code point must lie within the Unicode code space. */
7816 if (!(i < 0x110000))
7819 unicode_composition_exclusions[i] = 1;
7822 if (ferror (stream) || fclose (stream))
7824 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
/* Writes to FILENAME a tab-separated listing of composition candidates, one
   per line, in the format "0x%04X\t0x%04X\t0x%04X\t%s\n"; the last field is
   the canonical combining class of code2.  A code point is considered when
   its decomposition is canonical, its first part (code1) has combining
   class "0", and it is not listed in unicode_composition_exclusions[].
   NOTE(review): the length condition and the first three fprintf arguments
   are not visible in this listing; the existing comments indicate that only
   two-element decompositions are considered.
   Open and write failures are reported on stderr. */
7830 debug_output_composition_tables (const char *filename)
7835 stream = fopen (filename, "w");
7838 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7842 for (ch = 0; ch < 0x110000; ch++)
7844 unsigned int length;
7845 unsigned int decomposed[MAX_DECOMP_LENGTH];
7846 int type = get_decomposition (ch, &length, decomposed);
7848 if (type == UC_DECOMP_CANONICAL
7849 /* Consider only binary decompositions.
7850 Exclude singleton decompositions. */
7853 unsigned int code1 = decomposed[0];
7854 unsigned int code2 = decomposed[1];
7855 unsigned int combined = ch;
7857 /* Exclude decompositions where the first part is not a starter,
7858 i.e. is not of canonical combining class 0. */
7859 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7860 /* Exclude characters listed in CompositionExclusions.txt. */
7861 && !unicode_composition_exclusions[combined])
7863 /* The combined character must now also be a starter.
7865 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7868 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7872 unicode_attributes[code2].combining);
7877 if (ferror (stream) || fclose (stream))
7879 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes to FILENAME the input for gperf from which the canonical
   composition lookup is generated: a banner and GPL notice, the gperf
   declarations (struct composition_rule with a 6-byte key, the names
   gl_uninorm_compose_hash and gl_uninorm_compose_lookup, and the table
   options), then one keyword line per composition.  A key is code1 followed
   by code2, each written as three bytes, most significant byte first.
   The selection of compositions uses the same visible conditions as
   debug_output_composition_tables: canonical type, code1 of combining class
   "0", and not listed in unicode_composition_exclusions[].
   VERSION is presumably interpolated into the banner's %s (the argument
   line is not visible in this listing).
   Open and write failures are reported on stderr. */
7885 output_composition_tables (const char *filename, const char *version)
7890 stream = fopen (filename, "w");
7893 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7897 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7898 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7899 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7901 fprintf (stream, "\n");
7903 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7904 still carries the GPL header), and it's gnulib-tool which replaces the
7905 GPL header with an LGPL header. */
7906 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7907 fprintf (stream, "\n");
7908 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7909 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7910 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7911 fprintf (stream, " (at your option) any later version.\n");
7912 fprintf (stream, "\n");
7913 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7914 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7915 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7916 fprintf (stream, " GNU General Public License for more details.\n");
7917 fprintf (stream, "\n");
7918 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7919 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7920 fprintf (stream, "\n");
7922 /* The composition table is a set of mappings (code1, code2) -> combined,
7924 367 values for code1 (from 0x003C to 0x30FD),
7925 54 values for code2 (from 0x0300 to 0x309A).
7926 For a fixed code1, there are from 1 to 19 possible values for code2.
7927 For a fixed code2, there are from 1 to 117 possible values for code1.
7928 This is a very sparse matrix.
7930 We want an O(1) hash lookup.
7932 We could implement the hash lookup by mapping (code1, code2) to a linear
7933 combination mul1*code1 + mul2*code2, which is then used as an index into
7934 a 3-level table. But this leads to a table of size 37 KB.
7936 We use gperf to implement the hash lookup, giving it the 928 sets of
7937 6 bytes (code1, code2) as input. gperf generates a hash table of size
7938 1527, which is quite good (60% filled). It requires an auxiliary table
7939 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
7941 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
7942 fprintf (stream, "%%struct-type\n");
7943 fprintf (stream, "%%language=ANSI-C\n");
7944 fprintf (stream, "%%define slot-name codes\n");
7945 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7946 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7947 fprintf (stream, "%%compare-lengths\n");
7948 fprintf (stream, "%%compare-strncmp\n");
7949 fprintf (stream, "%%readonly-tables\n");
7950 fprintf (stream, "%%omit-struct-type\n");
7951 fprintf (stream, "%%%%\n");
7953 for (ch = 0; ch < 0x110000; ch++)
7955 unsigned int length;
7956 unsigned int decomposed[MAX_DECOMP_LENGTH];
7957 int type = get_decomposition (ch, &length, decomposed);
7959 if (type == UC_DECOMP_CANONICAL
7960 /* Consider only binary decompositions.
7961 Exclude singleton decompositions. */
7964 unsigned int code1 = decomposed[0];
7965 unsigned int code2 = decomposed[1];
7966 unsigned int combined = ch;
7968 /* Exclude decompositions where the first part is not a starter,
7969 i.e. is not of canonical combining class 0. */
7970 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7971 /* Exclude characters listed in CompositionExclusions.txt. */
7972 && !unicode_composition_exclusions[combined])
7974 /* The combined character must now also be a starter.
7976 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7979 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7980 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
7981 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
7987 if (ferror (stream) || fclose (stream))
7989 fprintf (stderr, "error writing to '%s'\n", filename);
7994 /* ========================================================================= */
7996 /* Output the test for a simple character mapping table to the given file. */
/* FILENAME is the test source to create.  FUNCTION_NAME is the name of the
   function under test; it is emitted into the generated "#define MAP(c)"
   line.  FUNC is the reference mapping, evaluated here for every code point
   below 0x110000 to produce "{ 0x%04X, 0x%04X }" (code point, mapped value)
   entries.  The entries are wrapped between includes of
   "test-mapping-part1.h" and "test-mapping-part2.h".
   NOTE(review): the condition selecting which code points get an entry is
   not visible in this listing.  VERSION is presumably interpolated into the
   banner's %s.
   Open and write failures are reported on stderr. */
7999 output_simple_mapping_test (const char *filename,
8000 const char *function_name,
8001 unsigned int (*func) (unsigned int),
8002 const char *version)
8008 stream = fopen (filename, "w");
8011 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8015 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8016 fprintf (stream, "/* Test the Unicode character mapping functions.\n");
8017 fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
8018 fprintf (stream, "\n");
8019 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8020 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8021 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8022 fprintf (stream, " (at your option) any later version.\n");
8023 fprintf (stream, "\n");
8024 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8025 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8026 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8027 fprintf (stream, " GNU General Public License for more details.\n");
8028 fprintf (stream, "\n");
8029 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8030 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8031 fprintf (stream, "\n");
8032 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
8034 fprintf (stream, "\n");
8035 fprintf (stream, "#include \"test-mapping-part1.h\"\n");
8036 fprintf (stream, "\n");
8039 for (ch = 0; ch < 0x110000; ch++)
8041 unsigned int value = func (ch);
/* Separator between entries, then the (code point, mapped value) pair. */
8046 fprintf (stream, ",\n");
8047 fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
8052 fprintf (stream, "\n");
8054 fprintf (stream, "\n");
/* Bind the generic MAP macro used by part 2 to the function under test. */
8055 fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
8056 fprintf (stream, "#include \"test-mapping-part2.h\"\n");
8058 if (ferror (stream) || fclose (stream))
8060 fprintf (stderr, "error writing to '%s'\n", filename);
8065 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table template: the generated type is
   struct mapping_table with int32_t elements.  The DEFAULT definition is
   not visible in this listing.  xmalloc and xrealloc are mapped to plain
   malloc and realloc. */
8066 #define TABLE mapping_table
8067 #define ELEMENT int32_t
8069 #define xmalloc malloc
8070 #define xrealloc realloc
8073 /* Output a simple character mapping table to the given file. */
/* FUNC is the mapping to tabulate.  For every code point ch below 0x110000
   the signed difference func (ch) - ch is computed and added to a
   mapping_table, so the table stores deltas rather than target code points.
   The finalized table is printed to FILENAME as C source: five
   "#define mapping_header_N" lines followed by a static const struct named
   "u_mapping" with the arrays level1, level2 and level3.
   NOTE(review): any condition guarding mapping_table_add is not visible in
   this listing.  VERSION is presumably interpolated into the banner's %s.
   Open and write failures are reported on stderr. */
8076 output_simple_mapping (const char *filename,
8077 unsigned int (*func) (unsigned int),
8078 const char *version)
8082 struct mapping_table t;
8083 unsigned int level1_offset, level2_offset, level3_offset;
8085 stream = fopen (filename, "w");
8088 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8092 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8093 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
8094 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
8099 mapping_table_init (&t);
8101 for (ch = 0; ch < 0x110000; ch++)
/* Store the offset from ch to its image, not the image itself. */
8103 int value = (int) func (ch) - (int) ch;
8105 mapping_table_add (&t, ch, value);
8108 mapping_table_finalize (&t);
8110 /* Offsets in t.result, in memory of this process. */
8112 5 * sizeof (uint32_t);
8114 5 * sizeof (uint32_t)
8115 + t.level1_size * sizeof (uint32_t);
8117 5 * sizeof (uint32_t)
8118 + t.level1_size * sizeof (uint32_t)
8119 + (t.level2_size << t.q) * sizeof (uint32_t);
8121 for (i = 0; i < 5; i++)
8122 fprintf (stream, "#define mapping_header_%d %d\n", i,
8123 ((uint32_t *) t.result)[i]);
8124 fprintf (stream, "static const\n");
8125 fprintf (stream, "struct\n");
8126 fprintf (stream, " {\n");
8127 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8128 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8129 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
8130 fprintf (stream, " }\n");
8131 fprintf (stream, "u_mapping =\n");
8132 fprintf (stream, "{\n");
/* Level 1: eight entries per line, each either -1 or a level-2 index. */
8133 fprintf (stream, " {");
8134 if (t.level1_size > 8)
8135 fprintf (stream, "\n ");
8136 for (i = 0; i < t.level1_size; i++)
8139 if (i > 0 && (i % 8) == 0)
8140 fprintf (stream, "\n ");
8141 offset = ((uint32_t *) (t.result + level1_offset))[i];
8143 fprintf (stream, " %5d", -1);
8145 fprintf (stream, " %5zu",
8146 (offset - level2_offset) / sizeof (uint32_t));
8147 if (i+1 < t.level1_size)
8148 fprintf (stream, ",");
8150 if (t.level1_size > 8)
8151 fprintf (stream, "\n ");
8152 fprintf (stream, " },\n");
/* Level 2: each entry is either -1 or a level-3 index. */
8153 fprintf (stream, " {");
8154 if (t.level2_size << t.q > 8)
8155 fprintf (stream, "\n ");
8156 for (i = 0; i < t.level2_size << t.q; i++)
8159 if (i > 0 && (i % 8) == 0)
8160 fprintf (stream, "\n ");
8161 offset = ((uint32_t *) (t.result + level2_offset))[i];
8163 fprintf (stream, " %5d", -1);
8165 fprintf (stream, " %5zu",
8166 (offset - level3_offset) / sizeof (int32_t));
8167 if (i+1 < t.level2_size << t.q)
8168 fprintf (stream, ",");
8170 if (t.level2_size << t.q > 8)
8171 fprintf (stream, "\n ");
8172 fprintf (stream, " },\n");
/* Level 3: the signed deltas themselves. */
8173 fprintf (stream, " {");
8174 if (t.level3_size << t.p > 8)
8175 fprintf (stream, "\n ");
8176 for (i = 0; i < t.level3_size << t.p; i++)
8178 if (i > 0 && (i % 8) == 0)
8179 fprintf (stream, "\n ");
8180 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
8181 if (i+1 < t.level3_size << t.p)
8182 fprintf (stream, ",");
8184 if (t.level3_size << t.p > 8)
8185 fprintf (stream, "\n ");
8186 fprintf (stream, " }\n");
8187 fprintf (stream, "};\n");
8189 if (ferror (stream) || fclose (stream))
8191 fprintf (stderr, "error writing to '%s'\n", filename);
8196 /* ========================================================================= */
8198 /* A special casing context.
8199 A context is negated through x -> -x. */
8204 SCC_AFTER_SOFT_DOTTED,
8210 /* A special casing rule. */
/* fill_casing_rules fills one of these per data line.  Each mapping holds up
   to three code points; fill_casing_rules zero-fills its local copies before
   parsing, so unused trailing slots are 0 for the lower, title and upper
   mappings.  fill_casing_rules also assigns the members code and context,
   whose declarations are not visible in this listing. */
8211 struct special_casing_rule
8214 unsigned int lower_mapping[3];
8215 unsigned int title_mapping[3];
8216 unsigned int upper_mapping[3];
8217 unsigned int casefold_mapping[3];
/* Language word of the rule, as a heap-allocated string when one is present
   (NOTE(review): the value used when absent is not visible here). */
8218 const char *language;
8222 /* The special casing rules. */
/* Growable array of rule pointers: num_casing_rules entries are in use out
   of allocated_casing_rules allocated slots; add_casing_rule appends. */
8223 struct special_casing_rule **casing_rules;
8224 unsigned int num_casing_rules;
8225 unsigned int allocated_casing_rules;
8228 add_casing_rule (struct special_casing_rule *new_rule)
8230 if (num_casing_rules == allocated_casing_rules)
8232 allocated_casing_rules = 2 * allocated_casing_rules;
8233 if (allocated_casing_rules < 16)
8234 allocated_casing_rules = 16;
8236 (struct special_casing_rule **)
8237 realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *));
8239 casing_rules[num_casing_rules++] = new_rule;
8242 /* Stores in casing_rules the special casing rules found in
8243 specialcasing_filename. */
/* The rule array is reset to empty first.  The file is read one line at a
   time (at most 200 characters per line); empty lines and lines starting
   with '#' are recognized separately.  A data line consists of
   ';'-terminated fields: a hexadecimal code point, then the lower, title and
   upper mappings (each up to three space-separated hexadecimal code points),
   then an optional language and context part.  In that part a two-character
   first word is taken as the language, and a context word may carry a "Not_"
   prefix, in which case the context value is negated.  Each parsed line is
   stored in a freshly allocated rule and appended with add_casing_rule.
   NOTE(review): the results of the malloc calls are used without a visible
   check in this listing.
   Open, parse and read failures are reported on stderr. */
8245 fill_casing_rules (const char *specialcasing_filename)
8249 stream = fopen (specialcasing_filename, "r");
8252 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
8256 casing_rules = NULL;
8257 num_casing_rules = 0;
8258 allocated_casing_rules = 0;
8268 unsigned int lower_mapping[3];
8269 unsigned int title_mapping[3];
8270 unsigned int upper_mapping[3];
8274 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8277 if (buf[0] == '\0' || buf[0] == '#')
/* First field: the code point the rule applies to. */
8282 code = strtoul (scanptr, &endptr, 16);
8283 if (endptr == scanptr)
8285 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8289 if (*scanptr != ';')
8291 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8296 /* Scan lower mapping. */
8297 for (i = 0; i < 3; i++)
8298 lower_mapping[i] = 0;
8299 for (i = 0; i < 3; i++)
8301 while (*scanptr == ' ')
8303 if (*scanptr == ';')
8305 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
8306 if (endptr == scanptr)
8308 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8313 if (*scanptr != ';')
8315 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8320 /* Scan title mapping. */
8321 for (i = 0; i < 3; i++)
8322 title_mapping[i] = 0;
8323 for (i = 0; i < 3; i++)
8325 while (*scanptr == ' ')
8327 if (*scanptr == ';')
8329 title_mapping[i] = strtoul (scanptr, &endptr, 16);
8330 if (endptr == scanptr)
8332 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8337 if (*scanptr != ';')
8339 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8344 /* Scan upper mapping. */
8345 for (i = 0; i < 3; i++)
8346 upper_mapping[i] = 0;
8347 for (i = 0; i < 3; i++)
8349 while (*scanptr == ' ')
8351 if (*scanptr == ';')
8353 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
8354 if (endptr == scanptr)
8356 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8361 if (*scanptr != ';')
8363 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8368 /* Scan language and context. */
/* Without a context word the rule applies always. */
8370 context = SCC_ALWAYS;
8371 while (*scanptr == ' ')
8373 if (*scanptr != '\0' && *scanptr != '#')
8375 const char *word_begin = scanptr;
8376 const char *word_end;
8378 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8382 while (*scanptr == ' ')
/* A word of exactly two characters is a language; copy it and look for a
   further word, which would be the context. */
8385 if (word_end - word_begin == 2)
8387 language = (char *) malloc ((word_end - word_begin) + 1);
8388 memcpy (language, word_begin, 2);
8389 language[word_end - word_begin] = '\0';
8390 word_begin = word_end = NULL;
8392 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8394 word_begin = scanptr;
8395 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8401 if (word_end > word_begin)
8403 bool negate = false;
/* A leading "Not_" marks a negated context. */
8405 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
8410 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
8411 context = SCC_FINAL_SIGMA;
8412 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
8413 context = SCC_AFTER_SOFT_DOTTED;
8414 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
8415 context = SCC_MORE_ABOVE;
8416 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
8417 context = SCC_BEFORE_DOT;
8418 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
8419 context = SCC_AFTER_I;
8422 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
8426 context = - context;
/* Only the end of the line, a comment or a field separator may follow. */
8429 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8431 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8436 /* Store the rule. */
8438 struct special_casing_rule *new_rule =
8439 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8440 new_rule->code = code;
8441 new_rule->language = language;
8442 new_rule->context = context;
8443 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
8444 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
8445 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
8447 add_casing_rule (new_rule);
8451 if (ferror (stream) || fclose (stream))
8453 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* One case folding rule, as read from a data line of CaseFolding.txt.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The folded sequence: up to three code points, padded with zeros.  */
  unsigned int mapping[3];
  /* Two-letter language code the rule is restricted to, or NULL when the
     rule applies regardless of language.  */
  const char *language;
};

/* The table of case folding rules: a growable array of pointers to
   individually allocated rules.  */
struct casefold_rule **casefolding_rules;
/* Number of entries of casefolding_rules that are in use.  */
unsigned int num_casefolding_rules;
/* Number of entries casefolding_rules currently has room for.  */
unsigned int allocated_casefolding_rules;
8471 /* Stores in casefolding_rules the case folding rules found in
8472 casefolding_filename. */
8474 fill_casefolding_rules (const char *casefolding_filename)
8478 stream = fopen (casefolding_filename, "r");
8481 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
8485 casefolding_rules = NULL;
8486 num_casefolding_rules = 0;
8487 allocated_casefolding_rules = 0;
8498 unsigned int mapping[3];
8500 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8503 if (buf[0] == '\0' || buf[0] == '#')
8508 code = strtoul (scanptr, &endptr, 16);
8509 if (endptr == scanptr)
8511 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8515 if (*scanptr != ';')
8517 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8523 while (*scanptr == ' ')
8528 case 'C': case 'F': case 'S': case 'T':
8532 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8536 if (*scanptr != ';')
8538 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8543 /* Scan casefold mapping. */
8544 for (i = 0; i < 3; i++)
8546 for (i = 0; i < 3; i++)
8548 while (*scanptr == ' ')
8550 if (*scanptr == ';')
8552 mapping[i] = strtoul (scanptr, &endptr, 16);
8553 if (endptr == scanptr)
8555 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8560 if (*scanptr != ';')
8562 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8567 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
8570 const char * const *languages;
8571 unsigned int languages_count;
8573 /* Type 'T' indicates that the rule is applicable to Turkish
8577 static const char * const turkish_languages[] = { "tr", "az" };
8578 languages = turkish_languages;
8579 languages_count = 2;
8583 static const char * const all_languages[] = { NULL };
8584 languages = all_languages;
8585 languages_count = 1;
8588 for (i = 0; i < languages_count; i++)
8590 /* Store a new rule. */
8591 struct casefold_rule *new_rule =
8592 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
8593 new_rule->code = code;
8594 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
8595 new_rule->language = languages[i];
8597 if (num_casefolding_rules == allocated_casefolding_rules)
8599 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
8600 if (allocated_casefolding_rules < 16)
8601 allocated_casefolding_rules = 16;
8603 (struct casefold_rule **)
8604 realloc (casefolding_rules,
8605 allocated_casefolding_rules * sizeof (struct casefold_rule *));
8607 casefolding_rules[num_casefolding_rules++] = new_rule;
8612 if (ferror (stream) || fclose (stream))
8614 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Returns the single-character case folding recorded for CH in
   unicode_casefold[].
   CH must be below 0x110000; any other value would index past the end of
   the table, so it is treated as a fatal internal error.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < 0x110000))
    abort ();
  return unicode_casefold[ch];
}
8628 /* Redistribute the casefolding_rules:
8629 - Rules that map to a single character, language independently, are stored
8630 in unicode_casefold.
8631 - Other rules are merged into casing_rules. */
8633 redistribute_casefolding_rules (void)
8635 unsigned int ch, i, j;
8637 /* Fill unicode_casefold[]. */
8638 for (ch = 0; ch < 0x110000; ch++)
8639 unicode_casefold[ch] = ch;
8640 for (i = 0; i < num_casefolding_rules; i++)
8642 struct casefold_rule *cfrule = casefolding_rules[i];
8644 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8647 if (!(ch < 0x110000))
8649 unicode_casefold[ch] = cfrule->mapping[0];
8653 /* Extend the special casing rules by filling in their casefold_mapping[]
8655 for (j = 0; j < num_casing_rules; j++)
8657 struct special_casing_rule *rule = casing_rules[j];
8660 rule->casefold_mapping[0] = to_casefold (rule->code);
8661 for (k = 1; k < 3; k++)
8662 rule->casefold_mapping[k] = 0;
8665 /* Now merge the other casefolding rules into casing_rules. */
8666 for (i = 0; i < num_casefolding_rules; i++)
8668 struct casefold_rule *cfrule = casefolding_rules[i];
8670 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8672 /* Find a rule that applies to the same code, same language, and it
8673 has context SCC_ALWAYS. At the same time, update all rules that
8674 have the same code and same or more specific language. */
8675 struct special_casing_rule *found_rule = NULL;
8677 for (j = 0; j < num_casing_rules; j++)
8679 struct special_casing_rule *rule = casing_rules[j];
8681 if (rule->code == cfrule->code
8682 && (cfrule->language == NULL
8683 || (rule->language != NULL
8684 && strcmp (rule->language, cfrule->language) == 0)))
8686 memcpy (rule->casefold_mapping, cfrule->mapping,
8687 sizeof (rule->casefold_mapping));
8689 if ((cfrule->language == NULL
8690 ? rule->language == NULL
8691 : rule->language != NULL
8692 && strcmp (rule->language, cfrule->language) == 0)
8693 && rule->context == SCC_ALWAYS)
8701 if (found_rule == NULL)
8703 /* Create a new rule. */
8704 struct special_casing_rule *new_rule =
8705 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8707 /* Try to find a rule that applies to the same code, no language
8708 restriction, and with context SCC_ALWAYS. */
8709 for (j = 0; j < num_casing_rules; j++)
8711 struct special_casing_rule *rule = casing_rules[j];
8713 if (rule->code == cfrule->code
8714 && rule->context == SCC_ALWAYS
8715 && rule->language == NULL)
8723 new_rule->code = cfrule->code;
8724 new_rule->language = cfrule->language;
8725 new_rule->context = SCC_ALWAYS;
8726 if (found_rule != NULL)
8728 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8729 sizeof (new_rule->lower_mapping));
8730 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8731 sizeof (new_rule->title_mapping));
8732 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8733 sizeof (new_rule->upper_mapping));
8739 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8740 for (k = 1; k < 3; k++)
8741 new_rule->lower_mapping[k] = 0;
8742 new_rule->title_mapping[0] = to_title (cfrule->code);
8743 for (k = 1; k < 3; k++)
8744 new_rule->title_mapping[k] = 0;
8745 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8746 for (k = 1; k < 3; k++)
8747 new_rule->upper_mapping[k] = 0;
8749 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8750 sizeof (new_rule->casefold_mapping));
8752 add_casing_rule (new_rule);
8759 compare_casing_rules (const void *a, const void *b)
8761 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8762 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8763 unsigned int a_code = a_rule->code;
8764 unsigned int b_code = b_rule->code;
8766 if (a_code < b_code)
8768 if (a_code > b_code)
8771 /* Sort the more specific rules before the more general ones. */
8772 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8773 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8777 sort_casing_rules (void)
8779 /* Sort the rules 1. by code, 2. by specificity. */
8780 if (num_casing_rules > 1)
8781 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8782 compare_casing_rules);
8785 /* Output the special casing rules. */
8787 output_casing_rules (const char *filename, const char *version)
8793 stream = fopen (filename, "w");
8796 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8800 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8801 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8802 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8804 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8805 fprintf (stream, "%%struct-type\n");
8806 fprintf (stream, "%%language=ANSI-C\n");
8807 fprintf (stream, "%%define slot-name code\n");
8808 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8809 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8810 fprintf (stream, "%%compare-lengths\n");
8811 fprintf (stream, "%%compare-strncmp\n");
8812 fprintf (stream, "%%readonly-tables\n");
8813 fprintf (stream, "%%omit-struct-type\n");
8814 fprintf (stream, "%%%%\n");
8817 for (i = 0; i < num_casing_rules; i++)
8819 struct special_casing_rule *rule = casing_rules[i];
8822 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8827 if (!(rule->code < 0x10000))
8829 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8833 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8834 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8836 fprintf (stream, "%d, ",
8837 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8839 context = rule->context;
8842 fprintf (stream, "-");
8843 context = - context;
8846 fprintf (stream, " ");
8850 fprintf (stream, "SCC_ALWAYS ");
8852 case SCC_FINAL_SIGMA:
8853 fprintf (stream, "SCC_FINAL_SIGMA ");
8855 case SCC_AFTER_SOFT_DOTTED:
8856 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8858 case SCC_MORE_ABOVE:
8859 fprintf (stream, "SCC_MORE_ABOVE ");
8861 case SCC_BEFORE_DOT:
8862 fprintf (stream, "SCC_BEFORE_DOT ");
8865 fprintf (stream, "SCC_AFTER_I ");
8870 fprintf (stream, ", ");
8872 if (rule->language != NULL)
8874 if (strlen (rule->language) != 2)
8876 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8879 fprintf (stream, "{ '\\0', '\\0' }, ");
8881 fprintf (stream, "{ ");
8882 for (j = 0; j < 3; j++)
8885 fprintf (stream, ", ");
8886 if (!(rule->upper_mapping[j] < 0x10000))
8888 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8891 if (rule->upper_mapping[j] != 0)
8892 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8894 fprintf (stream, " 0");
8896 fprintf (stream, " }, { ");
8897 for (j = 0; j < 3; j++)
8900 fprintf (stream, ", ");
8901 if (!(rule->lower_mapping[j] < 0x10000))
8903 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8906 if (rule->lower_mapping[j] != 0)
8907 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8909 fprintf (stream, " 0");
8911 fprintf (stream, " }, { ");
8912 for (j = 0; j < 3; j++)
8915 fprintf (stream, ", ");
8916 if (!(rule->title_mapping[j] < 0x10000))
8918 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8921 if (rule->title_mapping[j] != 0)
8922 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8924 fprintf (stream, " 0");
8926 fprintf (stream, " }, { ");
8927 for (j = 0; j < 3; j++)
8930 fprintf (stream, ", ");
8931 if (!(rule->casefold_mapping[j] < 0x10000))
8933 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8936 if (rule->casefold_mapping[j] != 0)
8937 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8939 fprintf (stream, " 0");
8941 fprintf (stream, " }\n");
8944 if (ferror (stream) || fclose (stream))
8946 fprintf (stderr, "error writing to '%s'\n", filename);
8951 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.
   Returns true if CH is "cased" in this sense.  */
static bool
is_cased (unsigned int ch)
{
  /* The three criteria are tested in turn; the first hit decides.  */
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
8965 /* Quoting the Unicode standard:
8966 Definition: A character is defined to be "case-ignorable" if it has the
8967 value MidLetter {or the value MidNumLet} for the Word_Break property or
8968 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8969 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8970 The text marked in braces was added in Unicode 5.1.0, see
8971 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8972 Definition of case-ignorable". */
8973 /* Since this predicate is only used for the "Before C" and "After C"
8974 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8975 This simplifies the evaluation of the regular expressions
8976 \p{cased} (\p{case-ignorable})* C
8978 C (\p{case-ignorable})* \p{cased}
8981 is_case_ignorable (unsigned int ch)
8983 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8984 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8985 || is_category_Mn (ch)
8986 || is_category_Me (ch)
8987 || is_category_Cf (ch)
8988 || is_category_Lm (ch)
8989 || is_category_Sk (ch))
8993 /* ------------------------------------------------------------------------- */
8995 /* Output all case related properties. */
8997 output_casing_properties (const char *version)
8999 #define PROPERTY(FN,P) \
9000 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
9001 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
9002 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
9003 PROPERTY(cased, cased)
9004 PROPERTY(ignorable, case_ignorable)
9008 /* ========================================================================= */
9011 main (int argc, char * argv[])
9013 const char *unicodedata_filename;
9014 const char *proplist_filename;
9015 const char *derivedproplist_filename;
9016 const char *scripts_filename;
9017 const char *blocks_filename;
9018 const char *proplist30_filename;
9019 const char *eastasianwidth_filename;
9020 const char *linebreak_filename;
9021 const char *wordbreakproperty_filename;
9022 const char *graphemebreakproperty_filename;
9023 const char *compositionexclusions_filename;
9024 const char *specialcasing_filename;
9025 const char *casefolding_filename;
9026 const char *version;
9030 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
9035 unicodedata_filename = argv[1];
9036 proplist_filename = argv[2];
9037 derivedproplist_filename = argv[3];
9038 scripts_filename = argv[4];
9039 blocks_filename = argv[5];
9040 proplist30_filename = argv[6];
9041 eastasianwidth_filename = argv[7];
9042 linebreak_filename = argv[8];
9043 wordbreakproperty_filename = argv[9];
9044 graphemebreakproperty_filename = argv[10];
9045 compositionexclusions_filename = argv[11];
9046 specialcasing_filename = argv[12];
9047 casefolding_filename = argv[13];
9050 fill_attributes (unicodedata_filename);
9051 clear_properties ();
9052 fill_properties (proplist_filename);
9053 fill_properties (derivedproplist_filename);
9054 fill_properties30 (proplist30_filename);
9055 fill_scripts (scripts_filename);
9056 fill_blocks (blocks_filename);
9057 fill_width (eastasianwidth_filename);
9058 fill_org_lbp (linebreak_filename);
9059 fill_org_wbp (wordbreakproperty_filename);
9060 fill_org_gbp (graphemebreakproperty_filename);
9061 fill_composition_exclusions (compositionexclusions_filename);
9062 fill_casing_rules (specialcasing_filename);
9063 fill_casefolding_rules (casefolding_filename);
9064 redistribute_casefolding_rules ();
9065 sort_casing_rules ();
9067 output_categories (version);
9068 output_category ("unictype/categ_of.h", version);
9069 output_combclass ("unictype/combining.h", version);
9070 output_bidi_category ("unictype/bidi_of.h", version);
9071 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
9072 output_decimal_digit ("unictype/decdigit.h", version);
9073 output_digit_test ("../tests/unictype/test-digit.h", version);
9074 output_digit ("unictype/digit.h", version);
9075 output_numeric_test ("../tests/unictype/test-numeric.h", version);
9076 output_numeric ("unictype/numeric.h", version);
9077 output_mirror ("unictype/mirror.h", version);
9078 output_properties (version);
9079 output_scripts (version);
9080 output_scripts_byname (version);
9081 output_blocks (version);
9082 output_ident_properties (version);
9083 output_nonspacing_property ("uniwidth/width.c.part");
9084 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
9085 output_old_ctype (version);
9087 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
9088 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
9089 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
9091 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
9092 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
9093 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
9095 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
9096 output_gbp_table ("unigbrk/gbrkprop.h", version);
9098 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
9099 debug_output_composition_tables ("uninorm/composition.txt");
9100 output_composition_tables ("uninorm/composition-table.gperf", version);
9102 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
9103 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
9104 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
9105 output_simple_mapping ("unicase/toupper.h", to_upper, version);
9106 output_simple_mapping ("unicase/tolower.h", to_lower, version);
9107 output_simple_mapping ("unicase/totitle.h", to_title, version);
9108 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
9109 output_casing_rules ("unicase/special-casing-table.gperf", version);
9110 output_casing_properties (version);
9116 * For Emacs M-x compile
9118 * compile-command: "
9119 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
9121 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
9122 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
9123 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
9124 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
9125 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
9126 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
9127 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
9128 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
9129 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
9130 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
9131 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
9132 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
9133 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
9135 && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
9136 && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt