1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      <unicode-version>
 */
44 /* ========================================================================= */
46 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   The string members are filled by fill_attribute: an empty column is
   stored as "", and an empty case-mapping column is stored as NONE.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   unsigned integers.  NONE is the all-ones unsigned int value.  */
#define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
/* NOTE(review): the value was not visible in this copy of the file; 120 is
   the value used by upstream gnulib -- confirm.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0.  In the
   latter case (end of file reached before DELIM) BUFFER is left without
   a terminating NUL.  Exits if the field does not fit into BUFFER.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators. Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
269 /* ========================================================================= */
271 /* General category. */
/* See Unicode 3.0 book, section 4.5.  */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the range 0xD800..0xDFFF.  This is decided by
   range alone, because fill_attribute stores no entry for lines whose
   category is "Cs".  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE returns true: "0xFIRST..0xLAST" for a run of several
   code points, "0xCH" for a single one.
   Exits with a message on stderr if the file cannot be opened or written.
   (A disabled variant that printed every code point on its own line has
   been removed; it was marked as yielding huge text output.)  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment consisting of a fixed header, an
   include of "test-predicate-part1.h", one "{ first, last }" entry per
   maximal run of code points for which PREDICATE is true (entries are
   separated by ",\n"), a definition of PREDICATE(c) as EXPRESSION, and an
   include of "test-predicate-part2.h".
   Exits with a message on stderr if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  /* Fixed text at the top of every generated test file.  */
  static const char *const preamble[] =
    {
      "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
      "/* Test the Unicode character type functions.\n",
      " Copyright (C) 2007 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n",
      "#include \"test-predicate-part1.h\"\n",
      "\n"
    };
  FILE *stream;
  bool need_comma;
  unsigned int ch;
  size_t k;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (k = 0; k < sizeof (preamble) / sizeof (preamble[0]); k++)
    fputs (preamble[k], stream);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  /* Terminate the last entry's line, if there was any entry.  */
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
669 /* Construction of sparse 3-level tables. */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
693 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
698 predicate_table_init (&t);
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
799 output_categories (const char *version)
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Each two-letter category has
   exactly one bit; each one-letter category is the union of the bits of
   the two-letter categories that start with the same letter.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
887 general_category_byname (const char *category_name)
889 if (category_name[0] != '\0'
890 && (category_name[1] == '\0' || category_name[2] == '\0'))
891 switch (category_name[0])
894 switch (category_name[1])
896 case '\0': return UC_CATEGORY_MASK_L;
897 case 'u': return UC_CATEGORY_MASK_Lu;
898 case 'l': return UC_CATEGORY_MASK_Ll;
899 case 't': return UC_CATEGORY_MASK_Lt;
900 case 'm': return UC_CATEGORY_MASK_Lm;
901 case 'o': return UC_CATEGORY_MASK_Lo;
905 switch (category_name[1])
907 case '\0': return UC_CATEGORY_MASK_M;
908 case 'n': return UC_CATEGORY_MASK_Mn;
909 case 'c': return UC_CATEGORY_MASK_Mc;
910 case 'e': return UC_CATEGORY_MASK_Me;
914 switch (category_name[1])
916 case '\0': return UC_CATEGORY_MASK_N;
917 case 'd': return UC_CATEGORY_MASK_Nd;
918 case 'l': return UC_CATEGORY_MASK_Nl;
919 case 'o': return UC_CATEGORY_MASK_No;
923 switch (category_name[1])
925 case '\0': return UC_CATEGORY_MASK_P;
926 case 'c': return UC_CATEGORY_MASK_Pc;
927 case 'd': return UC_CATEGORY_MASK_Pd;
928 case 's': return UC_CATEGORY_MASK_Ps;
929 case 'e': return UC_CATEGORY_MASK_Pe;
930 case 'i': return UC_CATEGORY_MASK_Pi;
931 case 'f': return UC_CATEGORY_MASK_Pf;
932 case 'o': return UC_CATEGORY_MASK_Po;
936 switch (category_name[1])
938 case '\0': return UC_CATEGORY_MASK_S;
939 case 'm': return UC_CATEGORY_MASK_Sm;
940 case 'c': return UC_CATEGORY_MASK_Sc;
941 case 'k': return UC_CATEGORY_MASK_Sk;
942 case 'o': return UC_CATEGORY_MASK_So;
946 switch (category_name[1])
948 case '\0': return UC_CATEGORY_MASK_Z;
949 case 's': return UC_CATEGORY_MASK_Zs;
950 case 'l': return UC_CATEGORY_MASK_Zl;
951 case 'p': return UC_CATEGORY_MASK_Zp;
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_C;
958 case 'c': return UC_CATEGORY_MASK_Cc;
959 case 'f': return UC_CATEGORY_MASK_Cf;
960 case 's': return UC_CATEGORY_MASK_Cs;
961 case 'o': return UC_CATEGORY_MASK_Co;
962 case 'n': return UC_CATEGORY_MASK_Cn;
966 /* Invalid category name. */
970 /* Construction of sparse 3-level tables. */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
997 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1002 category_table_init (&t);
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
1017 if (value == 0 || ((value & (value - 1)) != 0))
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1096 for (i = 0; i < t.level3_size << t.p; i++)
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
1129 /* ========================================================================= */
1131 /* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2.  */
1135 /* Construction of sparse 3-level tables. */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1161 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1166 combclass_table_init (&t);
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
1267 /* Bidirectional category. */
/* See Unicode 3.0 book, section 4.3, and UAX #9.  */

/* Bidirectional categories.  The enumerators are numbered consecutively
   from 0 in the order listed here; there are 19 of them, so every value
   fits in 5 bits (output_bidi_category relies on that when packing).  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};
1295 bidi_category_byname (const char *category_name)
1297 switch (category_name[0])
1300 switch (category_name[1])
1303 if (category_name[2] == '\0')
1307 if (category_name[2] == '\0')
1313 switch (category_name[1])
1318 if (category_name[2] == '\0')
1324 switch (category_name[1])
1327 if (category_name[2] == '\0')
1333 switch (category_name[1])
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1344 if (category_name[2] == '\0')
1350 switch (category_name[1])
1355 switch (category_name[2])
1358 if (category_name[3] == '\0')
1362 if (category_name[3] == '\0')
1370 switch (category_name[1])
1373 switch (category_name[2])
1376 if (category_name[3] == '\0')
1384 switch (category_name[1])
1387 if (category_name[2] == '\0')
1393 switch (category_name[1])
1396 switch (category_name[2])
1399 if (category_name[3] == '\0')
1407 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1419 if (category_name[3] == '\0')
1427 if (category_name[1] == '\0')
1431 switch (category_name[1])
1434 if (category_name[2] == '\0')
1440 /* Invalid bidi category name. */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1474 /* Construction of sparse 3-level tables. */
1475 #define TABLE bidi_category_table
1476 #define ELEMENT uint8_t
1477 #define DEFAULT UC_BIDI_L
1478 #define xmalloc malloc
1479 #define xrealloc realloc
1482 /* Output the per-character bidi category table. */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
1517 /* Offsets in t.result, in memory of this process. */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1586 for (i = 0; i < t.level3_size << t.p; i++)
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
1633 /* Construction of sparse 3-level tables. */
1634 #define TABLE decdigit_table
1635 #define ELEMENT uint8_t
1637 #define xmalloc malloc
1638 #define xrealloc realloc
/* Output the unit test for the per-character decimal digit value table.
   Writes one "{ code point, value }" initializer per character that has a
   decimal digit value, separated by ",\n".
   Aborts if a value lies outside -1..9; exits with status 1 on I/O errors.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  int need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
           version);

  need_comma = 0;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          /* The separator is written before every entry except the first,
             so that the last entry is not followed by a comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = 1;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1687 /* Output the per-character decimal digit value table. */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
1712 for (ch = 0; ch < 0x110000; ch++)
1714 int value = 1 + get_decdigit_value (ch);
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
1724 /* Offsets in t.result, in memory of this process. */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* See Unicode 3.0 book, section 4.6. */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
/* Output the unit test for the per-character digit value table.
   Writes one "{ code point, value }" initializer per character that has a
   digit value, separated by ",\n".
   Aborts if a value lies outside -1..9; exits with status 1 on I/O errors.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  int need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
           version);

  need_comma = 0;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          /* The separator is written before every entry except the first,
             so that the last entry is not followed by a comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = 1;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1874 /* Output the per-character digit value table. */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
1899 for (ch = 0; ch < 0x110000; ch++)
1901 int value = 1 + get_digit_value (ch);
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
1911 /* Offsets in t.result, in memory of this process. */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as a fraction.  get_numeric_value uses
   { 0, 0 } to mean "no numeric value".  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
2018 value.numerator = atoi (str);
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
2022 value.denominator = 1;
2026 value.numerator = 0;
2027 value.denominator = 0;
2032 /* Output the unit test for the per-character numeric value table. */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
2057 if (value.numerator != 0 || value.denominator != 0)
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
2076 /* Construction of sparse 3-level tables. */
2077 #define TABLE numeric_table
2078 #define ELEMENT uint8_t
2080 #define xmalloc malloc
2081 #define xrealloc realloc
2084 /* Output the per-character numeric value table. */
2086 output_numeric (const char *filename, const char *version)
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
2118 if (i == nfractions)
2120 if (nfractions == 128)
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
2167 /* Offsets in t.result, in memory of this process. */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2236 for (i = 0; i < t.level3_size << t.p; i++)
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
/* See Unicode 3.0 book, section 4.7.  */
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
2277 static unsigned int mirror_pairs[][2] =
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
2342 mirror_char = 0xfffd;
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
2355 return (int) mirror_char - (int) ch;
2358 if (mirror_char != 0xfffd)
2364 /* Construction of sparse 3-level tables. */
2365 #define TABLE mirror_table
2366 #define ELEMENT int32_t
2368 #define xmalloc malloc
2369 #define xrealloc realloc
2372 /* Output the per-character mirror table. */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
2406 /* Offsets in t.result, in memory of this process. */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2496 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property numbers.  Each one is used as a bit position in an element of
   unicode_properties[], so there must be at most 64 of them.  The order
   follows the property list in fill_properties.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_STERM,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK
};
/* For each code point, the set of its properties: bit number PROP_xxx is
   set if the character has the property PROP_xxx.  */
unsigned long long unicode_properties[0x110000];

/* Resets unicode_properties[] so that no character has any property.  */
static void
clear_properties (void)
{
  memset (unicode_properties, 0, sizeof (unicode_properties));
}
2557 /* Stores in unicode_properties[] the properties from the
2558 PropList.txt or DerivedCoreProperties.txt file. */
2560 fill_properties (const char *proplist_filename)
2565 stream = fopen (proplist_filename, "r");
2568 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2575 unsigned int i1, i2;
2576 char padding[200+1];
2577 char propname[200+1];
2578 unsigned int propvalue;
2580 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2583 if (buf[0] == '\0' || buf[0] == '#')
2586 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2588 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2590 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2595 #define PROP(name,value) \
2596 if (strcmp (propname, name) == 0) propvalue = value; else
2598 PROP ("White_Space", PROP_WHITE_SPACE)
2599 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2600 PROP ("Join_Control", PROP_JOIN_CONTROL)
2601 PROP ("Dash", PROP_DASH)
2602 PROP ("Hyphen", PROP_HYPHEN)
2603 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2604 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2605 PROP ("Other_Math", PROP_OTHER_MATH)
2606 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2607 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2608 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2609 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2610 PROP ("Diacritic", PROP_DIACRITIC)
2611 PROP ("Extender", PROP_EXTENDER)
2612 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2613 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2614 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2615 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2616 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2617 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2618 PROP ("Radical", PROP_RADICAL)
2619 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2620 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2621 PROP ("Deprecated", PROP_DEPRECATED)
2622 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2623 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2624 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2625 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2626 PROP ("STerm", PROP_STERM)
2627 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2628 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2629 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2630 /* DerivedCoreProperties.txt */
2631 PROP ("Math", PROP_MATH)
2632 PROP ("Alphabetic", PROP_ALPHABETIC)
2633 PROP ("Lowercase", PROP_LOWERCASE)
2634 PROP ("Uppercase", PROP_UPPERCASE)
2635 PROP ("ID_Start", PROP_ID_START)
2636 PROP ("ID_Continue", PROP_ID_CONTINUE)
2637 PROP ("XID_Start", PROP_XID_START)
2638 PROP ("XID_Continue", PROP_XID_CONTINUE)
2639 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2640 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2641 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2642 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2645 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2649 if (!(i1 <= i2 && i2 < 0x110000))
2652 for (i = i1; i <= i2; i++)
2653 unicode_properties[i] |= 1ULL << propvalue;
2656 if (ferror (stream) || fclose (stream))
2658 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2663 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2666 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2672 for (i = 0; i < 0x110000; i++)
2675 stream = fopen (proplist_filename, "r");
2678 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2682 /* Search for the "Property dump for: ..." line. */
2685 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2687 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2691 while (strstr (buf, property_name) == NULL);
2695 unsigned int i1, i2;
2697 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2701 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2703 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2705 fprintf (stderr, "parse error in property in '%s'\n",
2710 else if (strlen (buf) >= 4)
2712 if (sscanf (buf, "%4X", &i1) < 1)
2714 fprintf (stderr, "parse error in property in '%s'\n",
2722 fprintf (stderr, "parse error in property in '%s'\n",
2726 if (!(i1 <= i2 && i2 < 0x110000))
2728 for (i = i1; i <= i2; i++)
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2738 /* Properties from Unicode 3.0 PropList.txt file. */
2740 /* The paired punctuation property from the PropList.txt file. */
2741 char unicode_pairedpunctuation[0x110000];
2743 /* The left of pair property from the PropList.txt file. */
2744 char unicode_leftofpair[0x110000];
2747 fill_properties30 (const char *proplist30_filename)
2749 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2750 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2753 /* ------------------------------------------------------------------------- */
2755 /* See PropList.txt, UCD.html. */
2757 is_property_white_space (unsigned int ch)
2759 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2762 /* See Unicode 3.0 book, section 4.10,
2763 PropList.txt, UCD.html,
2764 DerivedCoreProperties.txt, UCD.html. */
2766 is_property_alphabetic (unsigned int ch)
2770 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2771 /* For some reason, the following are listed as having property
2772 Alphabetic but not as having property Other_Alphabetic. */
2773 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2774 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2775 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2776 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2777 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2778 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2779 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2780 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2781 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2782 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2783 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2784 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2786 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2788 if (result1 != result2)
2793 /* See PropList.txt, UCD.html. */
2795 is_property_other_alphabetic (unsigned int ch)
2797 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2800 /* See PropList.txt, UCD.html. */
2802 is_property_not_a_character (unsigned int ch)
2804 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2807 /* See PropList.txt, UCD.html,
2808 DerivedCoreProperties.txt, UCD.html. */
2810 is_property_default_ignorable_code_point (unsigned int ch)
2813 (is_category_Cf (ch)
2814 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2815 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2816 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2817 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2819 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_default_ignorable_code_point (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_deprecated (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2840 /* See PropList.txt, UCD.html. */
2842 is_property_logical_order_exception (unsigned int ch)
2844 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2847 /* See PropList.txt, UCD.html. */
2849 is_property_variation_selector (unsigned int ch)
2851 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Returns true iff CH lies in one of the three hard-coded private use
   ranges below; no table is consulted.  See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Returns true iff is_category_Cn holds for CH and
   is_property_not_a_character does not.  See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2871 /* See PropList.txt, UCD.html,
2872 DerivedCoreProperties.txt, UCD.html. */
2874 is_property_uppercase (unsigned int ch)
2878 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2880 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2882 if (result1 != result2)
2887 /* See PropList.txt, UCD.html. */
2889 is_property_other_uppercase (unsigned int ch)
2891 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2894 /* See PropList.txt, UCD.html,
2895 DerivedCoreProperties.txt, UCD.html. */
2897 is_property_lowercase (unsigned int ch)
2901 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2903 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2905 if (result1 != result2)
2910 /* See PropList.txt, UCD.html. */
2912 is_property_other_lowercase (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Returns whatever is_category_Lt reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  bool in_category = is_category_Lt (ch);

  return in_category;
}
2924 /* See PropList.txt, UCD.html. */
2926 is_property_soft_dotted (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2931 /* See DerivedCoreProperties.txt, UCD.html. */
2933 is_property_id_start (unsigned int ch)
2935 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2938 /* See PropList.txt, UCD.html. */
2940 is_property_other_id_start (unsigned int ch)
2942 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2945 /* See DerivedCoreProperties.txt, UCD.html. */
2947 is_property_id_continue (unsigned int ch)
2949 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2952 /* See PropList.txt, UCD.html. */
2954 is_property_other_id_continue (unsigned int ch)
2956 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2959 /* See DerivedCoreProperties.txt, UCD.html. */
2961 is_property_xid_start (unsigned int ch)
2963 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2966 /* See DerivedCoreProperties.txt, UCD.html. */
2968 is_property_xid_continue (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2973 /* See PropList.txt, UCD.html. */
2975 is_property_pattern_white_space (unsigned int ch)
2977 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2980 /* See PropList.txt, UCD.html. */
2982 is_property_pattern_syntax (unsigned int ch)
2984 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2987 /* See PropList.txt, UCD.html. */
2989 is_property_join_control (unsigned int ch)
2991 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2994 /* See DerivedCoreProperties.txt, UCD.html. */
2996 is_property_grapheme_base (unsigned int ch)
2998 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3001 /* See DerivedCoreProperties.txt, UCD.html. */
3003 is_property_grapheme_extend (unsigned int ch)
3005 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3008 /* See PropList.txt, UCD.html. */
3010 is_property_other_grapheme_extend (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3015 /* See DerivedCoreProperties.txt, UCD.html. */
3017 is_property_grapheme_link (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3022 /* See PropList.txt, UCD.html. */
3024 is_property_bidi_control (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3029 /* See PropList-3.0.1.txt. */
3031 is_property_bidi_left_to_right (unsigned int ch)
3033 return (get_bidi_category (ch) == UC_BIDI_L);
3036 /* See PropList-3.0.1.txt. */
3038 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3040 return (get_bidi_category (ch) == UC_BIDI_R);
3043 /* See PropList-3.0.1.txt. */
3045 is_property_bidi_arabic_right_to_left (unsigned int ch)
3047 return (get_bidi_category (ch) == UC_BIDI_AL);
3050 /* See PropList-3.0.1.txt. */
3052 is_property_bidi_european_digit (unsigned int ch)
3054 return (get_bidi_category (ch) == UC_BIDI_EN);
3057 /* See PropList-3.0.1.txt. */
3059 is_property_bidi_eur_num_separator (unsigned int ch)
3061 return (get_bidi_category (ch) == UC_BIDI_ES);
3064 /* See PropList-3.0.1.txt. */
3066 is_property_bidi_eur_num_terminator (unsigned int ch)
3068 return (get_bidi_category (ch) == UC_BIDI_ET);
3071 /* See PropList-3.0.1.txt. */
3073 is_property_bidi_arabic_digit (unsigned int ch)
3075 return (get_bidi_category (ch) == UC_BIDI_AN);
3078 /* See PropList-3.0.1.txt. */
3080 is_property_bidi_common_separator (unsigned int ch)
3082 return (get_bidi_category (ch) == UC_BIDI_CS);
3085 /* See PropList-3.0.1.txt. */
3087 is_property_bidi_block_separator (unsigned int ch)
3089 return (get_bidi_category (ch) == UC_BIDI_B);
3092 /* See PropList-3.0.1.txt. */
3094 is_property_bidi_segment_separator (unsigned int ch)
3096 return (get_bidi_category (ch) == UC_BIDI_S);
3099 /* See PropList-3.0.1.txt. */
3101 is_property_bidi_whitespace (unsigned int ch)
3103 return (get_bidi_category (ch) == UC_BIDI_WS);
3106 /* See PropList-3.0.1.txt. */
3108 is_property_bidi_non_spacing_mark (unsigned int ch)
3110 return (get_bidi_category (ch) == UC_BIDI_NSM);
3113 /* See PropList-3.0.1.txt. */
3115 is_property_bidi_boundary_neutral (unsigned int ch)
3117 return (get_bidi_category (ch) == UC_BIDI_BN);
3120 /* See PropList-3.0.1.txt. */
3122 is_property_bidi_pdf (unsigned int ch)
3124 return (get_bidi_category (ch) == UC_BIDI_PDF);
3127 /* See PropList-3.0.1.txt. */
3129 is_property_bidi_embedding_or_override (unsigned int ch)
3131 int category = get_bidi_category (ch);
3132 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3133 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3136 /* See PropList-3.0.1.txt. */
3138 is_property_bidi_other_neutral (unsigned int ch)
3140 return (get_bidi_category (ch) == UC_BIDI_ON);
3143 /* See PropList.txt, UCD.html. */
3145 is_property_hex_digit (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3150 /* See PropList.txt, UCD.html. */
3152 is_property_ascii_hex_digit (unsigned int ch)
3154 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3157 /* See Unicode 3.0 book, section 4.10,
3158 PropList.txt, UCD.html. */
3160 is_property_ideographic (unsigned int ch)
3162 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3165 /* See PropList.txt, UCD.html. */
3167 is_property_unified_ideograph (unsigned int ch)
3169 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3172 /* See PropList.txt, UCD.html. */
3174 is_property_radical (unsigned int ch)
3176 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3179 /* See PropList.txt, UCD.html. */
3181 is_property_ids_binary_operator (unsigned int ch)
3183 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3186 /* See PropList.txt, UCD.html. */
3188 is_property_ids_trinary_operator (unsigned int ch)
3190 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3193 /* See PropList-3.0.1.txt. */
3195 is_property_zero_width (unsigned int ch)
3197 return is_category_Cf (ch)
3198 || (unicode_attributes[ch].name != NULL
3199 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Returns whatever is_category_Zs reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
/* Returns true iff CH is one of the code points listed below.  The answer
   is hard-coded (the list is the set of characters sharing one line
   breaking property); no table is consulted.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3233 /* See PropList-3.0.1.txt. */
3235 is_property_iso_control (unsigned int ch)
3238 (unicode_attributes[ch].name != NULL
3239 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3241 is_category_Cc (ch);
3243 if (result1 != result2)
3248 /* See PropList-3.0.1.txt. */
3250 is_property_format_control (unsigned int ch)
3252 return (is_category_Cf (ch)
3253 && get_bidi_category (ch) == UC_BIDI_BN
3254 && !is_property_join_control (ch)
3258 /* See PropList.txt, UCD.html. */
3260 is_property_dash (unsigned int ch)
3262 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3265 /* See PropList.txt, UCD.html. */
3267 is_property_hyphen (unsigned int ch)
3269 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Returns whatever is_category_P reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
/* Returns whatever is_category_Zl reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
/* Returns whatever is_category_Zp reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3293 /* See PropList.txt, UCD.html. */
3295 is_property_quotation_mark (unsigned int ch)
3297 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3300 /* See PropList.txt, UCD.html. */
3302 is_property_sentence_terminal (unsigned int ch)
3304 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3307 /* See PropList.txt, UCD.html. */
3309 is_property_terminal_punctuation (unsigned int ch)
3311 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Returns whatever is_category_Sc reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
3321 /* See Unicode 3.0 book, section 4.9,
3322 PropList.txt, UCD.html,
3323 DerivedCoreProperties.txt, UCD.html. */
3325 is_property_math (unsigned int ch)
3329 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3331 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3333 if (result1 != result2)
3338 /* See PropList.txt, UCD.html. */
3340 is_property_other_math (unsigned int ch)
3342 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3345 /* See PropList-3.0.1.txt. */
3347 is_property_paired_punctuation (unsigned int ch)
3349 return unicode_pairedpunctuation[ch];
3352 /* See PropList-3.0.1.txt. */
3354 is_property_left_of_pair (unsigned int ch)
3356 return unicode_leftofpair[ch];
3359 /* See PropList-3.0.1.txt. */
3361 is_property_combining (unsigned int ch)
3363 return (unicode_attributes[ch].name != NULL
3364 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3365 || is_category_Mc (ch)
3366 || is_category_Me (ch)
3367 || is_category_Mn (ch)));
3370 #if 0 /* same as is_property_bidi_non_spacing_mark */
3371 /* See PropList-3.0.1.txt. */
3373 is_property_non_spacing (unsigned int ch)
3375 return (unicode_attributes[ch].name != NULL
3376 && get_bidi_category (ch) == UC_BIDI_NSM);
3380 /* See PropList-3.0.1.txt. */
3382 is_property_composite (unsigned int ch)
3384 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3385 logical in some sense. */
3386 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3388 if (unicode_attributes[ch].name != NULL
3389 && unicode_attributes[ch].decomposition != NULL)
3391 /* Test whether the decomposition contains more than one character,
3392 and the first is not a space. */
3393 const char *decomp = unicode_attributes[ch].decomposition;
3394 if (decomp[0] == '<')
3396 decomp = strchr (decomp, '>') + 1;
3397 if (decomp[0] == ' ')
3400 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Returns whatever is_category_Nd reports for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3412 /* See PropList-3.0.1.txt. */
3414 is_property_numeric (unsigned int ch)
3416 return ((get_numeric_value (ch)).denominator > 0)
3417 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3418 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3421 /* See PropList.txt, UCD.html. */
3423 is_property_diacritic (unsigned int ch)
3425 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3428 /* See PropList.txt, UCD.html. */
3430 is_property_extender (unsigned int ch)
3432 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3435 /* See PropList-3.0.1.txt. */
3437 is_property_ignorable_control (unsigned int ch)
3439 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3440 || is_category_Cf (ch))
3444 /* ------------------------------------------------------------------------- */
3446 /* Output all properties. */
3448 output_properties (const char *version)
3450 #define PROPERTY(P) \
3451 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3452 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3453 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3454 PROPERTY(white_space)
3455 PROPERTY(alphabetic)
3456 PROPERTY(other_alphabetic)
3457 PROPERTY(not_a_character)
3458 PROPERTY(default_ignorable_code_point)
3459 PROPERTY(other_default_ignorable_code_point)
3460 PROPERTY(deprecated)
3461 PROPERTY(logical_order_exception)
3462 PROPERTY(variation_selector)
3463 PROPERTY(private_use)
3464 PROPERTY(unassigned_code_value)
3466 PROPERTY(other_uppercase)
3468 PROPERTY(other_lowercase)
3470 PROPERTY(soft_dotted)
3472 PROPERTY(other_id_start)
3473 PROPERTY(id_continue)
3474 PROPERTY(other_id_continue)
3476 PROPERTY(xid_continue)
3477 PROPERTY(pattern_white_space)
3478 PROPERTY(pattern_syntax)
3479 PROPERTY(join_control)
3480 PROPERTY(grapheme_base)
3481 PROPERTY(grapheme_extend)
3482 PROPERTY(other_grapheme_extend)
3483 PROPERTY(grapheme_link)
3484 PROPERTY(bidi_control)
3485 PROPERTY(bidi_left_to_right)
3486 PROPERTY(bidi_hebrew_right_to_left)
3487 PROPERTY(bidi_arabic_right_to_left)
3488 PROPERTY(bidi_european_digit)
3489 PROPERTY(bidi_eur_num_separator)
3490 PROPERTY(bidi_eur_num_terminator)
3491 PROPERTY(bidi_arabic_digit)
3492 PROPERTY(bidi_common_separator)
3493 PROPERTY(bidi_block_separator)
3494 PROPERTY(bidi_segment_separator)
3495 PROPERTY(bidi_whitespace)
3496 PROPERTY(bidi_non_spacing_mark)
3497 PROPERTY(bidi_boundary_neutral)
3499 PROPERTY(bidi_embedding_or_override)
3500 PROPERTY(bidi_other_neutral)
3502 PROPERTY(ascii_hex_digit)
3503 PROPERTY(ideographic)
3504 PROPERTY(unified_ideograph)
3506 PROPERTY(ids_binary_operator)
3507 PROPERTY(ids_trinary_operator)
3508 PROPERTY(zero_width)
3511 PROPERTY(iso_control)
3512 PROPERTY(format_control)
3515 PROPERTY(punctuation)
3516 PROPERTY(line_separator)
3517 PROPERTY(paragraph_separator)
3518 PROPERTY(quotation_mark)
3519 PROPERTY(sentence_terminal)
3520 PROPERTY(terminal_punctuation)
3521 PROPERTY(currency_symbol)
3523 PROPERTY(other_math)
3524 PROPERTY(paired_punctuation)
3525 PROPERTY(left_of_pair)
3528 PROPERTY(decimal_digit)
3532 PROPERTY(ignorable_control)
3536 /* ========================================================================= */
3540 static const char *scripts[256];
3541 static unsigned int numscripts;
3543 static uint8_t unicode_scripts[0x110000];
3546 fill_scripts (const char *scripts_filename)
3551 stream = fopen (scripts_filename, "r");
3554 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
3560 for (i = 0; i < 0x110000; i++)
3561 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3566 unsigned int i1, i2;
3567 char padding[200+1];
3568 char scriptname[200+1];
3571 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3574 if (buf[0] == '\0' || buf[0] == '#')
3577 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3579 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3581 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
3591 for (script = numscripts - 1; script >= 0; script--)
3592 if (strcmp (scripts[script], scriptname) == 0)
3596 scripts[numscripts] = strdup (scriptname);
3597 script = numscripts;
3599 if (numscripts == 256)
3603 for (i = i1; i <= i2; i++)
3605 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3606 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3607 unicode_scripts[i] = script;
3611 if (ferror (stream) || fclose (stream))
3613 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3618 /* Construction of sparse 3-level tables. */
3619 #define TABLE script_table
3620 #define ELEMENT uint8_t
3621 #define DEFAULT (uint8_t)~(uint8_t)0
3622 #define xmalloc malloc
3623 #define xrealloc realloc
3627 output_scripts (const char *version)
3629 const char *filename = "unictype/scripts.h";
3631 unsigned int ch, s, i;
3632 struct script_table t;
3633 unsigned int level1_offset, level2_offset, level3_offset;
3637 const char *lowercase_name;
3640 scriptinfo_t scriptinfo[256];
3642 stream = fopen (filename, "w");
3645 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3649 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3650 fprintf (stream, "/* Unicode scripts. */\n");
3651 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3654 for (s = 0; s < numscripts; s++)
3656 char *lcp = strdup (scripts[s]);
3659 for (cp = lcp; *cp != '\0'; cp++)
3660 if (*cp >= 'A' && *cp <= 'Z')
3663 scriptinfo[s].lowercase_name = lcp;
3666 for (s = 0; s < numscripts; s++)
3668 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3669 scriptinfo[s].lowercase_name);
3670 fprintf (stream, "{\n");
3672 for (ch = 0; ch < 0x110000; ch++)
3673 if (unicode_scripts[ch] == s)
3679 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3684 fprintf (stream, ",\n");
3686 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3688 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3692 fprintf (stream, "\n");
3693 fprintf (stream, "};\n");
3696 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3697 fprintf (stream, "{\n");
3698 for (s = 0; s < numscripts; s++)
3700 fprintf (stream, " {\n");
3701 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3702 scriptinfo[s].lowercase_name);
3703 fprintf (stream, " script_%s_intervals,\n",
3704 scriptinfo[s].lowercase_name);
3705 fprintf (stream, " \"%s\"\n", scripts[s]);
3706 fprintf (stream, " }");
3707 if (s+1 < numscripts)
3708 fprintf (stream, ",");
3709 fprintf (stream, "\n");
3711 fprintf (stream, "};\n");
3715 script_table_init (&t);
3717 for (ch = 0; ch < 0x110000; ch++)
3719 unsigned int s = unicode_scripts[ch];
3720 if (s != (uint8_t)~(uint8_t)0)
3721 script_table_add (&t, ch, s);
3724 script_table_finalize (&t);
3726 /* Offsets in t.result, in memory of this process. */
3728 5 * sizeof (uint32_t);
3730 5 * sizeof (uint32_t)
3731 + t.level1_size * sizeof (uint32_t);
3733 5 * sizeof (uint32_t)
3734 + t.level1_size * sizeof (uint32_t)
3735 + (t.level2_size << t.q) * sizeof (uint32_t);
3737 for (i = 0; i < 5; i++)
3738 fprintf (stream, "#define script_header_%d %d\n", i,
3739 ((uint32_t *) t.result)[i]);
3740 fprintf (stream, "static const\n");
3741 fprintf (stream, "struct\n");
3742 fprintf (stream, " {\n");
3743 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3744 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3745 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3746 fprintf (stream, " }\n");
3747 fprintf (stream, "u_script =\n");
3748 fprintf (stream, "{\n");
3749 fprintf (stream, " {");
3750 if (t.level1_size > 8)
3751 fprintf (stream, "\n ");
3752 for (i = 0; i < t.level1_size; i++)
3755 if (i > 0 && (i % 8) == 0)
3756 fprintf (stream, "\n ");
3757 offset = ((uint32_t *) (t.result + level1_offset))[i];
3759 fprintf (stream, " %5d", -1);
3761 fprintf (stream, " %5zu",
3762 (offset - level2_offset) / sizeof (uint32_t));
3763 if (i+1 < t.level1_size)
3764 fprintf (stream, ",");
3766 if (t.level1_size > 8)
3767 fprintf (stream, "\n ");
3768 fprintf (stream, " },\n");
3769 fprintf (stream, " {");
3770 if (t.level2_size << t.q > 8)
3771 fprintf (stream, "\n ");
3772 for (i = 0; i < t.level2_size << t.q; i++)
3775 if (i > 0 && (i % 8) == 0)
3776 fprintf (stream, "\n ");
3777 offset = ((uint32_t *) (t.result + level2_offset))[i];
3779 fprintf (stream, " %5d", -1);
3781 fprintf (stream, " %5zu",
3782 (offset - level3_offset) / sizeof (uint8_t));
3783 if (i+1 < t.level2_size << t.q)
3784 fprintf (stream, ",");
3786 if (t.level2_size << t.q > 8)
3787 fprintf (stream, "\n ");
3788 fprintf (stream, " },\n");
3789 fprintf (stream, " {");
3790 if (t.level3_size << t.p > 8)
3791 fprintf (stream, "\n ");
3792 for (i = 0; i < t.level3_size << t.p; i++)
3794 if (i > 0 && (i % 8) == 0)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3797 if (i+1 < t.level3_size << t.p)
3798 fprintf (stream, ",");
3800 if (t.level3_size << t.p > 8)
3801 fprintf (stream, "\n ");
3802 fprintf (stream, " }\n");
3803 fprintf (stream, "};\n");
3805 if (ferror (stream) || fclose (stream))
3807 fprintf (stderr, "error writing to '%s'\n", filename);
3813 output_scripts_byname (const char *version)
3815 const char *filename = "unictype/scripts_byname.gperf";
3819 stream = fopen (filename, "w");
3822 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3826 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3827 fprintf (stream, "/* Unicode scripts. */\n");
3828 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3830 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3831 fprintf (stream, "%%struct-type\n");
3832 fprintf (stream, "%%language=ANSI-C\n");
3833 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3834 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3835 fprintf (stream, "%%readonly-tables\n");
3836 fprintf (stream, "%%global-table\n");
3837 fprintf (stream, "%%define word-array-name script_names\n");
3838 fprintf (stream, "%%%%\n");
3839 for (s = 0; s < numscripts; s++)
3840 fprintf (stream, "%s, %u\n", scripts[s], s);
3842 if (ferror (stream) || fclose (stream))
3844 fprintf (stderr, "error writing to '%s'\n", filename);
3849 /* ========================================================================= */
3853 typedef struct { unsigned int start; unsigned int end; const char *name; }
3855 static block_t blocks[256];
3856 static unsigned int numblocks;
3859 fill_blocks (const char *blocks_filename)
3863 stream = fopen (blocks_filename, "r");
3866 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3873 unsigned int i1, i2;
3874 char padding[200+1];
3875 char blockname[200+1];
3877 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3880 if (buf[0] == '\0' || buf[0] == '#')
3883 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3885 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3888 blocks[numblocks].start = i1;
3889 blocks[numblocks].end = i2;
3890 blocks[numblocks].name = strdup (blockname);
3891 /* It must be sorted. */
3892 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3895 if (numblocks == 256)
3899 if (ferror (stream) || fclose (stream))
3901 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3906 /* Return the smallest block index among the blocks for characters >= ch. */
3908 block_first_index (unsigned int ch)
3910 /* Binary search. */
3911 unsigned int lo = 0;
3912 unsigned int hi = numblocks;
3914 All blocks[i], i < lo, have blocks[i].end < ch,
3915 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3918 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3919 if (blocks[mid].end < ch)
3927 /* Return the largest block index among the blocks for characters <= ch,
3930 block_last_index (unsigned int ch)
3932 /* Binary search. */
3933 unsigned int lo = 0;
3934 unsigned int hi = numblocks;
3936 All blocks[i], i < lo, have blocks[i].start <= ch,
3937 all blocks[i], i >= hi, have blocks[i].start > ch. */
3940 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3941 if (blocks[mid].start <= ch)
3950 output_blocks (const char *version)
3952 const char *filename = "unictype/blocks.h";
3953 const unsigned int shift = 8; /* bits to shift away for array access */
3954 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3959 stream = fopen (filename, "w");
3962 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3966 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3967 fprintf (stream, "/* Unicode blocks. */\n");
3968 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3971 fprintf (stream, "static const uc_block_t blocks[] =\n");
3972 fprintf (stream, "{\n");
3973 for (i = 0; i < numblocks; i++)
3975 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3976 blocks[i].end, blocks[i].name);
3977 if (i+1 < numblocks)
3978 fprintf (stream, ",");
3979 fprintf (stream, "\n");
3981 fprintf (stream, "};\n");
3982 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3983 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3984 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3985 threshold >> shift);
3986 fprintf (stream, "{\n");
3987 for (i1 = 0; i1 < (threshold >> shift); i1++)
3989 unsigned int first_index = block_first_index (i1 << shift);
3990 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3991 fprintf (stream, " %3d, %3d", first_index, last_index);
3992 if (i1+1 < (threshold >> shift))
3993 fprintf (stream, ",");
3994 fprintf (stream, "\n");
3996 fprintf (stream, "};\n");
3997 fprintf (stream, "#define blocks_upper_first_index %d\n",
3998 block_first_index (threshold));
3999 fprintf (stream, "#define blocks_upper_last_index %d\n",
4000 block_last_index (0x10FFFF));
4002 if (ferror (stream) || fclose (stream))
4004 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* C and Java syntax. */
/* Identifier syntax categories.  The values are 0..3, so each one fits in
   two bits.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of C source
   text: space, horizontal tab, new-line, carriage return, vertical tab,
   form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4032 /* ISO C 99 section 6.4.2.1 and appendix D. */
4034 c_ident_category (unsigned int ch)
4036 /* Section 6.4.2.1. */
4037 if (ch >= '0' && ch <= '9')
4038 return UC_IDENTIFIER_VALID;
4039 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4040 return UC_IDENTIFIER_START;
4046 || (ch >= 0x00C0 && ch <= 0x00D6)
4047 || (ch >= 0x00D8 && ch <= 0x00F6)
4048 || (ch >= 0x00F8 && ch <= 0x01F5)
4049 || (ch >= 0x01FA && ch <= 0x0217)
4050 || (ch >= 0x0250 && ch <= 0x02A8)
4051 || (ch >= 0x1E00 && ch <= 0x1E9B)
4052 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4056 || (ch >= 0x0388 && ch <= 0x038A)
4058 || (ch >= 0x038E && ch <= 0x03A1)
4059 || (ch >= 0x03A3 && ch <= 0x03CE)
4060 || (ch >= 0x03D0 && ch <= 0x03D6)
4065 || (ch >= 0x03E2 && ch <= 0x03F3)
4066 || (ch >= 0x1F00 && ch <= 0x1F15)
4067 || (ch >= 0x1F18 && ch <= 0x1F1D)
4068 || (ch >= 0x1F20 && ch <= 0x1F45)
4069 || (ch >= 0x1F48 && ch <= 0x1F4D)
4070 || (ch >= 0x1F50 && ch <= 0x1F57)
4074 || (ch >= 0x1F5F && ch <= 0x1F7D)
4075 || (ch >= 0x1F80 && ch <= 0x1FB4)
4076 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4077 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4078 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4079 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4080 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4081 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4082 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4083 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4085 || (ch >= 0x0401 && ch <= 0x040C)
4086 || (ch >= 0x040E && ch <= 0x044F)
4087 || (ch >= 0x0451 && ch <= 0x045C)
4088 || (ch >= 0x045E && ch <= 0x0481)
4089 || (ch >= 0x0490 && ch <= 0x04C4)
4090 || (ch >= 0x04C7 && ch <= 0x04C8)
4091 || (ch >= 0x04CB && ch <= 0x04CC)
4092 || (ch >= 0x04D0 && ch <= 0x04EB)
4093 || (ch >= 0x04EE && ch <= 0x04F5)
4094 || (ch >= 0x04F8 && ch <= 0x04F9)
4096 || (ch >= 0x0531 && ch <= 0x0556)
4097 || (ch >= 0x0561 && ch <= 0x0587)
4099 || (ch >= 0x05B0 && ch <= 0x05B9)
4100 || (ch >= 0x05BB && ch <= 0x05BD)
4102 || (ch >= 0x05C1 && ch <= 0x05C2)
4103 || (ch >= 0x05D0 && ch <= 0x05EA)
4104 || (ch >= 0x05F0 && ch <= 0x05F2)
4106 || (ch >= 0x0621 && ch <= 0x063A)
4107 || (ch >= 0x0640 && ch <= 0x0652)
4108 || (ch >= 0x0670 && ch <= 0x06B7)
4109 || (ch >= 0x06BA && ch <= 0x06BE)
4110 || (ch >= 0x06C0 && ch <= 0x06CE)
4111 || (ch >= 0x06D0 && ch <= 0x06DC)
4112 || (ch >= 0x06E5 && ch <= 0x06E8)
4113 || (ch >= 0x06EA && ch <= 0x06ED)
4115 || (ch >= 0x0901 && ch <= 0x0903)
4116 || (ch >= 0x0905 && ch <= 0x0939)
4117 || (ch >= 0x093E && ch <= 0x094D)
4118 || (ch >= 0x0950 && ch <= 0x0952)
4119 || (ch >= 0x0958 && ch <= 0x0963)
4121 || (ch >= 0x0981 && ch <= 0x0983)
4122 || (ch >= 0x0985 && ch <= 0x098C)
4123 || (ch >= 0x098F && ch <= 0x0990)
4124 || (ch >= 0x0993 && ch <= 0x09A8)
4125 || (ch >= 0x09AA && ch <= 0x09B0)
4127 || (ch >= 0x09B6 && ch <= 0x09B9)
4128 || (ch >= 0x09BE && ch <= 0x09C4)
4129 || (ch >= 0x09C7 && ch <= 0x09C8)
4130 || (ch >= 0x09CB && ch <= 0x09CD)
4131 || (ch >= 0x09DC && ch <= 0x09DD)
4132 || (ch >= 0x09DF && ch <= 0x09E3)
4133 || (ch >= 0x09F0 && ch <= 0x09F1)
4136 || (ch >= 0x0A05 && ch <= 0x0A0A)
4137 || (ch >= 0x0A0F && ch <= 0x0A10)
4138 || (ch >= 0x0A13 && ch <= 0x0A28)
4139 || (ch >= 0x0A2A && ch <= 0x0A30)
4140 || (ch >= 0x0A32 && ch <= 0x0A33)
4141 || (ch >= 0x0A35 && ch <= 0x0A36)
4142 || (ch >= 0x0A38 && ch <= 0x0A39)
4143 || (ch >= 0x0A3E && ch <= 0x0A42)
4144 || (ch >= 0x0A47 && ch <= 0x0A48)
4145 || (ch >= 0x0A4B && ch <= 0x0A4D)
4146 || (ch >= 0x0A59 && ch <= 0x0A5C)
4150 || (ch >= 0x0A81 && ch <= 0x0A83)
4151 || (ch >= 0x0A85 && ch <= 0x0A8B)
4153 || (ch >= 0x0A8F && ch <= 0x0A91)
4154 || (ch >= 0x0A93 && ch <= 0x0AA8)
4155 || (ch >= 0x0AAA && ch <= 0x0AB0)
4156 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4157 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4158 || (ch >= 0x0ABD && ch <= 0x0AC5)
4159 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4160 || (ch >= 0x0ACB && ch <= 0x0ACD)
4164 || (ch >= 0x0B01 && ch <= 0x0B03)
4165 || (ch >= 0x0B05 && ch <= 0x0B0C)
4166 || (ch >= 0x0B0F && ch <= 0x0B10)
4167 || (ch >= 0x0B13 && ch <= 0x0B28)
4168 || (ch >= 0x0B2A && ch <= 0x0B30)
4169 || (ch >= 0x0B32 && ch <= 0x0B33)
4170 || (ch >= 0x0B36 && ch <= 0x0B39)
4171 || (ch >= 0x0B3E && ch <= 0x0B43)
4172 || (ch >= 0x0B47 && ch <= 0x0B48)
4173 || (ch >= 0x0B4B && ch <= 0x0B4D)
4174 || (ch >= 0x0B5C && ch <= 0x0B5D)
4175 || (ch >= 0x0B5F && ch <= 0x0B61)
4177 || (ch >= 0x0B82 && ch <= 0x0B83)
4178 || (ch >= 0x0B85 && ch <= 0x0B8A)
4179 || (ch >= 0x0B8E && ch <= 0x0B90)
4180 || (ch >= 0x0B92 && ch <= 0x0B95)
4181 || (ch >= 0x0B99 && ch <= 0x0B9A)
4183 || (ch >= 0x0B9E && ch <= 0x0B9F)
4184 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4185 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4186 || (ch >= 0x0BAE && ch <= 0x0BB5)
4187 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4188 || (ch >= 0x0BBE && ch <= 0x0BC2)
4189 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4190 || (ch >= 0x0BCA && ch <= 0x0BCD)
4192 || (ch >= 0x0C01 && ch <= 0x0C03)
4193 || (ch >= 0x0C05 && ch <= 0x0C0C)
4194 || (ch >= 0x0C0E && ch <= 0x0C10)
4195 || (ch >= 0x0C12 && ch <= 0x0C28)
4196 || (ch >= 0x0C2A && ch <= 0x0C33)
4197 || (ch >= 0x0C35 && ch <= 0x0C39)
4198 || (ch >= 0x0C3E && ch <= 0x0C44)
4199 || (ch >= 0x0C46 && ch <= 0x0C48)
4200 || (ch >= 0x0C4A && ch <= 0x0C4D)
4201 || (ch >= 0x0C60 && ch <= 0x0C61)
4203 || (ch >= 0x0C82 && ch <= 0x0C83)
4204 || (ch >= 0x0C85 && ch <= 0x0C8C)
4205 || (ch >= 0x0C8E && ch <= 0x0C90)
4206 || (ch >= 0x0C92 && ch <= 0x0CA8)
4207 || (ch >= 0x0CAA && ch <= 0x0CB3)
4208 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4209 || (ch >= 0x0CBE && ch <= 0x0CC4)
4210 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4211 || (ch >= 0x0CCA && ch <= 0x0CCD)
4213 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4215 || (ch >= 0x0D02 && ch <= 0x0D03)
4216 || (ch >= 0x0D05 && ch <= 0x0D0C)
4217 || (ch >= 0x0D0E && ch <= 0x0D10)
4218 || (ch >= 0x0D12 && ch <= 0x0D28)
4219 || (ch >= 0x0D2A && ch <= 0x0D39)
4220 || (ch >= 0x0D3E && ch <= 0x0D43)
4221 || (ch >= 0x0D46 && ch <= 0x0D48)
4222 || (ch >= 0x0D4A && ch <= 0x0D4D)
4223 || (ch >= 0x0D60 && ch <= 0x0D61)
4225 || (ch >= 0x0E01 && ch <= 0x0E3A)
4226 || (ch >= 0x0E40 && ch <= 0x0E5B)
4228 || (ch >= 0x0E81 && ch <= 0x0E82)
4230 || (ch >= 0x0E87 && ch <= 0x0E88)
4233 || (ch >= 0x0E94 && ch <= 0x0E97)
4234 || (ch >= 0x0E99 && ch <= 0x0E9F)
4235 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4238 || (ch >= 0x0EAA && ch <= 0x0EAB)
4239 || (ch >= 0x0EAD && ch <= 0x0EAE)
4240 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4241 || (ch >= 0x0EBB && ch <= 0x0EBD)
4242 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4244 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4245 || (ch >= 0x0EDC && ch <= 0x0EDD)
4248 || (ch >= 0x0F18 && ch <= 0x0F19)
4252 || (ch >= 0x0F3E && ch <= 0x0F47)
4253 || (ch >= 0x0F49 && ch <= 0x0F69)
4254 || (ch >= 0x0F71 && ch <= 0x0F84)
4255 || (ch >= 0x0F86 && ch <= 0x0F8B)
4256 || (ch >= 0x0F90 && ch <= 0x0F95)
4258 || (ch >= 0x0F99 && ch <= 0x0FAD)
4259 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4262 || (ch >= 0x10A0 && ch <= 0x10C5)
4263 || (ch >= 0x10D0 && ch <= 0x10F6)
4265 || (ch >= 0x3041 && ch <= 0x3093)
4266 || (ch >= 0x309B && ch <= 0x309C)
4268 || (ch >= 0x30A1 && ch <= 0x30F6)
4269 || (ch >= 0x30FB && ch <= 0x30FC)
4271 || (ch >= 0x3105 && ch <= 0x312C)
4272 /* CJK Unified Ideographs */
4273 || (ch >= 0x4E00 && ch <= 0x9FA5)
4275 || (ch >= 0xAC00 && ch <= 0xD7A3)
4277 || (ch >= 0x0660 && ch <= 0x0669)
4278 || (ch >= 0x06F0 && ch <= 0x06F9)
4279 || (ch >= 0x0966 && ch <= 0x096F)
4280 || (ch >= 0x09E6 && ch <= 0x09EF)
4281 || (ch >= 0x0A66 && ch <= 0x0A6F)
4282 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4283 || (ch >= 0x0B66 && ch <= 0x0B6F)
4284 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4285 || (ch >= 0x0C66 && ch <= 0x0C6F)
4286 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4287 || (ch >= 0x0D66 && ch <= 0x0D6F)
4288 || (ch >= 0x0E50 && ch <= 0x0E59)
4289 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4290 || (ch >= 0x0F20 && ch <= 0x0F33)
4291 /* Special characters */
4294 || (ch >= 0x02B0 && ch <= 0x02B8)
4296 || (ch >= 0x02BD && ch <= 0x02C1)
4297 || (ch >= 0x02D0 && ch <= 0x02D1)
4298 || (ch >= 0x02E0 && ch <= 0x02E4)
4304 || (ch >= 0x203F && ch <= 0x2040)
4307 || (ch >= 0x210A && ch <= 0x2113)
4309 || (ch >= 0x2118 && ch <= 0x211D)
4313 || (ch >= 0x212A && ch <= 0x2131)
4314 || (ch >= 0x2133 && ch <= 0x2138)
4315 || (ch >= 0x2160 && ch <= 0x2182)
4316 || (ch >= 0x3005 && ch <= 0x3007)
4317 || (ch >= 0x3021 && ch <= 0x3029)
4319 return UC_IDENTIFIER_START;
4320 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is space, horizontal tab, form feed, line feed or
   carriage return.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4332 /* The Java Language Specification, 3rd edition, §3.8.
4333 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4334 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4336 java_ident_category (unsigned int ch)
4338 /* FIXME: Check this against Sun's JDK implementation. */
4339 if (is_category_L (ch) /* = Character.isLetter(ch) */
4340 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4341 || is_category_Sc (ch) /* currency symbol */
4342 || is_category_Pc (ch) /* connector punctuation */
4344 return UC_IDENTIFIER_START;
4345 if (is_category_Nd (ch) /* digit */
4346 || is_category_Mc (ch) /* combining mark */
4347 || is_category_Mn (ch) /* non-spacing mark */
4349 return UC_IDENTIFIER_VALID;
4350 if ((ch >= 0x0000 && ch <= 0x0008)
4351 || (ch >= 0x000E && ch <= 0x001B)
4352 || (ch >= 0x007F && ch <= 0x009F)
4353 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4355 return UC_IDENTIFIER_IGNORABLE;
4356 return UC_IDENTIFIER_INVALID;
4359 /* Construction of sparse 3-level tables. */
4360 #define TABLE identsyntax_table
4361 #define ELEMENT uint8_t
4362 #define DEFAULT UC_IDENTIFIER_INVALID
4363 #define xmalloc malloc
4364 #define xrealloc realloc
4367 /* Output an identifier syntax categorization in a three-level bitmap. */
4369 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4373 struct identsyntax_table t;
4374 unsigned int level1_offset, level2_offset, level3_offset;
4376 stream = fopen (filename, "w");
4379 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4383 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4384 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4385 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4390 identsyntax_table_init (&t);
4392 for (ch = 0; ch < 0x110000; ch++)
4394 int syntaxcode = predicate (ch);
4395 if (syntaxcode != UC_IDENTIFIER_INVALID)
4396 identsyntax_table_add (&t, ch, syntaxcode);
4399 identsyntax_table_finalize (&t);
4401 /* Offsets in t.result, in memory of this process. */
4403 5 * sizeof (uint32_t);
4405 5 * sizeof (uint32_t)
4406 + t.level1_size * sizeof (uint32_t);
4408 5 * sizeof (uint32_t)
4409 + t.level1_size * sizeof (uint32_t)
4410 + (t.level2_size << t.q) * sizeof (uint32_t);
4412 for (i = 0; i < 5; i++)
4413 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4414 ((uint32_t *) t.result)[i]);
4415 fprintf (stream, "static const\n");
4416 fprintf (stream, "struct\n");
4417 fprintf (stream, " {\n");
4418 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4419 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4420 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4421 (1 << t.p) * 2 / 16);
4422 fprintf (stream, " }\n");
4423 fprintf (stream, "%s =\n", name);
4424 fprintf (stream, "{\n");
4425 fprintf (stream, " {");
4426 if (t.level1_size > 8)
4427 fprintf (stream, "\n ");
4428 for (i = 0; i < t.level1_size; i++)
4431 if (i > 0 && (i % 8) == 0)
4432 fprintf (stream, "\n ");
4433 offset = ((uint32_t *) (t.result + level1_offset))[i];
4435 fprintf (stream, " %5d", -1);
4437 fprintf (stream, " %5zu",
4438 (offset - level2_offset) / sizeof (uint32_t));
4439 if (i+1 < t.level1_size)
4440 fprintf (stream, ",");
4442 if (t.level1_size > 8)
4443 fprintf (stream, "\n ");
4444 fprintf (stream, " },\n");
4445 fprintf (stream, " {");
4446 if (t.level2_size << t.q > 8)
4447 fprintf (stream, "\n ");
4448 for (i = 0; i < t.level2_size << t.q; i++)
4451 if (i > 0 && (i % 8) == 0)
4452 fprintf (stream, "\n ");
4453 offset = ((uint32_t *) (t.result + level2_offset))[i];
4455 fprintf (stream, " %5d", -1);
4457 fprintf (stream, " %5zu",
4458 (offset - level3_offset) / sizeof (uint8_t));
4459 if (i+1 < t.level2_size << t.q)
4460 fprintf (stream, ",");
4462 if (t.level2_size << t.q > 8)
4463 fprintf (stream, "\n ");
4464 fprintf (stream, " },\n");
4465 /* Pack the level3 array. Each entry needs 2 bits only. */
4466 fprintf (stream, " {");
4467 if ((t.level3_size << t.p) * 2 / 16 > 8)
4468 fprintf (stream, "\n ");
4469 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4471 if (i > 0 && (i % 8) == 0)
4472 fprintf (stream, "\n ");
4473 fprintf (stream, " 0x%04x",
4474 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4479 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4480 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4481 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4482 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4483 fprintf (stream, ",");
4485 if ((t.level3_size << t.p) * 2 / 16 > 8)
4486 fprintf (stream, "\n ");
4487 fprintf (stream, " }\n");
4488 fprintf (stream, "};\n");
4490 if (ferror (stream) || fclose (stream))
4492 fprintf (stderr, "error writing to '%s'\n", filename);
4498 output_ident_properties (const char *version)
4500 #define PROPERTY(P) \
4501 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4502 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4503 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4504 PROPERTY(c_whitespace)
4505 PROPERTY(java_whitespace)
4508 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4509 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
/* ========================================================================= */

/* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
   glibc/localedata/locales/i18n file, generated by
   glibc/localedata/gen-unicode-ctype.c. */

/* Character mappings. */
4521 to_upper (unsigned int ch)
4523 if (unicode_attributes[ch].name != NULL
4524 && unicode_attributes[ch].upper != NONE)
4525 return unicode_attributes[ch].upper;
4531 to_lower (unsigned int ch)
4533 if (unicode_attributes[ch].name != NULL
4534 && unicode_attributes[ch].lower != NONE)
4535 return unicode_attributes[ch].lower;
4541 to_title (unsigned int ch)
4543 if (unicode_attributes[ch].name != NULL
4544 && unicode_attributes[ch].title != NONE)
4545 return unicode_attributes[ch].title;
/* Character class properties. */
/* Return true if CH has a lowercase mapping different from itself.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Return true if CH has an uppercase mapping different from itself, or
   is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping. */
         || (ch == 0x00DF);
}
4567 is_alpha (unsigned int ch)
4569 return (unicode_attributes[ch].name != NULL
4570 && ((unicode_attributes[ch].category[0] == 'L'
4571 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4572 <U0E2F>, <U0E46> should belong to is_punct. */
4573 && (ch != 0x0E2F) && (ch != 0x0E46))
4574 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4575 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4577 || (ch >= 0x0E34 && ch <= 0x0E3A)
4578 || (ch >= 0x0E47 && ch <= 0x0E4E)
4579 /* Avoid warning for <U0345>. */
4581 /* Avoid warnings for <U2160>..<U217F>. */
4582 || (unicode_attributes[ch].category[0] == 'N'
4583 && unicode_attributes[ch].category[1] == 'l')
4584 /* Avoid warnings for <U24B6>..<U24E9>. */
4585 || (unicode_attributes[ch].category[0] == 'S'
4586 && unicode_attributes[ch].category[1] == 'o'
4587 && strstr (unicode_attributes[ch].name, " LETTER ")
4589 /* Consider all the non-ASCII digits as alphabetic.
4590 ISO C 99 forbids us to have them in category "digit",
4591 but we want iswalnum to return true on them. */
4592 || (unicode_attributes[ch].category[0] == 'N'
4593 && unicode_attributes[ch].category[1] == 'd'
4594 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is one of the ten ASCII decimal digits.
   The disabled branch shows the alternative based on general category
   Nd.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero. Must add <0> in front of them by hand. */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH is one of the ten ASCII decimal digits.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH satisfies is_alpha or is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4632 is_blank (unsigned int ch)
4634 return (ch == 0x0009 /* '\t' */
4635 /* Category Zs without mention of "<noBreak>" */
4636 || (unicode_attributes[ch].name != NULL
4637 && unicode_attributes[ch].category[0] == 'Z'
4638 && unicode_attributes[ch].category[1] == 's'
4639 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4643 is_space (unsigned int ch)
4645 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4646 should treat it like a punctuation character, not like a space. */
4647 return (ch == 0x0020 /* ' ' */
4648 || ch == 0x000C /* '\f' */
4649 || ch == 0x000A /* '\n' */
4650 || ch == 0x000D /* '\r' */
4651 || ch == 0x0009 /* '\t' */
4652 || ch == 0x000B /* '\v' */
4653 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4654 || (unicode_attributes[ch].name != NULL
4655 && unicode_attributes[ch].category[0] == 'Z'
4656 && (unicode_attributes[ch].category[1] == 'l'
4657 || unicode_attributes[ch].category[1] == 'p'
4658 || (unicode_attributes[ch].category[1] == 's'
4659 && !strstr (unicode_attributes[ch].decomposition,
4664 is_cntrl (unsigned int ch)
4666 return (unicode_attributes[ch].name != NULL
4667 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4668 /* Categories Zl and Zp */
4669 || (unicode_attributes[ch].category[0] == 'Z'
4670 && (unicode_attributes[ch].category[1] == 'l'
4671 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is an ASCII hexadecimal digit: 0-9, A-F or a-f.
   The disabled branch shows the alternative built on is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4697 is_graph (unsigned int ch)
4699 return (unicode_attributes[ch].name != NULL
4700 && strcmp (unicode_attributes[ch].name, "<control>")
4705 is_print (unsigned int ch)
4707 return (unicode_attributes[ch].name != NULL
4708 && strcmp (unicode_attributes[ch].name, "<control>")
4709 /* Categories Zl and Zp */
4710 && !(unicode_attributes[ch].name != NULL
4711 && unicode_attributes[ch].category[0] == 'Z'
4712 && (unicode_attributes[ch].category[1] == 'l'
4713 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH is graphic (is_graph) but neither alphabetic
   (is_alpha) nor a digit (is_digit).
   The disabled branch shows the alternative based on general category
   P*.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character. */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4729 /* Output all properties. */
4731 output_old_ctype (const char *version)
4733 #define PROPERTY(P) \
4734 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4735 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4736 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4755 is_combining (unsigned int ch)
4757 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4758 file. In 3.0.1 it was identical to the union of the general categories
4759 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4760 PropList.txt file, so we take the latter definition. */
4761 return (unicode_attributes[ch].name != NULL
4762 && unicode_attributes[ch].category[0] == 'M'
4763 && (unicode_attributes[ch].category[1] == 'n'
4764 || unicode_attributes[ch].category[1] == 'c'
4765 || unicode_attributes[ch].category[1] == 'e'));
4769 is_combining_level3 (unsigned int ch)
4771 return is_combining (ch)
4772 && !(unicode_attributes[ch].combining[0] != '\0'
4773 && unicode_attributes[ch].combining[0] != '0'
4774 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   The result is "<UXXXX>" for I below 0x10000 and "<UXXXXXXXX>" otherwise.
   It lives in a static buffer that is overwritten by the next call, so
   copy it before calling again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded write: the longest form, "<U%08X>", needs 11 characters.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval.
   The result is the symbol of LOW, then "..", then the symbol of HIGH.
   It lives in a static buffer that is overwritten by the next call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each plus the ".." separator.  */
  static char buf[24+1];

  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
/* Output a character class (= property) table.
   Writes CLASSNAME followed by the code points for which FUNC returns
   true, as ';'-separated symbols and symbol ranges.  A line is continued
   with "/" and a newline once it would grow past max_column.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  /* Static storage: a 0x110000-byte automatic array could exhaust the
     stack on platforms with a small default stack size.  */
  static char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (int) func (i);

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start beyond max_column so that the first item forces a line break.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      if (!table[i])
        i++;
      else
        {
          unsigned int low, high;
          char buf[25];

          /* Collect the maximal run of consecutive members.  */
          low = i;
          do
            i++;
          while (i < 0x110000 && table[i]);
          high = i - 1;

          if (low == high)
            strcpy (buf, ucs_symbol (low));
          else
            strcpy (buf, ucs_symbol_range (low, high));

          if (need_semicolon)
            {
              fprintf (stream, ";");
              column++;
            }

          if (column + strlen (buf) > max_column)
            {
              fprintf (stream, "/\n ");
              column = 3;
            }

          fprintf (stream, "%s", buf);
          column += strlen (buf);
          need_semicolon = true;
        }
    }
  fprintf (stream, "\n");
}
/* Output a character mapping table.
   Writes MAPNAME followed by one "(<from>,<to>)" pair for every code
   point that FUNC maps to a different code point, separated by ';'.  A
   line is continued with "/" and a newline once it would grow past
   max_column.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  /* Static storage: a 0x110000-byte automatic array could exhaust the
     stack on platforms with a small default stack size.  */
  static char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond max_column so that the first item forces a line break.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    if (table[i])
      {
        /* "(" + 11 + "," + 11 + ")" characters at most, plus the NUL.  */
        char buf[25+1];

        strcpy (buf, "(");
        strcat (buf, ucs_symbol (i));
        strcat (buf, ",");
        strcat (buf, ucs_symbol (func (i)));
        strcat (buf, ")");

        if (need_semicolon)
          {
            fprintf (stream, ";");
            column++;
          }

        if (column + strlen (buf) > max_column)
          {
            fprintf (stream, "/\n ");
            column = 3;
          }

        fprintf (stream, "%s", buf);
        column += strlen (buf);
        need_semicolon = true;
      }
  fprintf (stream, "\n");
}
/* Output the width table.
   Currently a placeholder: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
}
4912 /* Output the tables to the given file. */
4915 output_tables (const char *filename, const char *version)
4920 stream = fopen (filename, "w");
4923 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4927 fprintf (stream, "escape_char /\n");
4928 fprintf (stream, "comment_char %%\n");
4929 fprintf (stream, "\n");
4930 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4932 fprintf (stream, "\n");
4934 fprintf (stream, "LC_IDENTIFICATION\n");
4935 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4936 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4937 fprintf (stream, "address \"\"\n");
4938 fprintf (stream, "contact \"\"\n");
4939 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4940 fprintf (stream, "tel \"\"\n");
4941 fprintf (stream, "fax \"\"\n");
4942 fprintf (stream, "language \"\"\n");
4943 fprintf (stream, "territory \"Earth\"\n");
4944 fprintf (stream, "revision \"%s\"\n", version);
4949 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4950 fprintf (stream, "date \"%s\"\n", date);
4952 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4953 fprintf (stream, "END LC_IDENTIFICATION\n");
4954 fprintf (stream, "\n");
4956 /* Verifications. */
4957 for (ch = 0; ch < 0x110000; ch++)
4959 /* toupper restriction: "Only characters specified for the keywords
4960 lower and upper shall be specified. */
4961 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4963 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4964 ucs_symbol (ch), ch, to_upper (ch));
4966 /* tolower restriction: "Only characters specified for the keywords
4967 lower and upper shall be specified. */
4968 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4970 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4971 ucs_symbol (ch), ch, to_lower (ch));
4973 /* alpha restriction: "Characters classified as either upper or lower
4974 shall automatically belong to this class. */
4975 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4976 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4978 /* alpha restriction: "No character specified for the keywords cntrl,
4979 digit, punct or space shall be specified." */
4980 if (is_alpha (ch) && is_cntrl (ch))
4981 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4982 if (is_alpha (ch) && is_digit (ch))
4983 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4984 if (is_alpha (ch) && is_punct (ch))
4985 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4986 if (is_alpha (ch) && is_space (ch))
4987 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4989 /* space restriction: "No character specified for the keywords upper,
4990 lower, alpha, digit, graph or xdigit shall be specified."
4991 upper, lower, alpha already checked above. */
4992 if (is_space (ch) && is_digit (ch))
4993 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4994 if (is_space (ch) && is_graph (ch))
4995 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4996 if (is_space (ch) && is_xdigit (ch))
4997 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4999 /* cntrl restriction: "No character specified for the keywords upper,
5000 lower, alpha, digit, punct, graph, print or xdigit shall be
5001 specified." upper, lower, alpha already checked above. */
5002 if (is_cntrl (ch) && is_digit (ch))
5003 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_punct (ch))
5005 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5006 if (is_cntrl (ch) && is_graph (ch))
5007 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5008 if (is_cntrl (ch) && is_print (ch))
5009 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5010 if (is_cntrl (ch) && is_xdigit (ch))
5011 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5013 /* punct restriction: "No character specified for the keywords upper,
5014 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5015 be specified." upper, lower, alpha, cntrl already checked above. */
5016 if (is_punct (ch) && is_digit (ch))
5017 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5018 if (is_punct (ch) && is_xdigit (ch))
5019 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5020 if (is_punct (ch) && (ch == 0x0020))
5021 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5023 /* graph restriction: "No character specified for the keyword cntrl
5024 shall be specified." Already checked above. */
5026 /* print restriction: "No character specified for the keyword cntrl
5027 shall be specified." Already checked above. */
5029 /* graph - print relation: differ only in the <space> character.
5030 How is this possible if there are more than one space character?!
5031 I think susv2/xbd/locale.html should speak of "space characters",
5032 not "space character". */
5033 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5035 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5036 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5038 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5041 fprintf (stream, "LC_CTYPE\n");
5042 output_charclass (stream, "upper", is_upper);
5043 output_charclass (stream, "lower", is_lower);
5044 output_charclass (stream, "alpha", is_alpha);
5045 output_charclass (stream, "digit", is_digit);
5046 output_charclass (stream, "outdigit", is_outdigit);
5047 output_charclass (stream, "blank", is_blank);
5048 output_charclass (stream, "space", is_space);
5049 output_charclass (stream, "cntrl", is_cntrl);
5050 output_charclass (stream, "punct", is_punct);
5051 output_charclass (stream, "xdigit", is_xdigit);
5052 output_charclass (stream, "graph", is_graph);
5053 output_charclass (stream, "print", is_print);
5054 output_charclass (stream, "class \"combining\";", is_combining);
5055 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5056 output_charmap (stream, "toupper", to_upper);
5057 output_charmap (stream, "tolower", to_lower);
5058 output_charmap (stream, "map \"totitle\";", to_title);
5059 output_widthmap (stream);
5060 fprintf (stream, "END LC_CTYPE\n");
5062 if (ferror (stream) || fclose (stream))
5064 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */
/* The width property from the EastAsianWidth.txt file, indexed by code
   point.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
const char * unicode_width[0x110000];
5077 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5080 fill_width (const char *width_filename)
5084 char field0[FIELDLEN];
5085 char field1[FIELDLEN];
5086 char field2[FIELDLEN];
5089 for (i = 0; i < 0x110000; i++)
5090 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5092 stream = fopen (width_filename, "r");
5095 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5110 do c = getc (stream); while (c != EOF && c != '\n');
5114 n = getfield (stream, field0, ';');
5115 n += getfield (stream, field1, ' ');
5116 n += getfield (stream, field2, '\n');
5121 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5124 i = strtoul (field0, NULL, 16);
5125 if (strstr (field0, "..") != NULL)
5127 /* Deal with a range. */
5128 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5130 unicode_width[i] = strdup (field1);
5134 /* Single character line. */
5135 unicode_width[i] = strdup (field1);
5138 if (ferror (stream) || fclose (stream))
5140 fprintf (stderr, "error reading from '%s'\n", width_filename);
/* ========================================================================= */

/* Line breaking classification. */
/* Line breaking classes.  The numbering is not in declaration order: the
   classes numbered 0..23 come first numerically, and those numbered 24 and
   above are the ones resolved at run time.  */
enum
{
  /* Values >= 24 are resolved at run time. */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5190 /* Returns the line breaking classification for ch, as a bit mask. */
/* Every class LBP_x that applies to ch contributes the bit (1 << LBP_x).
   The tests below are guarded by unicode_attributes[ch].name != NULL and
   are driven by hard-coded code point lists, the two-letter general
   category, substrings of the character name and, for the ambiguous (AI)
   decisions, unicode_width[ch].
   The order of the tests matters: the PR, CM, ID/AI and AL/AI tests look at
   bits set by earlier tests before adding their own.
   LBP_XX is set at the very end on a path whose condition is not visible in
   this excerpt (presumably when no other class applied -- TODO confirm). */
5192 get_lbp (unsigned int ch)
5196 if (unicode_attributes[ch].name != NULL)
5198 /* mandatory break */
5199 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5200 || ch == 0x000C /* form feed */
5201 || ch == 0x000B /* line tabulation */
5202 || ch == 0x2028 /* LINE SEPARATOR */
5203 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5204 attr |= 1 << LBP_BK;
5206 if (ch == 0x2060 /* WORD JOINER */
5207 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5208 attr |= 1 << LBP_WJ;
5210 /* zero width space */
5211 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5212 attr |= 1 << LBP_ZW;
5214 /* non-breaking (glue) */
5215 if (ch == 0x00A0 /* NO-BREAK SPACE */
5216 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5217 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5218 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5219 || ch == 0x2007 /* FIGURE SPACE */
5220 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5221 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5222 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5223 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5224 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5225 attr |= 1 << LBP_GL;
5228 if (ch == 0x0020 /* SPACE */)
5229 attr |= 1 << LBP_SP;
5231 /* break opportunity before and after */
5232 if (ch == 0x2014 /* EM DASH */)
5233 attr |= 1 << LBP_B2;
5235 /* break opportunity after */
5236 if (ch == 0x1680 /* OGHAM SPACE MARK */
5237 || ch == 0x2000 /* EN QUAD */
5238 || ch == 0x2001 /* EM QUAD */
5239 || ch == 0x2002 /* EN SPACE */
5240 || ch == 0x2003 /* EM SPACE */
5241 || ch == 0x2004 /* THREE-PER-EM SPACE */
5242 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5243 || ch == 0x2006 /* SIX-PER-EM SPACE */
5244 || ch == 0x2008 /* PUNCTUATION SPACE */
5245 || ch == 0x2009 /* THIN SPACE */
5246 || ch == 0x200A /* HAIR SPACE */
5247 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5248 || ch == 0x0009 /* tab */
5249 || ch == 0x00AD /* SOFT HYPHEN */
5250 || ch == 0x058A /* ARMENIAN HYPHEN */
5251 || ch == 0x2010 /* HYPHEN */
5252 || ch == 0x2012 /* FIGURE DASH */
5253 || ch == 0x2013 /* EN DASH */
5254 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5255 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5256 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5257 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5258 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5259 || ch == 0x2027 /* HYPHENATION POINT */
5260 || ch == 0x007C /* VERTICAL LINE */
5261 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5262 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5263 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5264 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5265 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5266 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5267 || ch == 0x205A /* TWO DOT PUNCTUATION */
5268 || ch == 0x205B /* FOUR DOT MARK */
5269 || ch == 0x205D /* TRICOLON */
5270 || ch == 0x205E /* VERTICAL FOUR DOTS */
5271 || ch == 0x2E19 /* PALM BRANCH */
5272 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5273 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5274 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5275 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5276 || ch == 0x2E30 /* RING POINT */
5277 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5278 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5279 || ch == 0x10102 /* AEGEAN CHECK MARK */
5280 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5281 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5282 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5283 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5284 || ch == 0x0964 /* DEVANAGARI DANDA */
5285 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5286 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5287 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5288 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5289 || ch == 0x104B /* MYANMAR SIGN SECTION */
5290 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5291 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5292 || ch == 0x17D4 /* KHMER SIGN KHAN */
5293 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5294 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5295 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5296 || ch == 0xA8CE /* SAURASHTRA DANDA */
5297 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5298 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5299 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5300 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5301 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5302 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5303 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5304 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5305 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5306 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5307 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5308 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5309 || ch == 0x1804 /* MONGOLIAN COLON */
5310 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5311 || ch == 0x1B5A /* BALINESE PANTI */
5312 || ch == 0x1B5B /* BALINESE PAMADA */
5313 || ch == 0x1B5C /* BALINESE WINDU */
5314 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5315 || ch == 0x1B60 /* BALINESE PAMENENG */
5316 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5317 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5318 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5319 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5320 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5321 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5322 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5323 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5324 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5325 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5326 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5327 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5328 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5329 || ch == 0xA60D /* VAI COMMA */
5330 || ch == 0xA60F /* VAI QUESTION MARK */
5331 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5332 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5333 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5334 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5335 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5336 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5337 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5338 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5339 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5340 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5341 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5342 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5343 attr |= 1 << LBP_BA;
5345 /* break opportunity before */
5346 if (ch == 0x00B4 /* ACUTE ACCENT */
5347 || ch == 0x1FFD /* GREEK OXIA */
5348 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5349 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5350 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5351 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5352 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5353 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5354 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5355 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5356 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5357 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5358 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5359 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5360 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5361 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5362 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5363 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5364 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5365 attr |= 1 << LBP_BB;
5368 if (ch == 0x002D /* HYPHEN-MINUS */)
5369 attr |= 1 << LBP_HY;
5371 /* contingent break opportunity */
5372 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5373 attr |= 1 << LBP_CB;
5375 /* closing punctuation */
5376 if ((unicode_attributes[ch].category[0] == 'P'
5377 && unicode_attributes[ch].category[1] == 'e')
5378 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5379 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5380 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5381 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5382 || ch == 0xFE50 /* SMALL COMMA */
5383 || ch == 0xFE52 /* SMALL FULL STOP */
5384 || ch == 0xFF0C /* FULLWIDTH COMMA */
5385 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5386 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5387 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5388 attr |= 1 << LBP_CL;
5390 /* exclamation/interrogation */
5391 if (ch == 0x0021 /* EXCLAMATION MARK */
5392 || ch == 0x003F /* QUESTION MARK */
5393 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5394 || ch == 0x061B /* ARABIC SEMICOLON */
5395 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5396 || ch == 0x061F /* ARABIC QUESTION MARK */
5397 || ch == 0x06D4 /* ARABIC FULL STOP */
5398 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5399 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5400 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5401 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5402 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5403 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5404 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5405 || ch == 0x1802 /* MONGOLIAN COMMA */
5406 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5407 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5408 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5409 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5410 || ch == 0x1945 /* LIMBU QUESTION MARK */
5411 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5412 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5413 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5414 || ch == 0x2CFE /* COPTIC FULL STOP */
5415 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5416 || ch == 0xA60E /* VAI FULL STOP */
5417 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5418 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5419 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5420 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5421 || ch == 0xFE56 /* SMALL QUESTION MARK */
5422 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5423 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5424 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5425 attr |= 1 << LBP_EX;
5428 if (ch == 0x2024 /* ONE DOT LEADER */
5429 || ch == 0x2025 /* TWO DOT LEADER */
5430 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5431 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5432 attr |= 1 << LBP_IN;
5435 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5436 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5437 || ch == 0x203D /* INTERROBANG */
5438 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5439 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5440 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5441 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5442 || ch == 0x301C /* WAVE DASH */
5443 || ch == 0x303C /* MASU MARK */
5444 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5445 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5446 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5447 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5448 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5449 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5450 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5451 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5452 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5453 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5454 || ch == 0xA015 /* YI SYLLABLE WU */
5455 || ch == 0xFE54 /* SMALL SEMICOLON */
5456 || ch == 0xFE55 /* SMALL COLON */
5457 || ch == 0xFF1A /* FULLWIDTH COLON */
5458 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5459 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5460 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5461 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5462 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5463 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5464 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5465 attr |= 1 << LBP_NS;
5467 /* opening punctuation */
5468 if ((unicode_attributes[ch].category[0] == 'P'
5469 && unicode_attributes[ch].category[1] == 's')
5470 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5471 || ch == 0x00BF /* INVERTED QUESTION MARK */
5472 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5473 attr |= 1 << LBP_OP;
5475 /* ambiguous quotation */
5476 if ((unicode_attributes[ch].category[0] == 'P'
5477 && (unicode_attributes[ch].category[1] == 'f'
5478 || unicode_attributes[ch].category[1] == 'i'))
5479 || ch == 0x0022 /* QUOTATION MARK */
5480 || ch == 0x0027 /* APOSTROPHE */
5481 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5482 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5483 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5484 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5485 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5486 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5487 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5488 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5489 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5490 || ch == 0x2E0B /* RAISED SQUARE */)
5491 attr |= 1 << LBP_QU;
5493 /* infix separator (numeric) */
5494 if (ch == 0x002C /* COMMA */
5495 || ch == 0x002E /* FULL STOP */
5496 || ch == 0x003A /* COLON */
5497 || ch == 0x003B /* SEMICOLON */
5498 || ch == 0x037E /* GREEK QUESTION MARK */
5499 || ch == 0x0589 /* ARMENIAN FULL STOP */
5500 || ch == 0x060C /* ARABIC COMMA */
5501 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5502 || ch == 0x07F8 /* NKO COMMA */
5503 || ch == 0x2044 /* FRACTION SLASH */
5504 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5505 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5506 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5507 attr |= 1 << LBP_IS;
5510 if ((unicode_attributes[ch].category[0] == 'N'
5511 && unicode_attributes[ch].category[1] == 'd'
5512 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5513 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5514 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5515 attr |= 1 << LBP_NU;
5517 /* postfix (numeric) */
5518 if (ch == 0x0025 /* PERCENT SIGN */
5519 || ch == 0x00A2 /* CENT SIGN */
5520 || ch == 0x00B0 /* DEGREE SIGN */
5521 || ch == 0x060B /* AFGHANI SIGN */
5522 || ch == 0x066A /* ARABIC PERCENT SIGN */
5523 || ch == 0x2030 /* PER MILLE SIGN */
5524 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5525 || ch == 0x2032 /* PRIME */
5526 || ch == 0x2033 /* DOUBLE PRIME */
5527 || ch == 0x2034 /* TRIPLE PRIME */
5528 || ch == 0x2035 /* REVERSED PRIME */
5529 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5530 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5531 || ch == 0x20A7 /* PESETA SIGN */
5532 || ch == 0x2103 /* DEGREE CELSIUS */
5533 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5534 || ch == 0xFDFC /* RIAL SIGN */
5535 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5536 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5537 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5538 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5539 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5540 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5541 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5542 attr |= 1 << LBP_PO;
5544 /* prefix (numeric) */
5545 if ((unicode_attributes[ch].category[0] == 'S'
5546 && unicode_attributes[ch].category[1] == 'c')
5547 || ch == 0x002B /* PLUS SIGN */
5548 || ch == 0x005C /* REVERSE SOLIDUS */
5549 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5550 || ch == 0x2116 /* NUMERO SIGN */
5551 || ch == 0x2212 /* MINUS SIGN */
5552 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* A character already classified as postfix above (e.g. a currency symbol
   listed under PO) does not additionally become a prefix. */
5553 if (!(attr & (1 << LBP_PO)))
5554 attr |= 1 << LBP_PR;
5556 /* symbols allowing breaks */
5557 if (ch == 0x002F /* SOLIDUS */)
5558 attr |= 1 << LBP_SY;
5560 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5561 attr |= 1 << LBP_H2;
5563 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5564 attr |= 1 << LBP_H3;
5566 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5567 attr |= 1 << LBP_JL;
5569 if (ch >= 0x1160 && ch <= 0x11A2)
5570 attr |= 1 << LBP_JV;
5572 if (ch >= 0x11A8 && ch <= 0x11F9)
5573 attr |= 1 << LBP_JT;
5575 /* complex context (South East Asian) */
5576 if (((unicode_attributes[ch].category[0] == 'C'
5577 && unicode_attributes[ch].category[1] == 'f')
5578 || (unicode_attributes[ch].category[0] == 'L'
5579 && (unicode_attributes[ch].category[1] == 'm'
5580 || unicode_attributes[ch].category[1] == 'o'))
5581 || (unicode_attributes[ch].category[0] == 'M'
5582 && (unicode_attributes[ch].category[1] == 'c'
5583 || unicode_attributes[ch].category[1] == 'n'))
5584 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5585 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5586 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5587 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5588 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5589 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5590 || (ch >= 0x1000 && ch <= 0x109F)
5591 || (ch >= 0x1780 && ch <= 0x17FF)
5592 || (ch >= 0x1950 && ch <= 0x19DF)))
5593 attr |= 1 << LBP_SA;
5595 /* attached characters and combining marks */
5596 if ((unicode_attributes[ch].category[0] == 'M'
5597 && (unicode_attributes[ch].category[1] == 'c'
5598 || unicode_attributes[ch].category[1] == 'e'
5599 || unicode_attributes[ch].category[1] == 'n'))
5600 || (unicode_attributes[ch].category[0] == 'C'
5601 && (unicode_attributes[ch].category[1] == 'c'
5602 || unicode_attributes[ch].category[1] == 'f')))
/* CM is only a fallback: marks and control/format characters that were
   already classified as BK, BA, GL, SA, WJ or ZW keep that class alone. */
5603 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5604 attr |= 1 << LBP_CM;
5607 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5608 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5609 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5610 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5611 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5612 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5613 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5614 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5615 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5616 || ch == 0xFE62 /* SMALL PLUS SIGN */
5617 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5618 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5619 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5620 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5621 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5622 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5623 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5624 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5625 || (ch >= 0x3000 && ch <= 0x33FF
5626 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5627 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5628 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5629 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5630 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5631 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5632 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5633 || ch == 0xFE45 /* SESAME DOT */
5634 || ch == 0xFE46 /* WHITE SESAME DOT */
5635 || ch == 0xFE49 /* DASHED OVERLINE */
5636 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5637 || ch == 0xFE4B /* WAVY OVERLINE */
5638 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5639 || ch == 0xFE4D /* DASHED LOW LINE */
5640 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5641 || ch == 0xFE4F /* WAVY LOW LINE */
5642 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5643 || ch == 0xFE58 /* SMALL EM DASH */
5644 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5645 || ch == 0xFE60 /* SMALL AMPERSAND */
5646 || ch == 0xFE61 /* SMALL ASTERISK */
5647 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5648 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5649 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5650 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5651 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5652 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5653 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5654 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5655 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5656 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5657 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5658 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5659 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5660 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5661 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5662 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5663 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5664 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5665 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5666 || ch == 0xFF5E /* FULLWIDTH TILDE */
5667 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5668 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5669 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Characters already classified as NS or CM are left out of the
   ideographic classes. */
5670 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5672 /* ambiguous (ideograph) ? */
5673 if ((unicode_width[ch] != NULL
5674 && unicode_width[ch][0] == 'A'
5676 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5677 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5678 attr |= 1 << LBP_AI;
5680 attr |= 1 << LBP_ID;
5683 /* ordinary alphabetic and symbol characters */
5684 if ((unicode_attributes[ch].category[0] == 'L'
5685 && (unicode_attributes[ch].category[1] == 'u'
5686 || unicode_attributes[ch].category[1] == 'l'
5687 || unicode_attributes[ch].category[1] == 't'
5688 || unicode_attributes[ch].category[1] == 'm'
5689 || unicode_attributes[ch].category[1] == 'o'))
5690 || (unicode_attributes[ch].category[0] == 'S'
5691 && (unicode_attributes[ch].category[1] == 'm'
5692 || unicode_attributes[ch].category[1] == 'k'
5693 || unicode_attributes[ch].category[1] == 'o'))
5694 || (unicode_attributes[ch].category[0] == 'N'
5695 && (unicode_attributes[ch].category[1] == 'l'
5696 || unicode_attributes[ch].category[1] == 'o'))
5697 || (unicode_attributes[ch].category[0] == 'P'
5698 && (unicode_attributes[ch].category[1] == 'c'
5699 || unicode_attributes[ch].category[1] == 'd'
5700 || unicode_attributes[ch].category[1] == 'o'))
5701 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5702 || ch == 0x0601 /* ARABIC SIGN SANAH */
5703 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5704 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5705 || ch == 0x06DD /* ARABIC END OF AYAH */
5706 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5707 || ch == 0x2061 /* FUNCTION APPLICATION */
5708 || ch == 0x2062 /* INVISIBLE TIMES */
5709 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5710 || ch == 0x2064 /* INVISIBLE PLUS */)
/* AL/AI is the catch-all: it is skipped as soon as any of the more
   specific classes listed in this mask has been assigned above. */
5711 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5713 /* ambiguous (alphabetic) ? */
5714 if ((unicode_width[ch] != NULL
5715 && unicode_width[ch][0] == 'A'
5717 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5718 && ch != 0x2022 /* BULLET */
5719 && ch != 0x203E /* OVERLINE */
5720 && ch != 0x2126 /* OHM SIGN */
5721 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5722 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5723 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5724 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5725 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5726 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5727 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5728 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5729 || ch == 0x00A7 /* SECTION SIGN */
5730 || ch == 0x00A8 /* DIAERESIS */
5731 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5732 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5733 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5734 || ch == 0x00B6 /* PILCROW SIGN */
5735 || ch == 0x00B7 /* MIDDLE DOT */
5736 || ch == 0x00B8 /* CEDILLA */
5737 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5738 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5739 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5740 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5741 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5742 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5743 || ch == 0x00F7 /* DIVISION SIGN */
5744 || ch == 0x02C7 /* CARON */
5745 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5746 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5747 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5748 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5749 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5750 || ch == 0x02D8 /* BREVE */
5751 || ch == 0x02D9 /* DOT ABOVE */
5752 || ch == 0x02DA /* RING ABOVE */
5753 || ch == 0x02DB /* OGONEK */
5754 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5755 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5756 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5757 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5758 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5759 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5760 || ch == 0x2616 /* WHITE SHOGI PIECE */
5761 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5762 attr |= 1 << LBP_AI;
5764 attr |= 1 << LBP_AL;
/* The CM bit is not kept alongside an AI/AL classification. */
5765 attr &= ~(1 << LBP_CM);
/* NOTE(review): LBP_XX is 31, so this shifts a signed 1 into the sign bit. */
5771 attr |= 1 << LBP_XX;
5776 /* Output the line breaking properties in a human readable format. */
/* Walks all code points 0 .. 0x10FFFF and, for each one whose get_lbp()
   mask is something other than the lone LBP_XX bit, writes to STREAM the
   code point as "0x%04X" followed by " LBP_xx" for every bit that is set,
   then a newline.  PRINT_BIT stringifies the enumerator name, so the output
   contains the identifiers as written in the source. */
5778 debug_output_lbp (FILE *stream)
5782 for (i = 0; i < 0x110000; i++)
5784 int attr = get_lbp (i);
5785 if (attr != 1 << LBP_XX)
5787 fprintf (stream, "0x%04X", i);
/* Here attr is a bit mask, hence the "&" test. */
5788 #define PRINT_BIT(attr,bit) \
5789 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5790 PRINT_BIT(attr,LBP_BK);
5791 PRINT_BIT(attr,LBP_CM);
5792 PRINT_BIT(attr,LBP_WJ);
5793 PRINT_BIT(attr,LBP_ZW);
5794 PRINT_BIT(attr,LBP_GL);
5795 PRINT_BIT(attr,LBP_SP);
5796 PRINT_BIT(attr,LBP_B2);
5797 PRINT_BIT(attr,LBP_BA);
5798 PRINT_BIT(attr,LBP_BB);
5799 PRINT_BIT(attr,LBP_HY);
5800 PRINT_BIT(attr,LBP_CB);
5801 PRINT_BIT(attr,LBP_CL);
5802 PRINT_BIT(attr,LBP_EX);
5803 PRINT_BIT(attr,LBP_IN);
5804 PRINT_BIT(attr,LBP_NS);
5805 PRINT_BIT(attr,LBP_OP);
5806 PRINT_BIT(attr,LBP_QU);
5807 PRINT_BIT(attr,LBP_IS);
5808 PRINT_BIT(attr,LBP_NU);
5809 PRINT_BIT(attr,LBP_PO);
5810 PRINT_BIT(attr,LBP_PR);
5811 PRINT_BIT(attr,LBP_SY);
5812 PRINT_BIT(attr,LBP_AI);
5813 PRINT_BIT(attr,LBP_AL);
5814 PRINT_BIT(attr,LBP_H2);
5815 PRINT_BIT(attr,LBP_H3);
5816 PRINT_BIT(attr,LBP_ID);
5817 PRINT_BIT(attr,LBP_JL);
5818 PRINT_BIT(attr,LBP_JV);
5819 PRINT_BIT(attr,LBP_JT);
5820 PRINT_BIT(attr,LBP_SA);
5821 PRINT_BIT(attr,LBP_XX);
5823 fprintf (stream, "\n");
/* debug_output_lbrk_tables: opens FILENAME for writing, dumps the
   classification computed by get_lbp() through debug_output_lbp(), and
   checks the stream for write errors when closing it.  Open and write
   failures are reported on stderr. */
5829 debug_output_lbrk_tables (const char *filename)
5833 stream = fopen (filename, "w");
5836 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5840 debug_output_lbp (stream);
/* fclose() is part of the check so that a failed final flush is noticed. */
5842 if (ferror (stream) || fclose (stream))
5844 fprintf (stderr, "error writing to '%s'\n", filename);
5849 /* The line breaking property from the LineBreak.txt file.
   One slot per code point (0 .. 0x10FFFF).  Unlike the bit mask returned
   by get_lbp(), each slot holds a single LBP_* enumerator value; slots not
   mentioned in the file stay at LBP_XX (see fill_org_lbp()). */
5850 int unicode_org_lbp[0x110000];
5852 /* Stores in unicode_org_lbp[] the line breaking property from the
5853 LineBreak.txt file. */
/* LINEBREAK_FILENAME is the path handed to fopen() for reading.
   Every slot starts out as LBP_XX.  Each data line then assigns the class
   named in its second field to one code point or to a "first..last" range.
   The class names LF, CR and NL are folded into LBP_BK and SG into LBP_XX,
   matching the enumerators that are commented out in the LBP_* list.
   Open, parse, unknown-class and read failures are reported on stderr.
   NOTE(review): the code point values parsed from the file are used as
   array indices; no range check against 0x110000 is visible here. */
5855 fill_org_lbp (const char *linebreak_filename)
5859 char field0[FIELDLEN];
5860 char field1[FIELDLEN];
5861 char field2[FIELDLEN];
5864 for (i = 0; i < 0x110000; i++)
5865 unicode_org_lbp[i] = LBP_XX;
5867 stream = fopen (linebreak_filename, "r");
5870 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Discard the remainder of the current line (presumably a '#' comment
   line; the test that selects this path is not visible here). */
5886 do c = getc (stream); while (c != EOF && c != '\n');
/* field0: code point or range, up to ';'.  field1: class name, up to the
   first space.  field2: rest of the line. */
5890 n = getfield (stream, field0, ';');
5891 n += getfield (stream, field1, ' ');
5892 n += getfield (stream, field2, '\n');
5897 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit is the stringified enumerator; "+ 4" steps over its first four
   characters (the "LBP_" prefix when invoked with an LBP_xx name) so that
   field1 is compared against the bare class name. */
5901 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
5936 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5937 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5938 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5939 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5942 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5943 field1, linebreak_filename, lineno);
/* Code points are written in hexadecimal. */
5946 i = strtoul (field0, NULL, 16);
5947 if (strstr (field0, "..") != NULL)
5949 /* Deal with a range. */
5950 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5952 unicode_org_lbp[i] = value;
5956 /* Single character line. */
5957 unicode_org_lbp[i] = value;
5960 if (ferror (stream) || fclose (stream))
5962 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5967 /* Output the line breaking properties in a human readable format. */
/* Same output layout as debug_output_lbp(), but the data comes from
   unicode_org_lbp[], i.e. from LineBreak.txt rather than from get_lbp().
   Each entry is a single LBP_* value, so at most one name is printed after
   the "0x%04X" code point.
   NOTE(review): PRINT_BIT is defined here with a different body than in
   debug_output_lbp(); an intervening #undef is presumably present but is
   not visible in this excerpt. */
5969 debug_output_org_lbp (FILE *stream)
5973 for (i = 0; i < 0x110000; i++)
5975 int attr = unicode_org_lbp[i];
5978 fprintf (stream, "0x%04X", i);
/* Here attr is an enumerator value, not a mask, hence the "==" test. */
5979 #define PRINT_BIT(attr,bit) \
5980 if (attr == bit) fprintf (stream, " " #bit);
5981 PRINT_BIT(attr,LBP_BK);
5982 PRINT_BIT(attr,LBP_CM);
5983 PRINT_BIT(attr,LBP_WJ);
5984 PRINT_BIT(attr,LBP_ZW);
5985 PRINT_BIT(attr,LBP_GL);
5986 PRINT_BIT(attr,LBP_SP);
5987 PRINT_BIT(attr,LBP_B2);
5988 PRINT_BIT(attr,LBP_BA);
5989 PRINT_BIT(attr,LBP_BB);
5990 PRINT_BIT(attr,LBP_HY);
5991 PRINT_BIT(attr,LBP_CB);
5992 PRINT_BIT(attr,LBP_CL);
5993 PRINT_BIT(attr,LBP_EX);
5994 PRINT_BIT(attr,LBP_IN);
5995 PRINT_BIT(attr,LBP_NS);
5996 PRINT_BIT(attr,LBP_OP);
5997 PRINT_BIT(attr,LBP_QU);
5998 PRINT_BIT(attr,LBP_IS);
5999 PRINT_BIT(attr,LBP_NU);
6000 PRINT_BIT(attr,LBP_PO);
6001 PRINT_BIT(attr,LBP_PR);
6002 PRINT_BIT(attr,LBP_SY);
6003 PRINT_BIT(attr,LBP_AI);
6004 PRINT_BIT(attr,LBP_AL);
6005 PRINT_BIT(attr,LBP_H2);
6006 PRINT_BIT(attr,LBP_H3);
6007 PRINT_BIT(attr,LBP_ID);
6008 PRINT_BIT(attr,LBP_JL);
6009 PRINT_BIT(attr,LBP_JV);
6010 PRINT_BIT(attr,LBP_JT);
6011 PRINT_BIT(attr,LBP_SA);
6012 PRINT_BIT(attr,LBP_XX);
6014 fprintf (stream, "\n");
/* Opens FILENAME for writing, lets debug_output_org_lbp() write into it, and
   reports a failure of the open or of the write/close on stderr.
   NOTE(review): the return type, braces, the stream declaration, the test of
   the fopen result and the statements following each error message are
   missing from this excerpt.  */
6020 debug_output_org_lbrk_tables (const char *filename)
6024 stream = fopen (filename, "w");
6027 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6031 debug_output_org_lbp (stream);
/* fclose is only attempted when no earlier stream error was recorded; a
   nonzero fclose result is reported as a write error as well.  */
6033 if (ferror (stream) || fclose (stream))
6035 fprintf (stderr, "error writing to '%s'\n", filename);
6040 /* Construction of sparse 3-level tables. */
/* These macros select the table name (lbp_table), its element type and its
   default element (LBP_XX), and route xmalloc/xrealloc to malloc/realloc.
   NOTE(review): the code that consumes them is not visible in this excerpt;
   presumably a shared table template is included right after -- confirm.  */
6041 #define TABLE lbp_table
6042 #define ELEMENT unsigned char
6043 #define DEFAULT LBP_XX
6044 #define xmalloc malloc
6045 #define xrealloc realloc
/* Builds a table from get_lbp() for all code points below 0x110000 and
   prints it as C source text: the header macros, the lbrkprop_t typedef and
   the extern declaration go to STREAM1, the initialized unilbrkprop object
   goes to STREAM2.
   NOTE(review): some lines of this function (return type, braces, several
   declarations and short statements) are missing from this excerpt.  */
6049 output_lbp (FILE *stream1, FILE *stream2)
6053 unsigned int level1_offset, level2_offset, level3_offset;
6057 lbp_table_init (&t);
6059 for (i = 0; i < 0x110000; i++)
6061 int attr = get_lbp (i);
6063 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the test below is true
   when attr has no bit or more than one bit set.  The statement it controls
   is not visible in this excerpt.  */
6064 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only values other than the LBP_XX bit are added; LBP_XX is the DEFAULT
   element defined above.  */
6067 if (attr != 1 << LBP_XX)
6069 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6070 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6072 lbp_table_add (&t, i, log2_attr);
6076 lbp_table_finalize (&t);
/* Byte offsets into t.result: a header of 5 uint32_t words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): the left-hand sides of these three expressions are missing
   from this excerpt; presumably level1_offset, level2_offset and
   level3_offset in that order -- confirm.  */
6079 5 * sizeof (uint32_t);
6081 5 * sizeof (uint32_t)
6082 + t.level1_size * sizeof (uint32_t);
6084 5 * sizeof (uint32_t)
6085 + t.level1_size * sizeof (uint32_t)
6086 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the 5 header words of t.result as preprocessor constants.  */
6088 for (i = 0; i < 5; i++)
6089 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6090 ((uint32_t *) t.result)[i]);
6091 fprintf (stream1, "\n");
/* Emit the type of the table; the array sizes of level2 and level3 are
   written as shift expressions.  */
6092 fprintf (stream1, "typedef struct\n");
6093 fprintf (stream1, " {\n");
6094 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6095 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6096 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6097 fprintf (stream1, " }\n");
6098 fprintf (stream1, "lbrkprop_t;\n");
6099 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
/* Emit the initializer.  Level 1: a line break is inserted before every 8th
   entry; an entry is printed relative to level2_offset, in units of
   uint32_t.  */
6101 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6102 fprintf (stream2, "{\n");
6103 fprintf (stream2, " {");
6104 if (t.level1_size > 8)
6105 fprintf (stream2, "\n ");
6106 for (i = 0; i < t.level1_size; i++)
6109 if (i > 0 && (i % 8) == 0)
6110 fprintf (stream2, "\n ");
6111 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the condition that chooses between printing -1 and printing
   the index is not visible in this excerpt.  */
6113 fprintf (stream2, " %5d", -1);
6115 fprintf (stream2, " %5zu",
6116 (offset - level2_offset) / sizeof (uint32_t));
/* No comma after the last entry.  */
6117 if (i+1 < t.level1_size)
6118 fprintf (stream2, ",");
6120 if (t.level1_size > 8)
6121 fprintf (stream2, "\n ");
6122 fprintf (stream2, " },\n");
/* Level 2: same layout; an entry is printed relative to level3_offset, in
   units of unsigned char.  */
6123 fprintf (stream2, " {");
6124 if (t.level2_size << t.q > 8)
6125 fprintf (stream2, "\n ");
6126 for (i = 0; i < t.level2_size << t.q; i++)
6129 if (i > 0 && (i % 8) == 0)
6130 fprintf (stream2, "\n ");
6131 offset = ((uint32_t *) (t.result + level2_offset))[i];
6133 fprintf (stream2, " %5d", -1);
6135 fprintf (stream2, " %5zu",
6136 (offset - level3_offset) / sizeof (unsigned char));
6137 if (i+1 < t.level2_size << t.q)
6138 fprintf (stream2, ",");
6140 if (t.level2_size << t.q > 8)
6141 fprintf (stream2, "\n ");
6142 fprintf (stream2, " },\n");
/* Level 3: each element is printed by name.  CASE(x) stores the identifier
   of x, turned into a string by #x, in value_string.
   NOTE(review): the switch statement and its list of CASE uses are missing
   from this excerpt.  */
6143 fprintf (stream2, " {");
6144 if (t.level3_size << t.p > 8)
6145 fprintf (stream2, "\n ");
6146 for (i = 0; i < t.level3_size << t.p; i++)
6148 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6149 const char *value_string;
6152 #define CASE(x) case x: value_string = #x; break;
6189 if (i > 0 && (i % 8) == 0)
6190 fprintf (stream2, "\n ");
6191 fprintf (stream2, " %s%s", value_string,
6192 (i+1 < t.level3_size << t.p ? "," : ""));
6194 if (t.level3_size << t.p > 8)
6195 fprintf (stream2, "\n ");
6196 fprintf (stream2, " }\n");
6197 fprintf (stream2, "};\n");
/* Opens FILENAME1 and FILENAME2 for writing, writes the same banner and
   license header into both, then lets output_lbp() fill them (FILENAME1 is
   passed as its first stream, FILENAME2 as its second) and finally checks
   both streams for write errors.
   NOTE(review): the return type, braces, some declarations, the argument
   completing the "Generated automatically" fprintf and the statements
   following each error message are missing from this excerpt.  */
6201 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6203 const char *filenames[2];
6207 filenames[0] = filename1;
6208 filenames[1] = filename2;
/* Open both output files.  */
6210 for (i = 0; i < 2; i++)
6212 streams[i] = fopen (filenames[i], "w");
6213 if (streams[i] == NULL)
6215 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the banner and the license header into each file.  */
6220 for (i = 0; i < 2; i++)
6222 FILE *stream = streams[i];
6224 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6225 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6226 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6228 fprintf (stream, "\n");
6230 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6231 still carries the GPL header), and it's gnulib-tool which replaces the
6232 GPL header with an LGPL header. */
6233 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6234 fprintf (stream, "\n");
6235 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6236 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6237 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6238 fprintf (stream, " (at your option) any later version.\n");
6239 fprintf (stream, "\n");
6240 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6241 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6242 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6243 fprintf (stream, " GNU General Public License for more details.\n");
6244 fprintf (stream, "\n");
6245 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6246 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6247 fprintf (stream, "\n");
/* Generate the table contents.  */
6250 output_lbp (streams[0], streams[1]);
/* Close both files; a recorded stream error or a failing fclose is reported
   as a write error.  */
6252 for (i = 0; i < 2; i++)
6254 if (ferror (streams[i]) || fclose (streams[i]))
6256 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6262 /* ========================================================================= */
6264 /* Word break property. */
6266 /* Possible values of the Word_Break property. */
6281 WBP_EXTENDNUMLET = 7
6284 /* Returns the word breaking property for ch, as a bit mask. */
/* Each rule below sets bit (1 << WBP_xxx) in attr when its condition holds.
   NOTE(review): several short lines of this function are missing from this
   excerpt, including the return type, braces, the declaration of attr, the
   conditions in front of the WBP_CR, WBP_LF and WBP_OTHER assignments, and
   individual terms of some multi-line conditions.  The comments describe
   only what is visible.  */
6286 get_wbp (unsigned int ch)
/* Presumably this guards all rules below, so that code points without a
   name in unicode_attributes[] get none of them -- confirm against the full
   source.  */
6290 if (unicode_attributes[ch].name != NULL)
6293 attr |= 1 << WBP_CR;
6296 attr |= 1 << WBP_LF;
/* Newline: the explicitly listed control and separator characters.  */
6298 if (ch == 0x000B || ch == 0x000C
6300 || ch == 0x2028 || ch == 0x2029)
6301 attr |= 1 << WBP_NEWLINE;
/* Extend: the PROP_GRAPHEME_EXTEND bit is set in unicode_properties[], or
   the general category is "Mc".  */
6303 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6304 || (unicode_attributes[ch].category != NULL
6305 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6306 attr |= 1 << WBP_EXTEND;
/* Format: general category "Cf", except U+200C and U+200D.  */
6308 if (unicode_attributes[ch].category != NULL
6309 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6310 && ch != 0x200C && ch != 0x200D)
6311 attr |= 1 << WBP_FORMAT;
/* Katakana: script name "Katakana", or one of the listed code points.  */
6313 if ((unicode_scripts[ch] < numscripts
6314 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6315 || (ch >= 0x3031 && ch <= 0x3035)
6316 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6318 attr |= 1 << WBP_KATAKANA;
/* ALetter: the PROP_ALPHABETIC bit is set (the rest of this first
   alternative is not visible), and the character is not ideographic, not
   already Katakana, does not have the LBP_SA bit from get_lbp(), is not in
   the "Hiragana" script, and is not already Extend.  */
6320 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6322 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6323 && (attr & (1 << WBP_KATAKANA)) == 0
6324 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6325 && !(unicode_scripts[ch] < numscripts
6326 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6327 && (attr & (1 << WBP_EXTEND)) == 0)
6328 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter: fixed lists of code points.  */
6330 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6331 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6332 attr |= 1 << WBP_MIDNUMLET;
6334 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6335 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6336 attr |= 1 << WBP_MIDLETTER;
/* MidNum: the LBP_IS bit from get_lbp() or one of the listed code points,
   but never U+003A, U+FE13 or U+002E.  */
6338 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6339 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6341 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6342 attr |= 1 << WBP_MIDNUM;
/* Numeric: the LBP_NU bit from get_lbp() (the remainder of the condition is
   not visible).  */
6344 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6346 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category "Pc".  */
6348 if (unicode_attributes[ch].category != NULL
6349 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6350 attr |= 1 << WBP_EXTENDNUMLET;
6355 attr |= 1 << WBP_OTHER;
6360 /* Output the word break property in a human readable format. */
/* For every code point below 0x110000 whose get_wbp() mask differs from the
   single WBP_OTHER bit, the visible statements print the code point followed
   by the name of every bit that is set in the mask.
   NOTE(review): the return type, braces and local declarations are missing
   from this excerpt.  */
6362 debug_output_wbp (FILE *stream)
6366 for (i = 0; i < 0x110000; i++)
6368 int attr = get_wbp (i);
6369 if (attr != 1 << WBP_OTHER)
6371 fprintf (stream, "0x%04X", i);
/* The bits are tested independently, so a mask with several bits set yields
   several names on the same line.  */
6372 if (attr & (1 << WBP_CR))
6373 fprintf (stream, " CR");
6374 if (attr & (1 << WBP_LF))
6375 fprintf (stream, " LF");
6376 if (attr & (1 << WBP_NEWLINE))
6377 fprintf (stream, " Newline");
6378 if (attr & (1 << WBP_EXTEND))
6379 fprintf (stream, " Extend");
6380 if (attr & (1 << WBP_FORMAT))
6381 fprintf (stream, " Format");
6382 if (attr & (1 << WBP_KATAKANA))
6383 fprintf (stream, " Katakana");
6384 if (attr & (1 << WBP_ALETTER))
6385 fprintf (stream, " ALetter");
6386 if (attr & (1 << WBP_MIDNUMLET))
6387 fprintf (stream, " MidNumLet");
6388 if (attr & (1 << WBP_MIDLETTER))
6389 fprintf (stream, " MidLetter");
6390 if (attr & (1 << WBP_MIDNUM))
6391 fprintf (stream, " MidNum");
6392 if (attr & (1 << WBP_NUMERIC))
6393 fprintf (stream, " Numeric");
6394 if (attr & (1 << WBP_EXTENDNUMLET))
6395 fprintf (stream, " ExtendNumLet");
6396 fprintf (stream, "\n");
/* Opens FILENAME for writing, lets debug_output_wbp() write into it, and
   reports a failure of the open or of the write/close on stderr.
   NOTE(review): the return type, braces, the stream declaration, the test of
   the fopen result and the statements following each error message are
   missing from this excerpt.  */
6402 debug_output_wbrk_tables (const char *filename)
6406 stream = fopen (filename, "w");
6409 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6413 debug_output_wbp (stream);
/* fclose is only attempted when no earlier stream error was recorded.  */
6415 if (ferror (stream) || fclose (stream))
6417 fprintf (stderr, "error writing to '%s'\n", filename);
6422 /* The word break property from the WordBreakProperty.txt file. */
/* One entry per code point below 0x110000; each entry holds a WBP_* value
   (not a bit mask).  */
6423 int unicode_org_wbp[0x110000];
6425 /* Stores in unicode_org_wbp[] the word break property from the
6426 WordBreakProperty.txt file. */
/* NOTE(review): the return type, braces, several declarations (among them
   buf, i, propvalue and stream) and the statements controlled by some of the
   tests below are missing from this excerpt.  */
6428 fill_org_wbp (const char *wordbreakproperty_filename)
/* Every code point starts out as WBP_OTHER.  */
6433 for (i = 0; i < 0x110000; i++)
6434 unicode_org_wbp[i] = WBP_OTHER;
6436 stream = fopen (wordbreakproperty_filename, "r");
6439 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6446 unsigned int i1, i2;
6447 char padding[200+1];
6448 char propname[200+1];
/* Read up to 200 characters of one line into buf; the trailing "\n" in the
   format also consumes any whitespace that follows.  A result below 1 means
   nothing was stored (end of input, or no match).  */
6451 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Test for empty lines and '#' comment lines; the statement this test
   controls is not visible here.  */
6454 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form "XXXX..YYYY", then the single code point form
   "XXXX"; in both, a run of spaces and semicolons separates the code
   point(s) from the property name, which extends to the next space.  */
6457 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6459 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6461 fprintf (stderr, "parse error in '%s'\n",
6462 wordbreakproperty_filename);
/* PROP maps a property name to its WBP_* value; the trailing "else" chains
   the uses below into one if/else-if sequence.  */
6467 #define PROP(name,value) \
6468 if (strcmp (propname, name) == 0) propvalue = value; else
6471 PROP ("Newline", WBP_NEWLINE)
6472 PROP ("Extend", WBP_EXTEND)
6473 PROP ("Format", WBP_FORMAT)
6474 PROP ("Katakana", WBP_KATAKANA)
6475 PROP ("ALetter", WBP_ALETTER)
6476 PROP ("MidNumLet", WBP_MIDNUMLET)
6477 PROP ("MidLetter", WBP_MIDLETTER)
6478 PROP ("MidNum", WBP_MIDNUM)
6479 PROP ("Numeric", WBP_NUMERIC)
6480 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6483 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6484 wordbreakproperty_filename);
/* Range sanity test: the range must be ascending and end below 0x110000.
   The statement it controls is not visible here.  */
6487 if (!(i1 <= i2 && i2 < 0x110000))
/* Store the value for every code point of the (inclusive) range.  */
6490 for (i = i1; i <= i2; i++)
6491 unicode_org_wbp[i] = propvalue;
6494 if (ferror (stream) || fclose (stream))
6496 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6501 /* Output the word break property in a human readable format. */
/* For every code point below 0x110000 whose unicode_org_wbp[] entry differs
   from WBP_OTHER, the visible statements print the code point followed by
   the property name, or " ??" when none of the PROP alternatives matches.
   NOTE(review): the return type, braces, local declarations and possibly
   further PROP uses are missing from this excerpt.  */
6503 debug_output_org_wbp (FILE *stream)
6507 for (i = 0; i < 0x110000; i++)
6509 int propvalue = unicode_org_wbp[i];
6510 if (propvalue != WBP_OTHER)
6512 fprintf (stream, "0x%04X", i);
/* PROP prints " <name>" when propvalue equals the given value; the trailing
   "else" chains the uses below into one if/else-if sequence that ends in
   the " ??" fallback.  */
6513 #define PROP(name,value) \
6514 if (propvalue == value) fprintf (stream, " " name); else
6517 PROP ("Newline", WBP_NEWLINE)
6518 PROP ("Extend", WBP_EXTEND)
6519 PROP ("Format", WBP_FORMAT)
6520 PROP ("Katakana", WBP_KATAKANA)
6521 PROP ("ALetter", WBP_ALETTER)
6522 PROP ("MidNumLet", WBP_MIDNUMLET)
6523 PROP ("MidLetter", WBP_MIDLETTER)
6524 PROP ("MidNum", WBP_MIDNUM)
6525 PROP ("Numeric", WBP_NUMERIC)
6526 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6528 fprintf (stream, " ??");
6529 fprintf (stream, "\n");
/* Opens FILENAME for writing, lets debug_output_org_wbp() write into it, and
   reports a failure of the open or of the write/close on stderr.
   NOTE(review): the return type, braces, the stream declaration, the test of
   the fopen result and the statements following each error message are
   missing from this excerpt.  */
6535 debug_output_org_wbrk_tables (const char *filename)
6539 stream = fopen (filename, "w");
6542 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6546 debug_output_org_wbp (stream);
/* fclose is only attempted when no earlier stream error was recorded.  */
6548 if (ferror (stream) || fclose (stream))
6550 fprintf (stderr, "error writing to '%s'\n", filename);
6555 /* Construction of sparse 3-level tables. */
/* These macros select the table name (wbp_table), its element type and its
   default element (WBP_OTHER), and route xmalloc/xrealloc to malloc/realloc.
   NOTE(review): the code that consumes them is not visible in this excerpt;
   presumably a shared table template is included right after -- confirm.  */
6556 #define TABLE wbp_table
6557 #define ELEMENT unsigned char
6558 #define DEFAULT WBP_OTHER
6559 #define xmalloc malloc
6560 #define xrealloc realloc
/* Builds a table from get_wbp() for all code points below 0x110000 and
   prints it to STREAM as C source text: header macros, the wbrkprop_t
   typedef and the initialized static object uniwbrkprop.
   NOTE(review): some lines of this function (return type, braces, several
   declarations and short statements) are missing from this excerpt.  */
6564 output_wbp (FILE *stream)
6568 unsigned int level1_offset, level2_offset, level3_offset;
6572 wbp_table_init (&t);
6574 for (i = 0; i < 0x110000; i++)
6576 int attr = get_wbp (i);
6578 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the test below is true
   when attr has no bit or more than one bit set.  The statement it controls
   is not visible in this excerpt.  */
6579 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only values other than the WBP_OTHER bit are added; WBP_OTHER is the
   DEFAULT element defined above.  */
6582 if (attr != 1 << WBP_OTHER)
6584 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6585 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6587 wbp_table_add (&t, i, log2_attr);
6591 wbp_table_finalize (&t);
/* Byte offsets into t.result: a header of 5 uint32_t words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): the left-hand sides of these three expressions are missing
   from this excerpt; presumably level1_offset, level2_offset and
   level3_offset in that order -- confirm.  */
6594 5 * sizeof (uint32_t);
6596 5 * sizeof (uint32_t)
6597 + t.level1_size * sizeof (uint32_t);
6599 5 * sizeof (uint32_t)
6600 + t.level1_size * sizeof (uint32_t)
6601 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the 5 header words of t.result as preprocessor constants.  */
6603 for (i = 0; i < 5; i++)
6604 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6605 ((uint32_t *) t.result)[i]);
6606 fprintf (stream, "\n");
/* Emit the type of the table; the array sizes of level2 and level3 are
   written as shift expressions.  */
6607 fprintf (stream, "typedef struct\n");
6608 fprintf (stream, " {\n");
6609 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6610 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6611 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6612 fprintf (stream, " }\n");
6613 fprintf (stream, "wbrkprop_t;\n");
/* Emit the initializer.  Level 1: a line break is inserted before every 8th
   entry; an entry is printed relative to level2_offset, in units of
   uint32_t.  */
6614 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6615 fprintf (stream, "{\n");
6616 fprintf (stream, " {");
6617 if (t.level1_size > 8)
6618 fprintf (stream, "\n ");
6619 for (i = 0; i < t.level1_size; i++)
6622 if (i > 0 && (i % 8) == 0)
6623 fprintf (stream, "\n ");
6624 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the condition that chooses between printing -1 and printing
   the index is not visible in this excerpt.  */
6626 fprintf (stream, " %5d", -1);
6628 fprintf (stream, " %5zu",
6629 (offset - level2_offset) / sizeof (uint32_t));
/* No comma after the last entry.  */
6630 if (i+1 < t.level1_size)
6631 fprintf (stream, ",");
6633 if (t.level1_size > 8)
6634 fprintf (stream, "\n ");
6635 fprintf (stream, " },\n");
/* Level 2: same layout; an entry is printed relative to level3_offset, in
   units of unsigned char.  */
6636 fprintf (stream, " {");
6637 if (t.level2_size << t.q > 8)
6638 fprintf (stream, "\n ");
6639 for (i = 0; i < t.level2_size << t.q; i++)
6642 if (i > 0 && (i % 8) == 0)
6643 fprintf (stream, "\n ");
6644 offset = ((uint32_t *) (t.result + level2_offset))[i];
6646 fprintf (stream, " %5d", -1);
6648 fprintf (stream, " %5zu",
6649 (offset - level3_offset) / sizeof (unsigned char));
6650 if (i+1 < t.level2_size << t.q)
6651 fprintf (stream, ",");
6653 if (t.level2_size << t.q > 8)
6654 fprintf (stream, "\n ");
6655 fprintf (stream, " },\n");
/* Level 3: each element is printed by name, with a line break before every
   4th entry.  CASE(x) stores the identifier of x, turned into a string by
   #x, in value_string.
   NOTE(review): the switch statement and some of its CASE uses are missing
   from this excerpt.  */
6656 fprintf (stream, " {");
6657 if (t.level3_size << t.p > 4)
6658 fprintf (stream, "\n ");
6659 for (i = 0; i < t.level3_size << t.p; i++)
6661 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6662 const char *value_string;
6665 #define CASE(x) case x: value_string = #x; break;
6674 CASE(WBP_MIDNUMLET);
6675 CASE(WBP_MIDLETTER);
6678 CASE(WBP_EXTENDNUMLET);
6683 if (i > 0 && (i % 4) == 0)
6684 fprintf (stream, "\n ");
6685 fprintf (stream, " %s%s", value_string,
6686 (i+1 < t.level3_size << t.p ? "," : ""));
6688 if (t.level3_size << t.p > 4)
6689 fprintf (stream, "\n ");
6690 fprintf (stream, " }\n");
6691 fprintf (stream, "};\n");
/* Writes the word break property table, as generated C source text, to the
   file FILENAME.

   FILENAME is created or truncated.  VERSION is the Unicode version string
   that is mentioned in the banner comment of the generated file.  The table
   itself is produced by output_wbp().

   On failure to open the file, or on a write error, a message is printed to
   stderr and the program exits with status 1.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file holds the word break table, so the banner must say so; it
     used to carry the text of the line break table's banner.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it. The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header. */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  /* fclose is only attempted when no earlier stream error was recorded; a
     failing fclose counts as a write error too, since it flushes.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6740 /* ========================================================================= */
6742 /* Grapheme break property. */
6744 /* Possible values of the Grapheme_Cluster_Break property. */
6753 GBP_SPACINGMARK = 6,
6761 /* Construction of sparse 3-level tables. */
/* These macros select the table name (gbp_table), its element type and its
   default element (GBP_OTHER), and route xmalloc/xrealloc to malloc/realloc.
   NOTE(review): the code that consumes them is not visible in this excerpt;
   presumably a shared table template is included right after -- confirm.  */
6762 #define TABLE gbp_table
6763 #define ELEMENT unsigned char
6764 #define DEFAULT GBP_OTHER
6765 #define xmalloc malloc
6766 #define xrealloc realloc
6769 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* One entry per code point below 0x110000; each entry holds a GBP_* value.  */
6770 int unicode_org_gbp[0x110000];
6772 /* Output the unit test data for the grapheme break property. */
/* Writes a banner and license header to FILENAME, followed by a
   comma-separated list of "{ 0x<code point>, <GBP_* name> }" entries derived
   from unicode_org_gbp[].
   NOTE(review): the return type, braces, local declarations, the body of the
   inner while loop, the switch around the CASE uses and the conditions
   around some fprintf calls are missing from this excerpt.  */
6774 output_gbp_test (const char *filename)
6780 stream = fopen (filename, "w");
6783 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6787 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6788 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
6789 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
6790 fprintf (stream, "\n");
6791 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6792 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6793 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6794 fprintf (stream, " (at your option) any later version.\n");
6795 fprintf (stream, "\n");
6796 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6797 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6798 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6799 fprintf (stream, " GNU General Public License for more details.\n");
6800 fprintf (stream, "\n");
6801 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6802 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6803 fprintf (stream, "\n");
6806 for (ch = 0; ch < 0x110000; ch++)
6808 int gbp = unicode_org_gbp[ch];
6809 const char *gbp_string;
/* Looks ahead while the next code point has the same property value;
   presumably the missing loop body advances ch so that one entry covers a
   whole run of equal values -- confirm.  */
6811 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
/* CASE(x) stores the identifier of x, turned into a string by #x, in
   gbp_string.  */
6816 #define CASE(x) case x: gbp_string = #x; break;
6823 CASE (GBP_SPACINGMARK)
6835 fprintf (stream, ",\n");
/* The printed code point is ch + 1, i.e. one past the current ch.  */
6836 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
6840 fprintf (stream, "\n");
6842 if (ferror (stream) || fclose (stream))
6844 fprintf (stderr, "error writing to '%s'\n", filename);
6849 /* Output the per-character grapheme break property table. */
/* Builds a gbp_table from unicode_org_gbp[] for all code points below
   0x110000 and writes it to FILENAME as C source text: header macros and an
   initialized static object named unigbrkprop.
   NOTE(review): the return type, braces, several declarations, the argument
   completing the "Generated automatically" fprintf and a number of short
   statements are missing from this excerpt.  */
6851 output_gbp_table (const char *filename, const char *version)
6856 unsigned int level1_offset, level2_offset, level3_offset;
6858 stream = fopen (filename, "w");
6861 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6865 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6866 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
6867 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6872 gbp_table_init (&t);
/* Every code point is added, including those whose value is GBP_OTHER.  */
6874 for (ch = 0; ch < 0x110000; ch++)
6875 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
6877 gbp_table_finalize (&t);
6879 /* Offsets in t.result, in memory of this process. */
/* NOTE(review): the left-hand sides of the three expressions below are
   missing from this excerpt; presumably level1_offset, level2_offset and
   level3_offset in that order -- confirm.  */
6881 5 * sizeof (uint32_t);
6883 5 * sizeof (uint32_t)
6884 + t.level1_size * sizeof (uint32_t);
6886 5 * sizeof (uint32_t)
6887 + t.level1_size * sizeof (uint32_t)
6888 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the 5 header words of t.result as preprocessor constants.  */
6890 for (i = 0; i < 5; i++)
6891 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
6892 ((uint32_t *) t.result)[i]);
/* Emit the type of the object.  level2 is declared as short, and level3 has
   half as many bytes as there are level 3 elements, because two elements
   are packed into each byte further below.  */
6893 fprintf (stream, "static const\n");
6894 fprintf (stream, "struct\n");
6895 fprintf (stream, " {\n");
6896 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6897 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6898 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
6899 t.level3_size, t.p);
6900 fprintf (stream, " }\n");
6901 fprintf (stream, "unigbrkprop =\n");
6902 fprintf (stream, "{\n");
/* Level 1: a line break is inserted before every 8th entry; an entry is
   printed relative to level2_offset, in units of uint32_t.  */
6903 fprintf (stream, " {");
6904 if (t.level1_size > 8)
6905 fprintf (stream, "\n ");
6906 for (i = 0; i < t.level1_size; i++)
6909 if (i > 0 && (i % 8) == 0)
6910 fprintf (stream, "\n ");
6911 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the condition that chooses between printing -1 and printing
   the index is not visible in this excerpt.  */
6913 fprintf (stream, " %5d", -1);
6915 fprintf (stream, " %5zu",
6916 (offset - level2_offset) / sizeof (uint32_t));
6917 if (i+1 < t.level1_size)
6918 fprintf (stream, ",");
6920 if (t.level1_size > 8)
6921 fprintf (stream, "\n ");
6922 fprintf (stream, " },\n");
/* Level 2: an entry is printed relative to level3_offset and halved, which
   matches the two-elements-per-byte packing of level 3.  */
6923 fprintf (stream, " {");
6924 if (t.level2_size << t.q > 8)
6925 fprintf (stream, "\n ");
6926 for (i = 0; i < t.level2_size << t.q; i++)
6929 if (i > 0 && (i % 8) == 0)
6930 fprintf (stream, "\n ");
6931 offset = ((uint32_t *) (t.result + level2_offset))[i];
6933 fprintf (stream, " %5d", -1);
6935 fprintf (stream, " %5zu",
6936 (offset - level3_offset) / sizeof (uint8_t) / 2);
6937 if (i+1 < t.level2_size << t.q)
6938 fprintf (stream, ",");
6940 if (t.level2_size << t.q > 8)
6941 fprintf (stream, "\n ");
6942 fprintf (stream, " },\n");
/* Level 3: two consecutive elements are combined into one output byte, the
   even-indexed element in the low 4 bits and the odd-indexed element in the
   high 4 bits.  */
6943 fprintf (stream, " {");
6944 if (t.level3_size << t.p > 8)
6945 fprintf (stream, "\n ");
6946 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
6948 unsigned char *p = (unsigned char *) (t.result + level3_offset);
6949 unsigned char value0 = p[i * 2];
6950 unsigned char value1 = p[i * 2 + 1];
6951 if (i > 0 && (i % 8) == 0)
6952 fprintf (stream, "\n ");
6953 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
6954 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
6956 if (t.level3_size << t.p > 8)
6957 fprintf (stream, "\n ");
6958 fprintf (stream, " }\n");
6959 fprintf (stream, "};\n");
6961 if (ferror (stream) || fclose (stream))
6963 fprintf (stderr, "error writing to '%s'\n", filename);
6968 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
6969 GraphemeBreakProperty.txt file. */
/* NOTE(review): the return type, braces, several declarations (among them
   buf, i, lineno, propvalue and stream), some PROP uses and the statements
   controlled by some of the tests below are missing from this excerpt.  */
6971 fill_org_gbp (const char *graphemebreakproperty_filename)
/* Every code point starts out as GBP_OTHER.  */
6977 for (i = 0; i < 0x110000; i++)
6978 unicode_org_gbp[i] = GBP_OTHER;
6980 stream = fopen (graphemebreakproperty_filename, "r");
6983 fprintf (stderr, "error during fopen of '%s'\n",
6984 graphemebreakproperty_filename);
6991 unsigned int i1, i2;
6992 char padding[200+1];
6993 char propname[200+1];
/* Read up to 200 characters of one line into buf; the trailing "\n" in the
   format also consumes any whitespace that follows.  A result below 1 means
   nothing was stored (end of input, or no match).  */
6997 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Test for empty lines and '#' comment lines; the statement this test
   controls is not visible here.  */
7000 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form "XXXX..YYYY", then the single code point form
   "XXXX"; in both, a run of spaces and semicolons separates the code
   point(s) from the property name, which extends to the next space.  */
7003 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7005 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7007 fprintf (stderr, "parse error in '%s'\n",
7008 graphemebreakproperty_filename);
/* PROP maps a property name to its GBP_* value; the trailing "else" chains
   the uses below into one if/else-if sequence.  */
7013 #define PROP(name,value) \
7014 if (strcmp (propname, name) == 0) propvalue = value; else
7017 PROP ("Control", GBP_CONTROL)
7018 PROP ("Extend", GBP_EXTEND)
7019 PROP ("Prepend", GBP_PREPEND)
7020 PROP ("SpacingMark", GBP_SPACINGMARK)
7025 PROP ("LVT", GBP_LVT)
/* Unlike the generic parse error above, this message also reports the line
   number.  */
7028 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
7029 graphemebreakproperty_filename, lineno);
/* Range sanity test: the range must be ascending and end below 0x110000.
   The statement it controls is not visible here.  */
7032 if (!(i1 <= i2 && i2 < 0x110000))
/* Store the value for every code point of the (inclusive) range.  */
7035 for (i = i1; i <= i2; i++)
7036 unicode_org_gbp[i] = propvalue;
7038 if (ferror (stream) || fclose (stream))
7040 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
7045 /* ========================================================================= */
7047 /* Maximum number of characters into which a single Unicode character can be decomposed. */
/* Capacity, in code points, of the buffer that receives a decomposition; see
   the decomposed[] parameter of get_decomposition().  */
7049 #define MAX_DECOMP_LENGTH 18
/* Decomposition types.  Each enumerator names the tag, written in angle
   brackets, that introduces a decomposition of that type; a decomposition
   without a tag is canonical.
   NOTE(review): the line that opens this enumeration is missing from this
   excerpt.  */
7053 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
7054 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
7055 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
7056 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
7057 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
7058 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
7059 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
7060 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
7061 UC_DECOMP_SUPER, /* <super> A superscript form. */
7062 UC_DECOMP_SUB, /* <sub> A subscript form. */
7063 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
7064 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
7065 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
7066 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
7067 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
7068 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
7069 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7072 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7073 decompositions). Return the type, or -1 for none. */
7075 get_decomposition (unsigned int ch,
7076 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7078 const char *decomposition = unicode_attributes[ch].decomposition;
7080 if (decomposition != NULL && decomposition[0] != '\0')
7082 int type = UC_DECOMP_CANONICAL;
7083 unsigned int length;
7086 if (decomposition[0] == '<')
7091 rangle = strchr (decomposition + 1, '>');
7094 typelen = rangle + 1 - decomposition;
7095 #define TYPE(t1,t2) \
7096 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7099 TYPE ("<font>", UC_DECOMP_FONT)
7100 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7101 TYPE ("<initial>", UC_DECOMP_INITIAL)
7102 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7103 TYPE ("<final>", UC_DECOMP_FINAL)
7104 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7105 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7106 TYPE ("<super>", UC_DECOMP_SUPER)
7107 TYPE ("<sub>", UC_DECOMP_SUB)
7108 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7109 TYPE ("<wide>", UC_DECOMP_WIDE)
7110 TYPE ("<narrow>", UC_DECOMP_NARROW)
7111 TYPE ("<small>", UC_DECOMP_SMALL)
7112 TYPE ("<square>", UC_DECOMP_SQUARE)
7113 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7114 TYPE ("<compat>", UC_DECOMP_COMPAT)
7116 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
7120 decomposition = rangle + 1;
7121 if (decomposition[0] == ' ')
7124 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7126 decomposed[length] = strtoul (decomposition, &endptr, 16);
7127 if (endptr == decomposition)
7129 decomposition = endptr;
7130 if (decomposition[0] == ' ')
7133 if (*decomposition != '\0')
7134 /* MAX_DECOMP_LENGTH is too small. */
7144 /* Construction of sparse 3-level tables. */
/* These macros select the table name (decomp_table), its element type
   (uint16_t) and its default element ((uint16_t)(-1)), and route
   xmalloc/xrealloc to malloc/realloc.
   NOTE(review): the code that consumes them is not visible in this excerpt;
   presumably a shared table template is included right after -- confirm.  */
7145 #define TABLE decomp_table
7146 #define ELEMENT uint16_t
7147 #define DEFAULT (uint16_t)(-1)
7148 #define xmalloc malloc
7149 #define xrealloc realloc
/* Prints the decomposition data as C source text.  The declarations
   (header macros, the decomp_index_table_t typedef, extern declarations) go
   to STREAM1; the byte array gl_uninorm_decomp_chars_table and the
   initialized gl_uninorm_decomp_index_table go to STREAM2.
   NOTE(review): the return type, braces, some declarations, the conditions
   selecting which characters are processed and several short statements are
   missing from this excerpt.  */
7153 output_decomposition (FILE *stream1, FILE *stream2)
7155 struct decomp_table t;
7156 unsigned int level1_offset, level2_offset, level3_offset;
7157 unsigned int offset;
7163 decomp_table_init (&t);
7165 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7166 fprintf (stream1, "\n");
7167 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
7170 for (ch = 0; ch < 0x110000; ch++)
7172 unsigned int length;
7173 unsigned int decomposed[MAX_DECOMP_LENGTH];
7174 int type = get_decomposition (ch, &length, decomposed);
/* The index entry keeps offset in its low 15 bits, so offset is tested
   against 1 << 15; the statement controlled by the test is not visible.
   Bit 15 of the entry is 0 for a canonical decomposition and 1 for any other
   type.  */
7178 if (!(offset < (1 << 15)))
7180 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7182 /* Produce length 3-bytes entries. */
7184 /* We would need a special representation of zero-length entries. */
7186 for (i = 0; i < length; i++)
7189 fprintf (stream2, ",");
7190 if ((offset % 4) == 0)
7191 fprintf (stream2, "\n ");
/* A code point must fit into 18 bits, because the bits above are used for
   the type and the continuation flag; the statement controlled by the test
   is not visible.  */
7192 if (!(decomposed[i] < (1 << 18)))
/* The first byte holds bits 16 and up of the 24-bit entry: bit 23 is set
   when another code point follows, the type is placed at bit 18 in the first
   entry only, and the rest are the high bits of the code point.  The second
   and third bytes are the middle and low bytes of the code point.  */
7194 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7195 (((i+1 < length ? (1 << 23) : 0)
7196 | (i == 0 ? (type << 18) : 0)
7197 | decomposed[i]) >> 16) & 0xff,
7198 (decomposed[i] >> 8) & 0xff,
7199 decomposed[i] & 0xff);
7205 fprintf (stream2, "\n};\n");
7206 fprintf (stream2, "\n");
7208 decomp_table_finalize (&t);
/* Byte offsets into t.result: a header of 5 uint32_t words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): the left-hand sides of these three expressions are missing
   from this excerpt; presumably level1_offset, level2_offset and
   level3_offset in that order -- confirm.  */
7211 5 * sizeof (uint32_t);
7213 5 * sizeof (uint32_t)
7214 + t.level1_size * sizeof (uint32_t);
7216 5 * sizeof (uint32_t)
7217 + t.level1_size * sizeof (uint32_t)
7218 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the 5 header words of t.result as preprocessor constants.  */
7220 for (i = 0; i < 5; i++)
7221 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7222 ((uint32_t *) t.result)[i]);
7223 fprintf (stream1, "\n");
/* Emit the type of the index table; the array sizes of level2 and level3 are
   written as shift expressions.  */
7224 fprintf (stream1, "typedef struct\n");
7225 fprintf (stream1, " {\n");
7226 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7227 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7228 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7229 fprintf (stream1, " }\n");
7230 fprintf (stream1, "decomp_index_table_t;\n");
7231 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
/* Emit the initializer.  Level 1: a line break is inserted before every 8th
   entry; an entry is printed relative to level2_offset, in units of
   uint32_t.  */
7232 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7233 fprintf (stream2, "{\n");
7234 fprintf (stream2, " {");
7235 if (t.level1_size > 8)
7236 fprintf (stream2, "\n ");
7237 for (i = 0; i < t.level1_size; i++)
7240 if (i > 0 && (i % 8) == 0)
7241 fprintf (stream2, "\n ");
7242 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the condition that chooses between printing -1 and printing
   the index is not visible in this excerpt.  */
7244 fprintf (stream2, " %5d", -1);
7246 fprintf (stream2, " %5zu",
7247 (offset - level2_offset) / sizeof (uint32_t));
7248 if (i+1 < t.level1_size)
7249 fprintf (stream2, ",");
7251 if (t.level1_size > 8)
7252 fprintf (stream2, "\n ");
7253 fprintf (stream2, " },\n");
/* Level 2: same layout; an entry is printed relative to level3_offset, in
   units of uint16_t.  */
7254 fprintf (stream2, " {");
7255 if (t.level2_size << t.q > 8)
7256 fprintf (stream2, "\n ");
7257 for (i = 0; i < t.level2_size << t.q; i++)
7260 if (i > 0 && (i % 8) == 0)
7261 fprintf (stream2, "\n ");
7262 offset = ((uint32_t *) (t.result + level2_offset))[i];
7264 fprintf (stream2, " %5d", -1);
7266 fprintf (stream2, " %5zu",
7267 (offset - level3_offset) / sizeof (uint16_t));
7268 if (i+1 < t.level2_size << t.q)
7269 fprintf (stream2, ",");
7271 if (t.level2_size << t.q > 8)
7272 fprintf (stream2, "\n ");
7273 fprintf (stream2, " },\n");
/* Level 3: the stored 16-bit values; the default element (uint16_t)(-1) is
   printed as -1.  */
7274 fprintf (stream2, " {");
7275 if (t.level3_size << t.p > 8)
7276 fprintf (stream2, "\n ");
7277 for (i = 0; i < t.level3_size << t.p; i++)
7279 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7280 if (i > 0 && (i % 8) == 0)
7281 fprintf (stream2, "\n ");
7282 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7283 if (i+1 < t.level3_size << t.p)
7284 fprintf (stream2, ",");
7286 if (t.level3_size << t.p > 8)
7287 fprintf (stream2, "\n ");
7288 fprintf (stream2, " }\n");
7289 fprintf (stream2, "};\n");
/* Creates the two decomposition table files FILENAME1 and FILENAME2.
   Both files receive the same "generated automatically" header mentioning
   the Unicode VERSION; the contents are then produced by
   output_decomposition, which gets the stream of FILENAME1 as its first and
   the stream of FILENAME2 as its second argument.
   Terminates the program if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t k;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open both files and emit their header right away.  Nothing else is
     written before the second file has been opened successfully.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
      fprintf (out, "/* Decomposition of Unicode characters. */\n");
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fprintf (out, "\n");
    }

  output_decomposition (streams[0], streams[1]);

  /* A pending write error or a failing close both count as failure.  */
  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) || fclose (streams[k]))
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file.
   One entry per code point in the range 0 .. 0x10FFFF.  An entry is 1 if the
   code point is listed in that file and 0 otherwise; the array is filled by
   fill_composition_exclusions.  */
char unicode_composition_exclusions[0x110000];
7339 fill_composition_exclusions (const char *compositionexclusions_filename)
7344 stream = fopen (compositionexclusions_filename, "r");
7347 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
7351 for (i = 0; i < 0x110000; i++)
7352 unicode_composition_exclusions[i] = 0;
7359 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7362 if (buf[0] == '\0' || buf[0] == '#')
7365 if (sscanf (buf, "%X", &i) != 1)
7367 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
7370 if (!(i < 0x110000))
7373 unicode_composition_exclusions[i] = 1;
7376 if (ferror (stream) || fclose (stream))
7378 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
7384 debug_output_composition_tables (const char *filename)
7389 stream = fopen (filename, "w");
7392 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7396 for (ch = 0; ch < 0x110000; ch++)
7398 unsigned int length;
7399 unsigned int decomposed[MAX_DECOMP_LENGTH];
7400 int type = get_decomposition (ch, &length, decomposed);
7402 if (type == UC_DECOMP_CANONICAL
7403 /* Consider only binary decompositions.
7404 Exclude singleton decompositions. */
7407 unsigned int code1 = decomposed[0];
7408 unsigned int code2 = decomposed[1];
7409 unsigned int combined = ch;
7411 /* Exclude decompositions where the first part is not a starter,
7412 i.e. is not of canonical combining class 0. */
7413 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7414 /* Exclude characters listed in CompositionExclusions.txt. */
7415 && !unicode_composition_exclusions[combined])
7417 /* The combined character must now also be a starter.
7419 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7422 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7426 unicode_attributes[code2].combining);
7431 if (ferror (stream) || fclose (stream))
7433 fprintf (stderr, "error writing to '%s'\n", filename);
7439 output_composition_tables (const char *filename, const char *version)
7444 stream = fopen (filename, "w");
7447 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7451 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7452 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7453 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7455 fprintf (stream, "\n");
7457 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7458 still carries the GPL header), and it's gnulib-tool which replaces the
7459 GPL header with an LGPL header. */
7460 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7461 fprintf (stream, "\n");
7462 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7463 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7464 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7465 fprintf (stream, " (at your option) any later version.\n");
7466 fprintf (stream, "\n");
7467 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7468 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7469 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7470 fprintf (stream, " GNU General Public License for more details.\n");
7471 fprintf (stream, "\n");
7472 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7473 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7474 fprintf (stream, "\n");
7476 /* The composition table is a set of mappings (code1, code2) -> combined,
7478 367 values for code1 (from 0x003C to 0x30FD),
7479 54 values for code2 (from 0x0300 to 0x309A).
7480 For a fixed code1, there are from 1 to 19 possible values for code2.
7481 For a fixed code2, there are from 1 to 117 possible values for code1.
7482 This is a very sparse matrix.
7484 We want an O(1) hash lookup.
7486 We could implement the hash lookup by mapping (code1, code2) to a linear
7487 combination mul1*code1 + mul2*code2, which is then used as an index into
7488 a 3-level table. But this leads to a table of size 37 KB.
7490 We use gperf to implement the hash lookup, giving it the 928 sets of
7491 4 bytes (code1, code2) as input. gperf generates a hash table of size
7492 1527, which is quite good (60% filled). It requires an auxiliary table
7493 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
7495 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7496 fprintf (stream, "%%struct-type\n");
7497 fprintf (stream, "%%language=ANSI-C\n");
7498 fprintf (stream, "%%define slot-name codes\n");
7499 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7500 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7501 fprintf (stream, "%%compare-lengths\n");
7502 fprintf (stream, "%%compare-strncmp\n");
7503 fprintf (stream, "%%readonly-tables\n");
7504 fprintf (stream, "%%omit-struct-type\n");
7505 fprintf (stream, "%%%%\n");
7507 for (ch = 0; ch < 0x110000; ch++)
7509 unsigned int length;
7510 unsigned int decomposed[MAX_DECOMP_LENGTH];
7511 int type = get_decomposition (ch, &length, decomposed);
7513 if (type == UC_DECOMP_CANONICAL
7514 /* Consider only binary decompositions.
7515 Exclude singleton decompositions. */
7518 unsigned int code1 = decomposed[0];
7519 unsigned int code2 = decomposed[1];
7520 unsigned int combined = ch;
7522 /* Exclude decompositions where the first part is not a starter,
7523 i.e. is not of canonical combining class 0. */
7524 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7525 /* Exclude characters listed in CompositionExclusions.txt. */
7526 && !unicode_composition_exclusions[combined])
7528 /* The combined character must now also be a starter.
7530 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7533 if (!(code1 < 0x10000))
7535 if (!(code2 < 0x10000))
7537 if (!(combined < 0x10000))
7540 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7541 (code1 >> 8) & 0xff, code1 & 0xff,
7542 (code2 >> 8) & 0xff, code2 & 0xff,
7548 if (ferror (stream) || fclose (stream))
7550 fprintf (stderr, "error writing to '%s'\n", filename);
7555 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the C file to create.  FUNCTION_NAME is the name of the
   function under test; it becomes the expansion of the MAP macro.  FUNC is
   the generator's own implementation of the mapping, and VERSION the
   Unicode version mentioned in the header comment.
   The body of the file is the list of all pairs { ch, FUNC (ch) } for which
   FUNC (ch) differs from ch, placed between the includes of
   "test-mapping-part1.h" and "test-mapping-part2.h".
   Terminates the program if the file cannot be opened or written.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  static const char * const header_lines[] =
    {
      "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
      "/* Test the Unicode character mapping functions.\n",
      " Copyright (C) 2009 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  FILE *stream;
  size_t k;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (k = 0; k < sizeof (header_lines) / sizeof (header_lines[0]); k++)
    fputs (header_lines[k], stream);
  /* Name the generator consistently with the other generated files.  */
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* List the non-identity mappings, separated by ",\n".  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = true;
        }
    }
  /* Terminate the last entry, if there was one.  */
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   These macros parameterize the generic table code: TABLE is the name of
   the table type (used below as 'struct mapping_table' together with the
   functions mapping_table_init, mapping_table_add, mapping_table_finalize)
   and ELEMENT is the type of the stored values.
   NOTE(review): the #include of the table template that consumes these
   macros is not visible in this excerpt -- confirm that it follows.  */
#define TABLE mapping_table
#define ELEMENT int32_t
/* Allocation requests of the table code go straight to the C library.  */
#define xmalloc malloc
#define xrealloc realloc
7634 /* Output a simple character mapping table to the given file. */
7637 output_simple_mapping (const char *filename,
7638 unsigned int (*func) (unsigned int),
7639 const char *version)
7643 struct mapping_table t;
7644 unsigned int level1_offset, level2_offset, level3_offset;
7646 stream = fopen (filename, "w");
7649 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7653 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7654 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7655 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7660 mapping_table_init (&t);
7662 for (ch = 0; ch < 0x110000; ch++)
7664 int value = (int) func (ch) - (int) ch;
7666 mapping_table_add (&t, ch, value);
7669 mapping_table_finalize (&t);
7671 /* Offsets in t.result, in memory of this process. */
7673 5 * sizeof (uint32_t);
7675 5 * sizeof (uint32_t)
7676 + t.level1_size * sizeof (uint32_t);
7678 5 * sizeof (uint32_t)
7679 + t.level1_size * sizeof (uint32_t)
7680 + (t.level2_size << t.q) * sizeof (uint32_t);
7682 for (i = 0; i < 5; i++)
7683 fprintf (stream, "#define mapping_header_%d %d\n", i,
7684 ((uint32_t *) t.result)[i]);
7685 fprintf (stream, "static const\n");
7686 fprintf (stream, "struct\n");
7687 fprintf (stream, " {\n");
7688 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7689 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7690 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7691 fprintf (stream, " }\n");
7692 fprintf (stream, "u_mapping =\n");
7693 fprintf (stream, "{\n");
7694 fprintf (stream, " {");
7695 if (t.level1_size > 8)
7696 fprintf (stream, "\n ");
7697 for (i = 0; i < t.level1_size; i++)
7700 if (i > 0 && (i % 8) == 0)
7701 fprintf (stream, "\n ");
7702 offset = ((uint32_t *) (t.result + level1_offset))[i];
7704 fprintf (stream, " %5d", -1);
7706 fprintf (stream, " %5zu",
7707 (offset - level2_offset) / sizeof (uint32_t));
7708 if (i+1 < t.level1_size)
7709 fprintf (stream, ",");
7711 if (t.level1_size > 8)
7712 fprintf (stream, "\n ");
7713 fprintf (stream, " },\n");
7714 fprintf (stream, " {");
7715 if (t.level2_size << t.q > 8)
7716 fprintf (stream, "\n ");
7717 for (i = 0; i < t.level2_size << t.q; i++)
7720 if (i > 0 && (i % 8) == 0)
7721 fprintf (stream, "\n ");
7722 offset = ((uint32_t *) (t.result + level2_offset))[i];
7724 fprintf (stream, " %5d", -1);
7726 fprintf (stream, " %5zu",
7727 (offset - level3_offset) / sizeof (int32_t));
7728 if (i+1 < t.level2_size << t.q)
7729 fprintf (stream, ",");
7731 if (t.level2_size << t.q > 8)
7732 fprintf (stream, "\n ");
7733 fprintf (stream, " },\n");
7734 fprintf (stream, " {");
7735 if (t.level3_size << t.p > 8)
7736 fprintf (stream, "\n ");
7737 for (i = 0; i < t.level3_size << t.p; i++)
7739 if (i > 0 && (i % 8) == 0)
7740 fprintf (stream, "\n ");
7741 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7742 if (i+1 < t.level3_size << t.p)
7743 fprintf (stream, ",");
7745 if (t.level3_size << t.p > 8)
7746 fprintf (stream, "\n ");
7747 fprintf (stream, " }\n");
7748 fprintf (stream, "};\n");
7750 if (ferror (stream) || fclose (stream))
7752 fprintf (stderr, "error writing to '%s'\n", filename);
7757 /* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x.
   SCC_ALWAYS stands for "no context condition"; the other values name the
   conditions that can appear in the context field of SpecialCasing.txt.  */
enum
{
  SCC_ALWAYS,
  SCC_FINAL_SIGMA,
  SCC_AFTER_SOFT_DOTTED,
  SCC_MORE_ABOVE,
  SCC_BEFORE_DOT,
  SCC_AFTER_I
};
/* A special casing rule.
   It gives, for the character CODE, the lower case, title case, upper case
   and case folding mappings.  Each mapping consists of up to 3 characters;
   unused trailing elements are 0.  */
struct special_casing_rule
{
  /* The character the rule applies to.  */
  unsigned int code;
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* A two-letter language code, or NULL if the rule is not restricted to
     a language.  */
  const char *language;
  /* One of the SCC_* values, or the negative of one.  */
  int context;
};
/* The special casing rules.
   casing_rules is a dynamically growing array with room for
   allocated_casing_rules pointers, of which the first num_casing_rules are
   in use.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules.
   The array is grown by doubling its capacity, starting at 16 elements.
   Only the pointer is stored; the rule itself is not copied.
   Terminates the program if no memory is available.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Assign only after the check, so that a failing realloc is never
         dereferenced.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "memory exhausted\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
7803 /* Stores in casing_rules the special casing rules found in
7804 specialcasing_filename. */
7806 fill_casing_rules (const char *specialcasing_filename)
7810 stream = fopen (specialcasing_filename, "r");
7813 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
7817 casing_rules = NULL;
7818 num_casing_rules = 0;
7819 allocated_casing_rules = 0;
7829 unsigned int lower_mapping[3];
7830 unsigned int title_mapping[3];
7831 unsigned int upper_mapping[3];
7835 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7838 if (buf[0] == '\0' || buf[0] == '#')
7843 code = strtoul (scanptr, &endptr, 16);
7844 if (endptr == scanptr)
7846 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7850 if (*scanptr != ';')
7852 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7857 /* Scan lower mapping. */
7858 for (i = 0; i < 3; i++)
7859 lower_mapping[i] = 0;
7860 for (i = 0; i < 3; i++)
7862 while (*scanptr == ' ')
7864 if (*scanptr == ';')
7866 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
7867 if (endptr == scanptr)
7869 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7874 if (*scanptr != ';')
7876 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7881 /* Scan title mapping. */
7882 for (i = 0; i < 3; i++)
7883 title_mapping[i] = 0;
7884 for (i = 0; i < 3; i++)
7886 while (*scanptr == ' ')
7888 if (*scanptr == ';')
7890 title_mapping[i] = strtoul (scanptr, &endptr, 16);
7891 if (endptr == scanptr)
7893 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7898 if (*scanptr != ';')
7900 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7905 /* Scan upper mapping. */
7906 for (i = 0; i < 3; i++)
7907 upper_mapping[i] = 0;
7908 for (i = 0; i < 3; i++)
7910 while (*scanptr == ' ')
7912 if (*scanptr == ';')
7914 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
7915 if (endptr == scanptr)
7917 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7922 if (*scanptr != ';')
7924 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7929 /* Scan language and context. */
7931 context = SCC_ALWAYS;
7932 while (*scanptr == ' ')
7934 if (*scanptr != '\0' && *scanptr != '#')
7936 const char *word_begin = scanptr;
7937 const char *word_end;
7939 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7943 while (*scanptr == ' ')
7946 if (word_end - word_begin == 2)
7948 language = (char *) malloc ((word_end - word_begin) + 1);
7949 memcpy (language, word_begin, 2);
7950 language[word_end - word_begin] = '\0';
7951 word_begin = word_end = NULL;
7953 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7955 word_begin = scanptr;
7956 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7962 if (word_end > word_begin)
7964 bool negate = false;
7966 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
7971 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
7972 context = SCC_FINAL_SIGMA;
7973 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
7974 context = SCC_AFTER_SOFT_DOTTED;
7975 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
7976 context = SCC_MORE_ABOVE;
7977 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
7978 context = SCC_BEFORE_DOT;
7979 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
7980 context = SCC_AFTER_I;
7983 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
7987 context = - context;
7990 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7992 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7997 /* Store the rule. */
7999 struct special_casing_rule *new_rule =
8000 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8001 new_rule->code = code;
8002 new_rule->language = language;
8003 new_rule->context = context;
8004 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
8005 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
8006 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
8008 add_casing_rule (new_rule);
8012 if (ferror (stream) || fclose (stream))
8014 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.
   It maps the character CODE to a sequence of up to 3 characters; unused
   trailing elements of MAPPING are 0.  */
struct casefold_rule
{
  unsigned int code;
  unsigned int mapping[3];
  /* A two-letter language code, or NULL if the rule is not restricted to
     a language.  */
  const char *language;
};
/* The casefolding rules.
   casefolding_rules is a dynamically growing array with room for
   allocated_casefolding_rules pointers, of which the first
   num_casefolding_rules are in use.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
8032 /* Stores in casefolding_rules the case folding rules found in
8033 casefolding_filename. */
8035 fill_casefolding_rules (const char *casefolding_filename)
8039 stream = fopen (casefolding_filename, "r");
8042 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
8046 casefolding_rules = NULL;
8047 num_casefolding_rules = 0;
8048 allocated_casefolding_rules = 0;
8059 unsigned int mapping[3];
8061 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8064 if (buf[0] == '\0' || buf[0] == '#')
8069 code = strtoul (scanptr, &endptr, 16);
8070 if (endptr == scanptr)
8072 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8076 if (*scanptr != ';')
8078 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8084 while (*scanptr == ' ')
8089 case 'C': case 'F': case 'S': case 'T':
8093 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8097 if (*scanptr != ';')
8099 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8104 /* Scan casefold mapping. */
8105 for (i = 0; i < 3; i++)
8107 for (i = 0; i < 3; i++)
8109 while (*scanptr == ' ')
8111 if (*scanptr == ';')
8113 mapping[i] = strtoul (scanptr, &endptr, 16);
8114 if (endptr == scanptr)
8116 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8121 if (*scanptr != ';')
8123 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8128 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
8131 const char * const *languages;
8132 unsigned int languages_count;
8134 /* Type 'T' indicates that the rule is applicable to Turkish
8138 static const char * const turkish_languages[] = { "tr", "az" };
8139 languages = turkish_languages;
8140 languages_count = 2;
8144 static const char * const all_languages[] = { NULL };
8145 languages = all_languages;
8146 languages_count = 1;
8149 for (i = 0; i < languages_count; i++)
8151 /* Store a new rule. */
8152 struct casefold_rule *new_rule =
8153 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
8154 new_rule->code = code;
8155 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
8156 new_rule->language = languages[i];
8158 if (num_casefolding_rules == allocated_casefolding_rules)
8160 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
8161 if (allocated_casefolding_rules < 16)
8162 allocated_casefolding_rules = 16;
8164 (struct casefold_rule **)
8165 realloc (casefolding_rules,
8166 allocated_casefolding_rules * sizeof (struct casefold_rule *));
8168 casefolding_rules[num_casefolding_rules++] = new_rule;
8173 if (ferror (stream) || fclose (stream))
8175 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   One entry per code point in the range 0 .. 0x10FFFF.  */
unsigned int unicode_casefold[0x110000];

/* Returns the entry of unicode_casefold[] for CH.
   CH must be less than 0x110000; no range check is done here.  */
static unsigned int
to_casefold (unsigned int ch)
{
  const unsigned int *entry = unicode_casefold + ch;

  return *entry;
}
8189 /* Redistribute the casefolding_rules:
8190 - Rules that map to a single character, language independently, are stored
8191 in unicode_casefold.
8192 - Other rules are merged into casing_rules. */
8194 redistribute_casefolding_rules (void)
8196 unsigned int ch, i, j;
8198 /* Fill unicode_casefold[]. */
8199 for (ch = 0; ch < 0x110000; ch++)
8200 unicode_casefold[ch] = ch;
8201 for (i = 0; i < num_casefolding_rules; i++)
8203 struct casefold_rule *cfrule = casefolding_rules[i];
8205 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8208 if (!(ch < 0x110000))
8210 unicode_casefold[ch] = cfrule->mapping[0];
8214 /* Extend the special casing rules by filling in their casefold_mapping[]
8216 for (j = 0; j < num_casing_rules; j++)
8218 struct special_casing_rule *rule = casing_rules[j];
8221 rule->casefold_mapping[0] = to_casefold (rule->code);
8222 for (k = 1; k < 3; k++)
8223 rule->casefold_mapping[k] = 0;
8226 /* Now merge the other casefolding rules into casing_rules. */
8227 for (i = 0; i < num_casefolding_rules; i++)
8229 struct casefold_rule *cfrule = casefolding_rules[i];
8231 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8233 /* Find a rule that applies to the same code, same language, and it
8234 has context SCC_ALWAYS. At the same time, update all rules that
8235 have the same code and same or more specific language. */
8236 struct special_casing_rule *found_rule = NULL;
8238 for (j = 0; j < num_casing_rules; j++)
8240 struct special_casing_rule *rule = casing_rules[j];
8242 if (rule->code == cfrule->code
8243 && (cfrule->language == NULL
8244 || (rule->language != NULL
8245 && strcmp (rule->language, cfrule->language) == 0)))
8247 memcpy (rule->casefold_mapping, cfrule->mapping,
8248 sizeof (rule->casefold_mapping));
8250 if ((cfrule->language == NULL
8251 ? rule->language == NULL
8252 : rule->language != NULL
8253 && strcmp (rule->language, cfrule->language) == 0)
8254 && rule->context == SCC_ALWAYS)
8262 if (found_rule == NULL)
8264 /* Create a new rule. */
8265 struct special_casing_rule *new_rule =
8266 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8268 /* Try to find a rule that applies to the same code, no language
8269 restriction, and with context SCC_ALWAYS. */
8270 for (j = 0; j < num_casing_rules; j++)
8272 struct special_casing_rule *rule = casing_rules[j];
8274 if (rule->code == cfrule->code
8275 && rule->context == SCC_ALWAYS
8276 && rule->language == NULL)
8284 new_rule->code = cfrule->code;
8285 new_rule->language = cfrule->language;
8286 new_rule->context = SCC_ALWAYS;
8287 if (found_rule != NULL)
8289 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8290 sizeof (new_rule->lower_mapping));
8291 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8292 sizeof (new_rule->title_mapping));
8293 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8294 sizeof (new_rule->upper_mapping));
8300 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8301 for (k = 1; k < 3; k++)
8302 new_rule->lower_mapping[k] = 0;
8303 new_rule->title_mapping[0] = to_title (cfrule->code);
8304 for (k = 1; k < 3; k++)
8305 new_rule->title_mapping[k] = 0;
8306 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8307 for (k = 1; k < 3; k++)
8308 new_rule->upper_mapping[k] = 0;
8310 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8311 sizeof (new_rule->casefold_mapping));
8313 add_casing_rule (new_rule);
8320 compare_casing_rules (const void *a, const void *b)
8322 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8323 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8324 unsigned int a_code = a_rule->code;
8325 unsigned int b_code = b_rule->code;
8327 if (a_code < b_code)
8329 if (a_code > b_code)
8332 /* Sort the more specific rules before the more general ones. */
8333 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8334 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8338 sort_casing_rules (void)
8340 /* Sort the rules 1. by code, 2. by specificity. */
8341 if (num_casing_rules > 1)
8342 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8343 compare_casing_rules);
8346 /* Output the special casing rules. */
8348 output_casing_rules (const char *filename, const char *version)
8354 stream = fopen (filename, "w");
8357 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8361 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8362 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8363 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8365 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8366 fprintf (stream, "%%struct-type\n");
8367 fprintf (stream, "%%language=ANSI-C\n");
8368 fprintf (stream, "%%define slot-name code\n");
8369 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8370 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8371 fprintf (stream, "%%compare-lengths\n");
8372 fprintf (stream, "%%compare-strncmp\n");
8373 fprintf (stream, "%%readonly-tables\n");
8374 fprintf (stream, "%%omit-struct-type\n");
8375 fprintf (stream, "%%%%\n");
8378 for (i = 0; i < num_casing_rules; i++)
8380 struct special_casing_rule *rule = casing_rules[i];
8383 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8388 if (!(rule->code < 0x10000))
8390 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8394 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8395 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8397 fprintf (stream, "%d, ",
8398 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8400 context = rule->context;
8403 fprintf (stream, "-");
8404 context = - context;
8407 fprintf (stream, " ");
8411 fprintf (stream, "SCC_ALWAYS ");
8413 case SCC_FINAL_SIGMA:
8414 fprintf (stream, "SCC_FINAL_SIGMA ");
8416 case SCC_AFTER_SOFT_DOTTED:
8417 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8419 case SCC_MORE_ABOVE:
8420 fprintf (stream, "SCC_MORE_ABOVE ");
8422 case SCC_BEFORE_DOT:
8423 fprintf (stream, "SCC_BEFORE_DOT ");
8426 fprintf (stream, "SCC_AFTER_I ");
8431 fprintf (stream, ", ");
8433 if (rule->language != NULL)
8435 if (strlen (rule->language) != 2)
8437 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8440 fprintf (stream, "{ '\\0', '\\0' }, ");
8442 fprintf (stream, "{ ");
8443 for (j = 0; j < 3; j++)
8446 fprintf (stream, ", ");
8447 if (!(rule->upper_mapping[j] < 0x10000))
8449 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8452 if (rule->upper_mapping[j] != 0)
8453 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8455 fprintf (stream, " 0");
8457 fprintf (stream, " }, { ");
8458 for (j = 0; j < 3; j++)
8461 fprintf (stream, ", ");
8462 if (!(rule->lower_mapping[j] < 0x10000))
8464 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8467 if (rule->lower_mapping[j] != 0)
8468 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8470 fprintf (stream, " 0");
8472 fprintf (stream, " }, { ");
8473 for (j = 0; j < 3; j++)
8476 fprintf (stream, ", ");
8477 if (!(rule->title_mapping[j] < 0x10000))
8479 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8482 if (rule->title_mapping[j] != 0)
8483 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8485 fprintf (stream, " 0");
8487 fprintf (stream, " }, { ");
8488 for (j = 0; j < 3; j++)
8491 fprintf (stream, ", ");
8492 if (!(rule->casefold_mapping[j] < 0x10000))
8494 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8497 if (rule->casefold_mapping[j] != 0)
8498 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8500 fprintf (stream, " 0");
8502 fprintf (stream, " }\n");
8505 if (ferror (stream) || fclose (stream))
8507 fprintf (stderr, "error writing to '%s'\n", filename);
8512 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  /* General category Lt is Titlecase_Letter.  */
  return is_category_Lt (ch);
}
8526 /* Quoting the Unicode standard:
8527 Definition: A character is defined to be "case-ignorable" if it has the
8528 value MidLetter {or the value MidNumLet} for the Word_Break property or
8529 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8530 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8531 The text marked in braces was added in Unicode 5.1.0, see
8532 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8533 Definition of case-ignorable". */
8534 /* Since this predicate is only used for the "Before C" and "After C"
8535 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8536 This simplifies the evaluation of the regular expressions
8537 \p{cased} (\p{case-ignorable})* C
8539 C (\p{case-ignorable})* \p{cased}
8542 is_case_ignorable (unsigned int ch)
8544 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8545 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8546 || is_category_Mn (ch)
8547 || is_category_Me (ch)
8548 || is_category_Cf (ch)
8549 || is_category_Lm (ch)
8550 || is_category_Sk (ch))
8554 /* ------------------------------------------------------------------------- */
8556 /* Output all case related properties. */
8558 output_casing_properties (const char *version)
8560 #define PROPERTY(FN,P) \
8561 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
8562 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
8563 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
8564 PROPERTY(cased, cased)
8565 PROPERTY(ignorable, case_ignorable)
8569 /* ========================================================================= */
8572 main (int argc, char * argv[])
8574 const char *unicodedata_filename;
8575 const char *proplist_filename;
8576 const char *derivedproplist_filename;
8577 const char *scripts_filename;
8578 const char *blocks_filename;
8579 const char *proplist30_filename;
8580 const char *eastasianwidth_filename;
8581 const char *linebreak_filename;
8582 const char *wordbreakproperty_filename;
8583 const char *graphemebreakproperty_filename;
8584 const char *compositionexclusions_filename;
8585 const char *specialcasing_filename;
8586 const char *casefolding_filename;
8587 const char *version;
8591 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
8596 unicodedata_filename = argv[1];
8597 proplist_filename = argv[2];
8598 derivedproplist_filename = argv[3];
8599 scripts_filename = argv[4];
8600 blocks_filename = argv[5];
8601 proplist30_filename = argv[6];
8602 eastasianwidth_filename = argv[7];
8603 linebreak_filename = argv[8];
8604 wordbreakproperty_filename = argv[9];
8605 graphemebreakproperty_filename = argv[10];
8606 compositionexclusions_filename = argv[11];
8607 specialcasing_filename = argv[12];
8608 casefolding_filename = argv[13];
8611 fill_attributes (unicodedata_filename);
8612 clear_properties ();
8613 fill_properties (proplist_filename);
8614 fill_properties (derivedproplist_filename);
8615 fill_properties30 (proplist30_filename);
8616 fill_scripts (scripts_filename);
8617 fill_blocks (blocks_filename);
8618 fill_width (eastasianwidth_filename);
8619 fill_org_lbp (linebreak_filename);
8620 fill_org_wbp (wordbreakproperty_filename);
8621 fill_org_gbp (graphemebreakproperty_filename);
8622 fill_composition_exclusions (compositionexclusions_filename);
8623 fill_casing_rules (specialcasing_filename);
8624 fill_casefolding_rules (casefolding_filename);
8625 redistribute_casefolding_rules ();
8626 sort_casing_rules ();
8628 output_categories (version);
8629 output_category ("unictype/categ_of.h", version);
8630 output_combclass ("unictype/combining.h", version);
8631 output_bidi_category ("unictype/bidi_of.h", version);
8632 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
8633 output_decimal_digit ("unictype/decdigit.h", version);
8634 output_digit_test ("../tests/unictype/test-digit.h", version);
8635 output_digit ("unictype/digit.h", version);
8636 output_numeric_test ("../tests/unictype/test-numeric.h", version);
8637 output_numeric ("unictype/numeric.h", version);
8638 output_mirror ("unictype/mirror.h", version);
8639 output_properties (version);
8640 output_scripts (version);
8641 output_scripts_byname (version);
8642 output_blocks (version);
8643 output_ident_properties (version);
8644 output_old_ctype (version);
8646 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
8647 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
8648 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
8650 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
8651 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
8652 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
8654 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
8655 output_gbp_table ("unigbrk/gbrkprop.h", version);
8657 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
8658 debug_output_composition_tables ("uninorm/composition.txt");
8659 output_composition_tables ("uninorm/composition-table.gperf", version);
8661 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
8662 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
8663 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
8664 output_simple_mapping ("unicase/toupper.h", to_upper, version);
8665 output_simple_mapping ("unicase/tolower.h", to_lower, version);
8666 output_simple_mapping ("unicase/totitle.h", to_title, version);
8667 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
8668 output_casing_rules ("unicase/special-casing-table.gperf", version);
8669 output_casing_properties (version);
/*
 * For Emacs M-x compile
 * Local Variables:
 * compile-command: "
   gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
   ./gen-uni-tables \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
        5.1.0 \
   && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
   && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
   "
 * End:
 */