/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      <unicode-version>
   NOTE(review): the final argument was lost from this copy; a Unicode version
   string is presumed because the output functions take one - confirm.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ========================================================================= */

/* Reading UnicodeData.txt.  */
/* This structure represents one line in the UnicodeData.txt file.
   A NULL name marks a code point for which no line was read.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   the numeric case mappings (upper, lower, title).  */
#define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the #define line was lost from this copy; 120 is restored
   from memory of the upstream source - confirm.  */
#ifndef FIELDLEN
#define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.
   Returns 1 when a field was successfully read, otherwise 0.
   Terminates the program if the field does not fit into BUFFER.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  /* Hitting end of file before DELIM means there was no complete field.  */
  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* ========================================================================= */

/* General category.  */
/* See Unicode 3.0 book, section 4.5.  */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH is a surrogate code point (0xD800 .. 0xDFFF).
   This is decided by range alone, without consulting unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points for
   which PREDICATE is true: "0xFIRST..0xLAST" for a run of several, "0xCH"
   for a single one.  Terminates the program on an I/O error.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output.  */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C fragment: a license header, an include of
   "test-predicate-part1.h", one "{ first, last }" initializer per maximal run
   of code points satisfying PREDICATE, a definition of PREDICATE(c) as
   EXPRESSION, and an include of "test-predicate-part2.h".
   Terminates the program on an I/O error.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* The separator is emitted before every interval except the first, so the
     list never ends in a dangling comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
669 /* Construction of sparse 3-level tables. */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
693 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
698 predicate_table_init (&t);
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
799 output_categories (const char *version)
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Each two-letter category owns one
   bit; each one-letter category is the union of its sub-categories' bits.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
887 general_category_byname (const char *category_name)
889 if (category_name[0] != '\0'
890 && (category_name[1] == '\0' || category_name[2] == '\0'))
891 switch (category_name[0])
894 switch (category_name[1])
896 case '\0': return UC_CATEGORY_MASK_L;
897 case 'u': return UC_CATEGORY_MASK_Lu;
898 case 'l': return UC_CATEGORY_MASK_Ll;
899 case 't': return UC_CATEGORY_MASK_Lt;
900 case 'm': return UC_CATEGORY_MASK_Lm;
901 case 'o': return UC_CATEGORY_MASK_Lo;
905 switch (category_name[1])
907 case '\0': return UC_CATEGORY_MASK_M;
908 case 'n': return UC_CATEGORY_MASK_Mn;
909 case 'c': return UC_CATEGORY_MASK_Mc;
910 case 'e': return UC_CATEGORY_MASK_Me;
914 switch (category_name[1])
916 case '\0': return UC_CATEGORY_MASK_N;
917 case 'd': return UC_CATEGORY_MASK_Nd;
918 case 'l': return UC_CATEGORY_MASK_Nl;
919 case 'o': return UC_CATEGORY_MASK_No;
923 switch (category_name[1])
925 case '\0': return UC_CATEGORY_MASK_P;
926 case 'c': return UC_CATEGORY_MASK_Pc;
927 case 'd': return UC_CATEGORY_MASK_Pd;
928 case 's': return UC_CATEGORY_MASK_Ps;
929 case 'e': return UC_CATEGORY_MASK_Pe;
930 case 'i': return UC_CATEGORY_MASK_Pi;
931 case 'f': return UC_CATEGORY_MASK_Pf;
932 case 'o': return UC_CATEGORY_MASK_Po;
936 switch (category_name[1])
938 case '\0': return UC_CATEGORY_MASK_S;
939 case 'm': return UC_CATEGORY_MASK_Sm;
940 case 'c': return UC_CATEGORY_MASK_Sc;
941 case 'k': return UC_CATEGORY_MASK_Sk;
942 case 'o': return UC_CATEGORY_MASK_So;
946 switch (category_name[1])
948 case '\0': return UC_CATEGORY_MASK_Z;
949 case 's': return UC_CATEGORY_MASK_Zs;
950 case 'l': return UC_CATEGORY_MASK_Zl;
951 case 'p': return UC_CATEGORY_MASK_Zp;
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_C;
958 case 'c': return UC_CATEGORY_MASK_Cc;
959 case 'f': return UC_CATEGORY_MASK_Cf;
960 case 's': return UC_CATEGORY_MASK_Cs;
961 case 'o': return UC_CATEGORY_MASK_Co;
962 case 'n': return UC_CATEGORY_MASK_Cn;
966 /* Invalid category name. */
970 /* Construction of sparse 3-level tables. */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
997 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1002 category_table_init (&t);
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
1017 if (value == 0 || ((value & (value - 1)) != 0))
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1096 for (i = 0; i < t.level3_size << t.p; i++)
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* Canonical combining class.  */
/* See Unicode 3.0 book, section 4.2.  */
1135 /* Construction of sparse 3-level tables. */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1161 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1166 combclass_table_init (&t);
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
/* Bidirectional category.  */
/* See Unicode 3.0 book, section 4.3.  */

/* Bidi classes, numbered consecutively from 0 in declaration order.
   Each enumerator is UC_BIDI_ followed by the class abbreviation that
   bidi_category_byname accepts.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};

/* Returns the UC_BIDI_* value for the bidi class abbreviation
   CATEGORY_NAME ("L", "AL", "NSM", ...).  The match is exact and
   case-sensitive; CATEGORY_NAME must be a NUL-terminated string.
   An unknown name is a fatal error: the process is aborted, so no
   out-of-range value is ever returned.  */
static int
bidi_category_byname (const char *category_name)
{
  /* One row per accepted abbreviation.  This table replaces the
     character-by-character switch ladder and accepts the same names.  */
  static const struct { const char *name; int value; } known[] =
    {
      { "AL",  UC_BIDI_AL },
      { "AN",  UC_BIDI_AN },
      { "B",   UC_BIDI_B },
      { "BN",  UC_BIDI_BN },
      { "CS",  UC_BIDI_CS },
      { "EN",  UC_BIDI_EN },
      { "ES",  UC_BIDI_ES },
      { "ET",  UC_BIDI_ET },
      { "L",   UC_BIDI_L },
      { "LRE", UC_BIDI_LRE },
      { "LRO", UC_BIDI_LRO },
      { "NSM", UC_BIDI_NSM },
      { "ON",  UC_BIDI_ON },
      { "PDF", UC_BIDI_PDF },
      { "R",   UC_BIDI_R },
      { "RLE", UC_BIDI_RLE },
      { "RLO", UC_BIDI_RLO },
      { "S",   UC_BIDI_S },
      { "WS",  UC_BIDI_WS }
    };
  unsigned int i;

  for (i = 0; i < sizeof (known) / sizeof (known[0]); i++)
    if (strcmp (category_name, known[i].name) == 0)
      return known[i].value;

  /* Invalid bidi category name.  */
  abort ();
}
/* Returns the bidi category of code point CH.
   A code point that has an entry in unicode_attributes[] (name != NULL)
   gets the category named by its bidi field, translated by
   bidi_category_byname.  Any other code point is classified by range.
   NOTE(review): this listing keeps only part of the function.  The
   statements selected by each range test below and the final fallback
   are absent (the embedded line numbers skip 1457, 1462 and 1468 on).
   Restore them from the pristine file before relying on this text.  */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges; the category assigned to it is not visible
   here.  */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges.  */
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group.  The two mask tests match the last two code points
   (xxFFFE and xxFFFF) of every 64K plane.  */
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1474 /* Construction of sparse 3-level tables. */
/* Parameters for the table template.  TABLE supplies the name of
   struct bidi_category_table and of the bidi_category_table_init, _add
   and _finalize functions used by output_bidi_category.  ELEMENT and
   DEFAULT are presumably the cell type and the value of cells that were
   never added -- confirm against the template.
   xmalloc and xrealloc are mapped straight onto malloc and realloc.
   NOTE(review): no include of a template header is visible after these
   definitions in this listing; presumably that line was dropped.  */
1475 #define TABLE bidi_category_table
1476 #define ELEMENT uint8_t
1477 #define DEFAULT UC_BIDI_L
1478 #define xmalloc malloc
1479 #define xrealloc realloc
1482 /* Output the per-character bidi category table. */
/* Writes FILENAME as a generated C fragment holding the bidi category of
   every code point 0..0x10FFFF as a 3-level table named u_bidi_category.
   VERSION is the Unicode version named in the banner.
   Steps: fill a bidi_category_table from get_bidi_category, finalize it,
   emit the five header words as #define lines, then dump level1, level2
   and a bit-packed level3 (5 bits per code point, 16-bit units).
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, else, short declarations and
   the statements after each error message).  Restore them from the
   pristine file before compiling.
   NOTE(review): the emitted banner names gen-ctype.c while the usage text
   at the top of this file invokes gen-uni-tables -- confirm which name is
   intended.  */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
/* One table cell per code point.  */
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
1517 /* Offsets in t.result, in memory of this process. */
/* Layout of t.result as used below: 5 header words, then level1
   (level1_size words), then level2 (level2_size << q words), then
   level3.  The three expressions are the byte offsets of level1, level2
   and level3; their assignment targets are not in this listing.  */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the declared level3 length, level3_size * ((1 << p) * 5
   / 16) + 1, equals the number of units written further down,
   (level3_size << p) * 5 / 16 + 1, only when (1 << p) * 5 is a multiple
   of 16, that is for p >= 4.  The value of p is not visible here.  */
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
/* Level 1, eight entries per output line.  A stored byte offset is
   rebased to level2_offset and divided by the entry size, giving an
   index into level2.  The other branch emits a literal -1; the test
   choosing between the two is not in this listing.  */
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
/* Level 2, same scheme.  The emitted value is the index of the first
   unpacked level3 cell of the block.  */
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc result is dereferenced in the loop right
   below without a null check.  */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Cell i occupies bits i*5 .. i*5+4 of the packed stream: j is the
   16-bit unit holding its first bit, k the bit position inside that
   unit.  A cell may straddle two units, so units j and j+1 are merged
   into one 32-bit window, updated, and split again.  */
1586 for (i = 0; i < t.level3_size << t.p; i++)
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
/* Dump the packed units in hexadecimal, eight per output line.  */
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
/* A stream error or a failing fclose both count as a write failure.  */
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of code point CH: the decdigit field
   of its unicode_attributes[] entry converted with atoi, provided the
   entry exists (name != NULL) and the field is not empty.
   NOTE(review): the fallback return for every other code point is not
   part of this listing.  The callers accept -1 as the smallest value,
   so it is presumably -1 -- confirm against the pristine file.
   NOTE(review): atoi reports no conversion errors; a malformed field
   silently yields 0.  */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
1633 /* Construction of sparse 3-level tables. */
/* Parameters for the table template.  TABLE supplies the name of
   struct decdigit_table and of the decdigit_table_init, _add and
   _finalize functions, which both output_decimal_digit and output_digit
   use.  ELEMENT is presumably the type of one table cell.
   xmalloc and xrealloc are mapped straight onto malloc and realloc.
   NOTE(review): the embedded line numbers skip 1636 and 1639; a DEFAULT
   definition and the include of the template are presumably what was
   dropped -- confirm against the pristine file.  */
1634 #define TABLE decdigit_table
1635 #define ELEMENT uint8_t
1637 #define xmalloc malloc
1638 #define xrealloc realloc
1641 /* Output the unit test for the per-character decimal digit value table. */
/* Writes FILENAME as a generated C fragment: a banner naming VERSION,
   then rows of the form { 0xXXXX, value } separated by a comma and a
   newline, with values taken from get_decdigit_value.
   Every value is first checked to lie in -1..9.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): the lines deciding which code points get a row, and the
   action taken when the range check fails, are absent from this listing
   (stale line-number prefixes, short lines dropped).  */
1643 output_decimal_digit_test (const char *filename, const char *version)
1649 stream = fopen (filename, "w");
1652 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1656 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1657 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1658 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1662 for (ch = 0; ch < 0x110000; ch++)
1664 int value = get_decdigit_value (ch);
/* Sanity check: a decimal digit value is -1 or a single digit.  */
1666 if (!(value >= -1 && value < 10))
/* Separator between consecutive rows, then the row itself.  */
1672 fprintf (stream, ",\n");
1673 fprintf (stream, " { 0x%04X, %d }", ch, value);
1678 fprintf (stream, "\n");
1680 if (ferror (stream) || fclose (stream))
1682 fprintf (stderr, "error writing to '%s'\n", filename);
1687 /* Output the per-character decimal digit value table. */
/* Writes FILENAME as a generated C fragment holding a 3-level table
   named u_decdigit.  VERSION is the Unicode version named in the banner.
   Each code point 0..0x10FFFF is stored as 1 + get_decdigit_value, which
   must lie in 0..10.  Level3 is emitted packed, two 4-bit cells per byte.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, else, short declarations and
   the statements after each error message or failed check).  */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
1712 for (ch = 0; ch < 0x110000; ch++)
/* Bias by one so that the stored cell is never negative.  */
1714 int value = 1 + get_decdigit_value (ch);
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
1724 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of level1, level2 and level3 inside t.result, after the
   five header words.  Their assignment targets are not in this
   listing.  */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The second argument of this call is on a line missing from the
   listing.  */
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
/* Level 1, eight entries per output line: either a literal -1 or the
   index into level2 derived from the stored byte offset.  The test
   choosing between the two is not in this listing.  */
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
/* Level 2, same scheme, with indexes counted in unpacked level3
   cells.  */
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
/* Output byte i combines cell 2*i in its low nibble with cell 2*i+1 in
   its high nibble, so the byte count is half the cell count.  */
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
/* A stream error or a failing fclose both count as a write failure.  */
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* Digit value. See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of code point CH: the digit field of its
   unicode_attributes[] entry converted with atoi, provided the entry
   exists (name != NULL) and the field is not empty.
   NOTE(review): the fallback return for every other code point is not
   part of this listing.  The callers accept -1 as the smallest value,
   so it is presumably -1 -- confirm against the pristine file.
   NOTE(review): atoi reports no conversion errors; a malformed field
   silently yields 0.  */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
1828 /* Output the unit test for the per-character digit value table. */
/* Writes FILENAME as a generated C fragment: a banner naming VERSION,
   then rows of the form { 0xXXXX, value } separated by a comma and a
   newline, with values taken from get_digit_value.
   Every value is first checked to lie in -1..9.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): the lines deciding which code points get a row, and the
   action taken when the range check fails, are absent from this listing
   (stale line-number prefixes, short lines dropped).  */
1830 output_digit_test (const char *filename, const char *version)
1836 stream = fopen (filename, "w");
1839 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1843 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1844 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1845 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1849 for (ch = 0; ch < 0x110000; ch++)
1851 int value = get_digit_value (ch);
/* Sanity check: a digit value is -1 or a single digit.  */
1853 if (!(value >= -1 && value < 10))
/* Separator between consecutive rows, then the row itself.  */
1859 fprintf (stream, ",\n");
1860 fprintf (stream, " { 0x%04X, %d }", ch, value);
1865 fprintf (stream, "\n");
1867 if (ferror (stream) || fclose (stream))
1869 fprintf (stderr, "error writing to '%s'\n", filename);
1874 /* Output the per-character digit value table. */
/* Writes FILENAME as a generated C fragment holding a 3-level table
   named u_digit.  VERSION is the Unicode version named in the banner.
   Each code point 0..0x10FFFF is stored as 1 + get_digit_value, which
   must lie in 0..10.  The table type is struct decdigit_table, shared
   with output_decimal_digit.  Level3 is emitted packed, two 4-bit cells
   per byte.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, else, short declarations and
   the statements after each error message or failed check).  */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
1899 for (ch = 0; ch < 0x110000; ch++)
/* Bias by one so that the stored cell is never negative.  */
1901 int value = 1 + get_digit_value (ch);
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
1911 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of level1, level2 and level3 inside t.result, after the
   five header words.  Their assignment targets are not in this
   listing.  */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The second argument of this call is on a line missing from the
   listing.  */
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
/* Level 1, eight entries per output line: either a literal -1 or the
   index into level2 derived from the stored byte offset.  The test
   choosing between the two is not in this listing.  */
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
/* Level 2, same scheme, with indexes counted in unpacked level3
   cells.  */
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
/* Output byte i combines cell 2*i in its low nibble with cell 2*i+1 in
   its high nibble, so the byte count is half the cell count.  */
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
/* A stream error or a failing fclose both count as a write failure.  */
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.
   get_numeric_value sets both members to 0 for a code point without a
   numeric value, and uses denominator 1 for a plain integer.  */
2006 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of code point CH as a fraction.
   When CH has an entry in unicode_attributes[] (name != NULL) with a
   non-empty numeric field, the field is parsed: the numerator is the
   leading integer, the denominator the integer after a slash, or 1 when
   there is no slash.  The last two assignments set both members to 0.
   NOTE(review): the braces, the else keywords and the return statement
   are absent from this listing (stale line-number prefixes, short lines
   dropped); presumably the 0/0 pair is the result for every other code
   point.
   NOTE(review): strchr is evaluated twice on the same string, and atoi
   reports no conversion errors.  */
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
2018 value.numerator = atoi (str);
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
2022 value.denominator = 1;
2026 value.numerator = 0;
2027 value.denominator = 0;
2032 /* Output the unit test for the per-character numeric value table. */
/* Writes FILENAME as a generated C fragment: a banner naming VERSION,
   then rows of the form { 0xXXXX, numerator, denominator } separated by
   a comma and a newline.  The visible test selects code points whose
   numeric value is not the 0/0 pair.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function, among them the condition that guards
   the row separator.  */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
/* True for every code point that has a numeric value.  */
2057 if (value.numerator != 0 || value.denominator != 0)
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
2076 /* Construction of sparse 3-level tables. */
/* Parameters for the table template.  TABLE supplies the name of
   struct numeric_table and of the numeric_table_init, _add and _finalize
   functions used by output_numeric.  ELEMENT is presumably the type of
   one table cell; output_numeric stores an index into its fractions
   array there.
   xmalloc and xrealloc are mapped straight onto malloc and realloc.
   NOTE(review): the embedded line numbers skip 2079 and 2082; a DEFAULT
   definition and the include of the template are presumably what was
   dropped -- confirm against the pristine file.  */
2077 #define TABLE numeric_table
2078 #define ELEMENT uint8_t
2080 #define xmalloc malloc
2081 #define xrealloc realloc
2084 /* Output the per-character numeric value table. */
/* Writes FILENAME as a generated C fragment in two parts.  VERSION is the
   Unicode version named in the banner.
   Part 1: u_numeric_values, the list of distinct fractions returned by
   get_numeric_value over all code points, kept sorted by denominator and
   then by numerator.  At most 128 distinct fractions are supported.
   Part 2: u_numeric, a 3-level table mapping each code point to the index
   of its fraction in that list; level3 is bit-packed with 7 bits per
   code point in 16-bit units.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, break statements, counter
   updates, short declarations and the statements after each failed
   check).  Restore them from the pristine file before compiling.  */
2086 output_numeric (const char *filename, const char *version)
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for a fraction equal to value.  */
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
/* Not found: value is new.  */
2118 if (i == nfractions)
/* Capacity check against the 128 slots of fractions.  */
2120 if (nfractions == 128)
/* Find the first slot whose fraction sorts after value, comparing the
   denominator first and the numerator second.  */
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
/* Shift the tail up by one slot and insert value at position i.  */
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
/* Second pass: store for each code point the index of its fraction.  */
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
/* Every value was collected in the first pass, so a miss here would be
   an internal inconsistency.  */
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
2167 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of level1, level2 and level3 inside t.result, after the
   five header words.  Their assignment targets are not in this
   listing.  */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the declared level3 length, level3_size * ((1 << p) * 7
   / 16) + 1, equals the number of units written further down,
   (level3_size << p) * 7 / 16 + 1, only when (1 << p) * 7 is a multiple
   of 16, that is for p >= 4.  The value of p is not visible here.  */
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
/* Level 1, eight entries per output line: either a literal -1 or the
   index into level2 derived from the stored byte offset.  The test
   choosing between the two is not in this listing.  */
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
/* Level 2, same scheme, with indexes counted in unpacked level3
   cells.  */
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc result is dereferenced in the loop right
   below without a null check.  */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Cell i occupies bits i*7 .. i*7+6 of the packed stream: j is the
   16-bit unit holding its first bit, k the bit position inside that
   unit.  Units j and j+1 are merged into one 32-bit window, updated,
   and split again, because a cell may straddle two units.
   NOTE(review): this j shadows the function-level j declared above.  */
2236 for (i = 0; i < t.level3_size << t.p; i++)
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
/* Dump the packed units in hexadecimal, eight per output line.  */
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
/* A stream error or a failing fclose both count as a write failure.  */
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
2272 /* See Unicode 3.0 book, section 4.7. */
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
/* Each row holds the two code points of one pair; get_mirror_value
   searches both columns.
   NOTE(review): the initializer rows (stale line numbers 2278 to 2333)
   are absent from this listing.  Restore them from the pristine file.  */
2277 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information of code point CH.
   mirrored is true when CH has an entry in unicode_attributes[] whose
   mirrored flag is set.  mirror_char starts as the placeholder 0xfffd
   and becomes the partner of CH when CH occurs in either column of
   mirror_pairs.  The visible return statement yields the signed
   distance from CH to mirror_char.
   NOTE(review): this listing lacks the short lines of the function
   (return type, braces, break statements, the condition guarding the
   visible return and whatever follows the final test).  Do not infer the
   result for unmirrored characters from this text alone.  */
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
/* 0xfffd marks that no partner has been found yet.  */
2342 mirror_char = 0xfffd;
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
2355 return (int) mirror_char - (int) ch;
/* True when a partner was found in mirror_pairs.  */
2358 if (mirror_char != 0xfffd)
2364 /* Construction of sparse 3-level tables. */
/* Parameters for the table template.  TABLE supplies the name of
   struct mirror_table and of the mirror_table_init, _add and _finalize
   functions used by output_mirror.  ELEMENT is presumably the type of
   one table cell; it is signed because output_mirror stores the result
   of get_mirror_value, a signed difference.
   xmalloc and xrealloc are mapped straight onto malloc and realloc.
   NOTE(review): the embedded line numbers skip 2367 and 2370; a DEFAULT
   definition and the include of the template are presumably what was
   dropped -- confirm against the pristine file.  */
2365 #define TABLE mirror_table
2366 #define ELEMENT int32_t
2368 #define xmalloc malloc
2369 #define xrealloc realloc
2372 /* Output the per-character mirror table. */
/* Writes FILENAME as a generated C fragment holding a 3-level table
   named u_mirror.  VERSION is the Unicode version named in the banner.
   Each code point 0..0x10FFFF is stored as the int returned by
   get_mirror_value.  Level3 is emitted unpacked, one int per cell.
   Failure to open or to write FILENAME is reported on stderr.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, else, short declarations and
   the statements after each error message).  */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
/* One table cell per code point.  */
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
2406 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of level1, level2 and level3 inside t.result, after the
   five header words.  Their assignment targets are not in this
   listing.  */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
/* Level 1, eight entries per output line: either a literal -1 or the
   index into level2 derived from the stored byte offset.  The test
   choosing between the two is not in this listing.  */
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
/* Level 2, same scheme.  The divisor is the size of one int32_t cell,
   so the emitted value is an index into level3.  */
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
/* Level 3: the cells themselves, eight per output line.  */
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
/* A stream error or a failing fclose both count as a write failure.  */
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2496 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties uses each one as a bit position
   in unicode_properties[] (1ULL << value), so there must be fewer than
   64 of them.
   NOTE(review): this is only a fragment of the enumeration.  Its opening
   lines and every short enumerator are absent from this listing, for
   instance PROP_WHITE_SPACE, PROP_DASH and PROP_MATH, which
   fill_properties refers to.  The numeric values therefore cannot be
   read off this text.  */
2505 PROP_QUOTATION_MARK,
2506 PROP_TERMINAL_PUNCTUATION,
2509 PROP_ASCII_HEX_DIGIT,
2510 PROP_OTHER_ALPHABETIC,
2514 PROP_OTHER_LOWERCASE,
2515 PROP_OTHER_UPPERCASE,
2516 PROP_NONCHARACTER_CODE_POINT,
2517 PROP_OTHER_GRAPHEME_EXTEND,
2518 PROP_IDS_BINARY_OPERATOR,
2519 PROP_IDS_TRINARY_OPERATOR,
2521 PROP_UNIFIED_IDEOGRAPH,
2522 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2525 PROP_LOGICAL_ORDER_EXCEPTION,
2526 PROP_OTHER_ID_START,
2527 PROP_OTHER_ID_CONTINUE,
2529 PROP_VARIATION_SELECTOR,
2530 PROP_PATTERN_WHITE_SPACE,
2531 PROP_PATTERN_SYNTAX,
2532 /* DerivedCoreProperties.txt */
2541 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2542 PROP_GRAPHEME_EXTEND,
/* Property bit set of every code point 0..0x10FFFF.  Bit N of an element
   is set by fill_properties (1ULL << N) when the code point has the
   property whose PROP_* identifier has the value N.  */
unsigned long long unicode_properties[0x110000];

/* Resets unicode_properties[] so that no code point has any property.  */
static void
clear_properties (void)
{
  unsigned int i;

  /* The element count is derived from the array itself so that the loop
     cannot drift from the declaration.  */
  for (i = 0;
       i < sizeof (unicode_properties) / sizeof (unicode_properties[0]);
       i++)
    unicode_properties[i] = 0;
}
2557 /* Stores in unicode_properties[] the properties from the
2558 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is read line by line, at most 200 characters per
   line.  A data line names one code point or a range in hexadecimal,
   then a run of spaces and semicolons, then a property name.  The name
   is mapped to its PROP_* identifier and the corresponding bit is OR-ed
   into unicode_properties[] for every code point of the range.  Bits set
   by earlier calls are kept.
   Problems are reported on stderr: the file cannot be opened, a line
   does not parse, the property name is unknown, or reading fails.
   NOTE(review): this listing carries stale line-number prefixes and lacks
   the short lines of the function (braces, the declaration of buf, loop
   keywords, the end-of-range assignment for single code points and the
   statements after each error message or failed check).  */
2560 fill_properties (const char *proplist_filename)
2565 stream = fopen (proplist_filename, "r");
2568 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2575 unsigned int i1, i2;
2576 char padding[200+1];
2577 char propname[200+1];
2578 unsigned int propvalue;
/* Read one line; the newline in the format also consumes any white
   space that follows.  A result below 1 means no line was read.  */
2580 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and comment lines carry no data.  */
2583 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the two bracket conversions have no field width.  They
   cannot overflow padding or propname only because the line itself is
   limited to 200 characters.  */
2586 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2588 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2590 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP use expands to one link of an if / else chain on propname;
   the statement after the last link handles an unknown name.  */
2595 #define PROP(name,value) \
2596 if (strcmp (propname, name) == 0) propvalue = value; else
2598 PROP ("White_Space", PROP_WHITE_SPACE)
2599 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2600 PROP ("Join_Control", PROP_JOIN_CONTROL)
2601 PROP ("Dash", PROP_DASH)
2602 PROP ("Hyphen", PROP_HYPHEN)
2603 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2604 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2605 PROP ("Other_Math", PROP_OTHER_MATH)
2606 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2607 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2608 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2609 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2610 PROP ("Diacritic", PROP_DIACRITIC)
2611 PROP ("Extender", PROP_EXTENDER)
2612 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2613 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2614 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2615 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2616 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2617 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2618 PROP ("Radical", PROP_RADICAL)
2619 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2620 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2621 PROP ("Deprecated", PROP_DEPRECATED)
2622 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2623 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2624 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2625 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2626 PROP ("STerm", PROP_STERM)
2627 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2628 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2629 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2630 /* DerivedCoreProperties.txt */
2631 PROP ("Math", PROP_MATH)
2632 PROP ("Alphabetic", PROP_ALPHABETIC)
2633 PROP ("Lowercase", PROP_LOWERCASE)
2634 PROP ("Uppercase", PROP_UPPERCASE)
2635 PROP ("ID_Start", PROP_ID_START)
2636 PROP ("ID_Continue", PROP_ID_CONTINUE)
2637 PROP ("XID_Start", PROP_XID_START)
2638 PROP ("XID_Continue", PROP_XID_CONTINUE)
2639 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2640 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2641 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2642 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2645 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and stay inside the code space.  */
2649 if (!(i1 <= i2 && i2 < 0x110000))
2652 for (i = i1; i <= i2; i++)
2653 unicode_properties[i] |= 1ULL << propvalue;
/* A stream error or a failing fclose both count as a read failure.  */
2656 if (ferror (stream) || fclose (stream))
2658 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* fill_property30 (ARRAY, PROPLIST_FILENAME, PROPERTY_NAME):
   reads the Unicode 3.0 style property file PROPLIST_FILENAME, locates the
   section whose header line contains PROPERTY_NAME, and processes the code
   point lines that follow it.  ARRAY has one entry per code point
   (0x110000 entries).  Every failure visible here reports to stderr.
   NOTE(review): this copy of the function is missing its short lines
   (local declarations, braces, the statements after each fprintf and the
   loop bodies); the comments describe only what the visible lines show.  */
2663 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2666 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2672 for (i = 0; i < 0x110000; i++)
2675 stream = fopen (proplist_filename, "r");
2678 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2682 /* Search for the "Property dump for: ..." line. */
/* Lines are read with a limit of 100 characters each; the search stops at
   the first line that contains PROPERTY_NAME as a substring.  */
2685 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2687 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2691 while (strstr (buf, property_name) == NULL);
2695 unsigned int i1, i2;
2697 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line with ".." at offsets 4-5 is parsed as a range of two 4-digit hex
   numbers; otherwise a line of at least 4 characters is parsed as a single
   4-digit hex number; anything else is reported as a parse error.  */
2701 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2703 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2705 fprintf (stderr, "parse error in property in '%s'\n",
2710 else if (strlen (buf) >= 4)
2712 if (sscanf (buf, "%4X", &i1) < 1)
2714 fprintf (stderr, "parse error in property in '%s'\n",
2722 fprintf (stderr, "parse error in property in '%s'\n",
/* Range sanity check (ordered, and below 0x110000), then a walk over every
   code point of the range.  */
2726 if (!(i1 <= i2 && i2 < 0x110000))
2728 for (i = i1; i <= i2; i++)
/* Both a read error and a failing fclose are reported as a read error.  */
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2738 /* Properties from Unicode 3.0 PropList.txt file. */
/* Both arrays have one char per code point (0x110000 entries) and are passed
   to fill_property30 by fill_properties30.  */
2740 /* The paired punctuation property from the PropList.txt file. */
2741 char unicode_pairedpunctuation[0x110000];
2743 /* The left of pair property from the PropList.txt file. */
2744 char unicode_leftofpair[0x110000];
2747 fill_properties30 (const char *proplist30_filename)
2749 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2750 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2753 /* ------------------------------------------------------------------------- */
2755 /* See PropList.txt, UCD.html. */
2757 is_property_white_space (unsigned int ch)
2759 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
/* is_property_alphabetic: evaluates the Alphabetic property of CH in two
   ways and compares the results.  One value is an OR chain that includes
   the PROP_OTHER_ALPHABETIC bit and the explicit code point ranges below;
   the other is the PROP_ALPHABETIC bit of unicode_properties[ch].
   NOTE(review): the result declarations, the first operand of the OR
   chain, the action taken on mismatch and the return statement are
   missing from this copy -- confirm against the complete file.  */
2762 /* See Unicode 3.0 book, section 4.10,
2763 PropList.txt, UCD.html,
2764 DerivedCoreProperties.txt, UCD.html. */
2766 is_property_alphabetic (unsigned int ch)
2770 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2771 /* For some reason, the following are listed as having property
2772 Alphabetic but not as having property Other_Alphabetic. */
2773 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2774 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2775 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2776 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2777 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2778 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2779 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2780 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2781 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2782 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2783 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2784 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* Value taken directly from the Alphabetic bit.  */
2786 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the two computations.  */
2788 if (result1 != result2)
2793 /* See PropList.txt, UCD.html. */
2795 is_property_other_alphabetic (unsigned int ch)
2797 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2800 /* See PropList.txt, UCD.html. */
2802 is_property_not_a_character (unsigned int ch)
2804 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
/* is_property_default_ignorable_code_point: evaluates the property in two
   ways and compares them.  The derived form is: category Cf, excluding
   U+FFF9..U+FFFB, U+0600..U+0603, U+06DD and U+070F; or the
   PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit; or the
   PROP_VARIATION_SELECTOR bit.  The other form is the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit.
   NOTE(review): the result declarations, the action on mismatch and the
   return statement are missing from this copy.  */
2807 /* See PropList.txt, UCD.html,
2808 DerivedCoreProperties.txt, UCD.html. */
2810 is_property_default_ignorable_code_point (unsigned int ch)
2813 (is_category_Cf (ch)
2814 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2815 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2816 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2817 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Value taken directly from the Default_Ignorable_Code_Point bit.  */
2819 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Consistency check between the two computations.  */
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_default_ignorable_code_point (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_deprecated (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2840 /* See PropList.txt, UCD.html. */
2842 is_property_logical_order_exception (unsigned int ch)
2844 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2847 /* See PropList.txt, UCD.html. */
2849 is_property_variation_selector (unsigned int ch)
2851 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Reports whether CH lies in one of the three private use ranges.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  static const struct { unsigned int first; unsigned int last; } ranges[] =
    {
      { 0xE000, 0xF8FF },
      { 0xF0000, 0xFFFFD },
      { 0x100000, 0x10FFFD }
    };
  unsigned int k;

  for (k = 0; k < sizeof (ranges) / sizeof (ranges[0]); k++)
    if (ch >= ranges[k].first && ch <= ranges[k].last)
      return true;
  return false;
}
/* Reports whether CH has category Cn without being flagged by
   is_property_not_a_character.  See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
/* is_property_uppercase: evaluates the Uppercase property of CH in two ways
   and compares them.  One value is an OR chain that includes the
   PROP_OTHER_UPPERCASE bit; the other is the PROP_UPPERCASE bit.
   NOTE(review): the result declarations, the first operand of the OR
   chain, the action on mismatch and the return statement are missing from
   this copy -- confirm against the complete file.  */
2871 /* See PropList.txt, UCD.html,
2872 DerivedCoreProperties.txt, UCD.html. */
2874 is_property_uppercase (unsigned int ch)
2878 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2880 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
/* Consistency check between the two computations.  */
2882 if (result1 != result2)
2887 /* See PropList.txt, UCD.html. */
2889 is_property_other_uppercase (unsigned int ch)
2891 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* is_property_lowercase: evaluates the Lowercase property of CH in two ways
   and compares them.  One value is an OR chain that includes the
   PROP_OTHER_LOWERCASE bit; the other is the PROP_LOWERCASE bit.
   NOTE(review): the result declarations, the first operand of the OR
   chain, the action on mismatch and the return statement are missing from
   this copy -- confirm against the complete file.  */
2894 /* See PropList.txt, UCD.html,
2895 DerivedCoreProperties.txt, UCD.html. */
2897 is_property_lowercase (unsigned int ch)
2901 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2903 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
/* Consistency check between the two computations.  */
2905 if (result1 != result2)
2910 /* See PropList.txt, UCD.html. */
2912 is_property_other_lowercase (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Titlecase is defined here purely by is_category_Lt.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool in_lt = is_category_Lt (ch);

  return in_lt;
}
2924 /* See PropList.txt, UCD.html. */
2926 is_property_soft_dotted (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2931 /* See DerivedCoreProperties.txt, UCD.html. */
2933 is_property_id_start (unsigned int ch)
2935 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2938 /* See PropList.txt, UCD.html. */
2940 is_property_other_id_start (unsigned int ch)
2942 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2945 /* See DerivedCoreProperties.txt, UCD.html. */
2947 is_property_id_continue (unsigned int ch)
2949 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2952 /* See PropList.txt, UCD.html. */
2954 is_property_other_id_continue (unsigned int ch)
2956 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2959 /* See DerivedCoreProperties.txt, UCD.html. */
2961 is_property_xid_start (unsigned int ch)
2963 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2966 /* See DerivedCoreProperties.txt, UCD.html. */
2968 is_property_xid_continue (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2973 /* See PropList.txt, UCD.html. */
2975 is_property_pattern_white_space (unsigned int ch)
2977 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2980 /* See PropList.txt, UCD.html. */
2982 is_property_pattern_syntax (unsigned int ch)
2984 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2987 /* See PropList.txt, UCD.html. */
2989 is_property_join_control (unsigned int ch)
2991 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2994 /* See DerivedCoreProperties.txt, UCD.html. */
2996 is_property_grapheme_base (unsigned int ch)
2998 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3001 /* See DerivedCoreProperties.txt, UCD.html. */
3003 is_property_grapheme_extend (unsigned int ch)
3005 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3008 /* See PropList.txt, UCD.html. */
3010 is_property_other_grapheme_extend (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3015 /* See DerivedCoreProperties.txt, UCD.html. */
3017 is_property_grapheme_link (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3022 /* See PropList.txt, UCD.html. */
3024 is_property_bidi_control (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3029 /* See PropList-3.0.1.txt. */
3031 is_property_bidi_left_to_right (unsigned int ch)
3033 return (get_bidi_category (ch) == UC_BIDI_L);
3036 /* See PropList-3.0.1.txt. */
3038 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3040 return (get_bidi_category (ch) == UC_BIDI_R);
3043 /* See PropList-3.0.1.txt. */
3045 is_property_bidi_arabic_right_to_left (unsigned int ch)
3047 return (get_bidi_category (ch) == UC_BIDI_AL);
3050 /* See PropList-3.0.1.txt. */
3052 is_property_bidi_european_digit (unsigned int ch)
3054 return (get_bidi_category (ch) == UC_BIDI_EN);
3057 /* See PropList-3.0.1.txt. */
3059 is_property_bidi_eur_num_separator (unsigned int ch)
3061 return (get_bidi_category (ch) == UC_BIDI_ES);
3064 /* See PropList-3.0.1.txt. */
3066 is_property_bidi_eur_num_terminator (unsigned int ch)
3068 return (get_bidi_category (ch) == UC_BIDI_ET);
3071 /* See PropList-3.0.1.txt. */
3073 is_property_bidi_arabic_digit (unsigned int ch)
3075 return (get_bidi_category (ch) == UC_BIDI_AN);
3078 /* See PropList-3.0.1.txt. */
3080 is_property_bidi_common_separator (unsigned int ch)
3082 return (get_bidi_category (ch) == UC_BIDI_CS);
3085 /* See PropList-3.0.1.txt. */
3087 is_property_bidi_block_separator (unsigned int ch)
3089 return (get_bidi_category (ch) == UC_BIDI_B);
3092 /* See PropList-3.0.1.txt. */
3094 is_property_bidi_segment_separator (unsigned int ch)
3096 return (get_bidi_category (ch) == UC_BIDI_S);
3099 /* See PropList-3.0.1.txt. */
3101 is_property_bidi_whitespace (unsigned int ch)
3103 return (get_bidi_category (ch) == UC_BIDI_WS);
3106 /* See PropList-3.0.1.txt. */
3108 is_property_bidi_non_spacing_mark (unsigned int ch)
3110 return (get_bidi_category (ch) == UC_BIDI_NSM);
3113 /* See PropList-3.0.1.txt. */
3115 is_property_bidi_boundary_neutral (unsigned int ch)
3117 return (get_bidi_category (ch) == UC_BIDI_BN);
3120 /* See PropList-3.0.1.txt. */
3122 is_property_bidi_pdf (unsigned int ch)
3124 return (get_bidi_category (ch) == UC_BIDI_PDF);
3127 /* See PropList-3.0.1.txt. */
3129 is_property_bidi_embedding_or_override (unsigned int ch)
3131 int category = get_bidi_category (ch);
3132 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3133 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3136 /* See PropList-3.0.1.txt. */
3138 is_property_bidi_other_neutral (unsigned int ch)
3140 return (get_bidi_category (ch) == UC_BIDI_ON);
3143 /* See PropList.txt, UCD.html. */
3145 is_property_hex_digit (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3150 /* See PropList.txt, UCD.html. */
3152 is_property_ascii_hex_digit (unsigned int ch)
3154 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3157 /* See Unicode 3.0 book, section 4.10,
3158 PropList.txt, UCD.html. */
3160 is_property_ideographic (unsigned int ch)
3162 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3165 /* See PropList.txt, UCD.html. */
3167 is_property_unified_ideograph (unsigned int ch)
3169 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3172 /* See PropList.txt, UCD.html. */
3174 is_property_radical (unsigned int ch)
3176 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3179 /* See PropList.txt, UCD.html. */
3181 is_property_ids_binary_operator (unsigned int ch)
3183 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3186 /* See PropList.txt, UCD.html. */
3188 is_property_ids_trinary_operator (unsigned int ch)
3190 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3193 /* See PropList-3.0.1.txt. */
3195 is_property_zero_width (unsigned int ch)
3197 return is_category_Cf (ch)
3198 || (unicode_attributes[ch].name != NULL
3199 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Space is defined here purely by is_category_Zs.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  const bool in_zs = is_category_Zs (ch);

  return in_zs;
}
/* Reports whether CH is one of a fixed list of code points.
   See PropList-3.0.1.txt.
   NOTE(review): the original comment reads "This is exactly the set of
   characters having line breaking" and is cut off in this copy; the name of
   the line breaking property it refers to should be confirmed.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
/* is_property_iso_control: evaluates the property in two ways and compares
   them.  One value is "the character name of CH is exactly <control>"; the
   other is is_category_Cc (ch).
   NOTE(review): the result declarations, the action on mismatch and the
   return statement are missing from this copy.  */
3233 /* See PropList-3.0.1.txt. */
3235 is_property_iso_control (unsigned int ch)
3238 (unicode_attributes[ch].name != NULL
3239 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3241 is_category_Cc (ch);
/* Consistency check between the two computations.  */
3243 if (result1 != result2)
/* is_property_format_control: the visible conditions require category Cf,
   bidi category UC_BIDI_BN, and that CH is not a join control.
   NOTE(review): the expression is cut off after the third condition in this
   copy; any further condition and the closing of the return statement are
   not visible -- confirm against the complete file.  */
3248 /* See PropList-3.0.1.txt. */
3250 is_property_format_control (unsigned int ch)
3252 return (is_category_Cf (ch)
3253 && get_bidi_category (ch) == UC_BIDI_BN
3254 && !is_property_join_control (ch)
3258 /* See PropList.txt, UCD.html. */
3260 is_property_dash (unsigned int ch)
3262 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3265 /* See PropList.txt, UCD.html. */
3267 is_property_hyphen (unsigned int ch)
3269 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Punctuation is defined here purely by is_category_P.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  const bool in_p = is_category_P (ch);

  return in_p;
}
/* Line separator is defined here purely by is_category_Zl.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  const bool in_zl = is_category_Zl (ch);

  return in_zl;
}
/* Paragraph separator is defined here purely by is_category_Zp.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  const bool in_zp = is_category_Zp (ch);

  return in_zp;
}
3293 /* See PropList.txt, UCD.html. */
3295 is_property_quotation_mark (unsigned int ch)
3297 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3300 /* See PropList.txt, UCD.html. */
3302 is_property_sentence_terminal (unsigned int ch)
3304 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3307 /* See PropList.txt, UCD.html. */
3309 is_property_terminal_punctuation (unsigned int ch)
3311 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Currency symbol is defined here purely by is_category_Sc.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  const bool in_sc = is_category_Sc (ch);

  return in_sc;
}
/* is_property_math: evaluates the Math property of CH in two ways and
   compares them.  One value is an OR chain that includes the
   PROP_OTHER_MATH bit; the other is the PROP_MATH bit.
   NOTE(review): the result declarations, the first operand of the OR
   chain, the action on mismatch and the return statement are missing from
   this copy -- confirm against the complete file.  */
3321 /* See Unicode 3.0 book, section 4.9,
3322 PropList.txt, UCD.html,
3323 DerivedCoreProperties.txt, UCD.html. */
3325 is_property_math (unsigned int ch)
3329 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3331 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
/* Consistency check between the two computations.  */
3333 if (result1 != result2)
3338 /* See PropList.txt, UCD.html. */
3340 is_property_other_math (unsigned int ch)
3342 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3345 /* See PropList-3.0.1.txt. */
3347 is_property_paired_punctuation (unsigned int ch)
3349 return unicode_pairedpunctuation[ch];
3352 /* See PropList-3.0.1.txt. */
3354 is_property_left_of_pair (unsigned int ch)
3356 return unicode_leftofpair[ch];
3359 /* See PropList-3.0.1.txt. */
3361 is_property_combining (unsigned int ch)
3363 return (unicode_attributes[ch].name != NULL
3364 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3365 || is_category_Mc (ch)
3366 || is_category_Me (ch)
3367 || is_category_Mn (ch)));
/* Disabled predicate, kept for reference: it would report named characters
   whose bidi category is UC_BIDI_NSM.  The #if 0 line notes that it is the
   same as is_property_bidi_non_spacing_mark.  */
3370 #if 0 /* same as is_property_bidi_non_spacing_mark */
3371 /* See PropList-3.0.1.txt. */
3373 is_property_non_spacing (unsigned int ch)
3375 return (unicode_attributes[ch].name != NULL
3376 && get_bidi_category (ch) == UC_BIDI_NSM);
3380 /* See PropList-3.0.1.txt. */
3382 is_property_composite (unsigned int ch)
3384 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3385 logical in some sense. */
3386 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3388 if (unicode_attributes[ch].name != NULL
3389 && unicode_attributes[ch].decomposition != NULL)
3391 /* Test whether the decomposition contains more than one character,
3392 and the first is not a space. */
3393 const char *decomp = unicode_attributes[ch].decomposition;
3394 if (decomp[0] == '<')
3396 decomp = strchr (decomp, '>') + 1;
3397 if (decomp[0] == ' ')
3400 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Decimal digit is defined here purely by is_category_Nd.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  const bool in_nd = is_category_Nd (ch);

  return in_nd;
}
3412 /* See PropList-3.0.1.txt. */
3414 is_property_numeric (unsigned int ch)
3416 return ((get_numeric_value (ch)).denominator > 0)
3417 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3418 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3421 /* See PropList.txt, UCD.html. */
3423 is_property_diacritic (unsigned int ch)
3425 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3428 /* See PropList.txt, UCD.html. */
3430 is_property_extender (unsigned int ch)
3432 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
/* is_property_ignorable_control: the visible part of the condition accepts
   characters of category Cc whose bidi category is UC_BIDI_BN, and all
   characters of category Cf.
   NOTE(review): the return expression is not terminated in this copy; a
   further condition may follow on a line that is missing here -- confirm
   against the complete file.  */
3435 /* See PropList-3.0.1.txt. */
3437 is_property_ignorable_control (unsigned int ch)
3439 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3440 || is_category_Cf (ch))
3444 /* ------------------------------------------------------------------------- */
3446 /* Output all properties. */
/* output_properties (VERSION): for every property listed below, runs the
   three output steps of the PROPERTY macro with the matching
   is_property_<name> predicate.  VERSION is forwarded to output_predicate.
   NOTE(review): some PROPERTY(...) entries and the end of the macro's scope
   are on lines missing from this copy.  */
3448 output_properties (const char *version)
3450 #define PROPERTY(P) \
3451 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3452 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3453 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
/* PROPERTY(P) targets, per property P: unictype/pr_P.txt (debug output),
   ../tests/unictype/test-pr_P.c (test, using the expression
   "uc_is_property_P (c)") and unictype/pr_P.h (table named u_property_P).  */
3454 PROPERTY(white_space)
3455 PROPERTY(alphabetic)
3456 PROPERTY(other_alphabetic)
3457 PROPERTY(not_a_character)
3458 PROPERTY(default_ignorable_code_point)
3459 PROPERTY(other_default_ignorable_code_point)
3460 PROPERTY(deprecated)
3461 PROPERTY(logical_order_exception)
3462 PROPERTY(variation_selector)
3463 PROPERTY(private_use)
3464 PROPERTY(unassigned_code_value)
3466 PROPERTY(other_uppercase)
3468 PROPERTY(other_lowercase)
3470 PROPERTY(soft_dotted)
3472 PROPERTY(other_id_start)
3473 PROPERTY(id_continue)
3474 PROPERTY(other_id_continue)
3476 PROPERTY(xid_continue)
3477 PROPERTY(pattern_white_space)
3478 PROPERTY(pattern_syntax)
3479 PROPERTY(join_control)
3480 PROPERTY(grapheme_base)
3481 PROPERTY(grapheme_extend)
3482 PROPERTY(other_grapheme_extend)
3483 PROPERTY(grapheme_link)
3484 PROPERTY(bidi_control)
3485 PROPERTY(bidi_left_to_right)
3486 PROPERTY(bidi_hebrew_right_to_left)
3487 PROPERTY(bidi_arabic_right_to_left)
3488 PROPERTY(bidi_european_digit)
3489 PROPERTY(bidi_eur_num_separator)
3490 PROPERTY(bidi_eur_num_terminator)
3491 PROPERTY(bidi_arabic_digit)
3492 PROPERTY(bidi_common_separator)
3493 PROPERTY(bidi_block_separator)
3494 PROPERTY(bidi_segment_separator)
3495 PROPERTY(bidi_whitespace)
3496 PROPERTY(bidi_non_spacing_mark)
3497 PROPERTY(bidi_boundary_neutral)
3499 PROPERTY(bidi_embedding_or_override)
3500 PROPERTY(bidi_other_neutral)
3502 PROPERTY(ascii_hex_digit)
3503 PROPERTY(ideographic)
3504 PROPERTY(unified_ideograph)
3506 PROPERTY(ids_binary_operator)
3507 PROPERTY(ids_trinary_operator)
3508 PROPERTY(zero_width)
3511 PROPERTY(iso_control)
3512 PROPERTY(format_control)
3515 PROPERTY(punctuation)
3516 PROPERTY(line_separator)
3517 PROPERTY(paragraph_separator)
3518 PROPERTY(quotation_mark)
3519 PROPERTY(sentence_terminal)
3520 PROPERTY(terminal_punctuation)
3521 PROPERTY(currency_symbol)
3523 PROPERTY(other_math)
3524 PROPERTY(paired_punctuation)
3525 PROPERTY(left_of_pair)
3528 PROPERTY(decimal_digit)
3532 PROPERTY(ignorable_control)
3536 /* ========================================================================= */
/* Names of the scripts seen so far, in order of first appearance; the table
   has room for 256 entries.  */
3540 static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
3541 static unsigned int numscripts;
/* For each code point, an index into scripts[]; fill_scripts presets every
   entry to (uint8_t)~(uint8_t)0, which marks "no script assigned".  */
3543 static uint8_t unicode_scripts[0x110000];
/* fill_scripts (SCRIPTS_FILENAME): reads SCRIPTS_FILENAME line by line and
   records, for each code point or code point range, the index of its script
   name in unicode_scripts[], adding new names to scripts[].
   NOTE(review): the short lines of this function (declarations, braces,
   loop heads, exit/abort calls, the counter update) are missing from this
   copy; the comments describe only the visible lines.  */
3546 fill_scripts (const char *scripts_filename)
3551 stream = fopen (scripts_filename, "r");
3554 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Preset every code point to the "no script" marker.  */
3560 for (i = 0; i < 0x110000; i++)
3561 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3566 unsigned int i1, i2;
3567 char padding[200+1];
3568 char scriptname[200+1];
/* Lines are read with a limit of 200 characters; empty lines and lines
   starting with '#' are recognized by the test below.  */
3571 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3574 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form "XXXX..YYYY ; Name", then the single code point
   form "XXXX ; Name".  The name ends at the first space.  */
3577 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3579 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3581 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the known scripts, newest first.  */
3591 for (script = numscripts - 1; script >= 0; script--)
3592 if (strcmp (scripts[script], scriptname) == 0)
/* Name not known yet: append a copy of it and use the new index.  */
3596 scripts[numscripts] = strdup (scriptname);
3597 script = numscripts;
/* Guards the 256-entry capacity of scripts[].  */
3599 if (numscripts == 256)
/* Assign the script to each code point of the range; a code point that
   already has a script is reported on stderr and then overwritten.  */
3603 for (i = i1; i <= i2; i++)
3605 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3606 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3607 unicode_scripts[i] = script;
/* Both a read error and a failing fclose are reported as a read error.  */
3611 if (ferror (stream) || fclose (stream))
3613 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3618 /* Construction of sparse 3-level tables. */
/* Parameter macros for the table construction code: the table type is named
   script_table, its elements are uint8_t, and the default element is the
   all-ones byte (the same "no script" marker used in unicode_scripts[]).
   xmalloc and xrealloc are mapped onto plain malloc and realloc.
   NOTE(review): the line that consumes these macros is not visible in this
   copy; output_scripts uses struct script_table and the script_table_init /
   script_table_add / script_table_finalize functions.  */
3619 #define TABLE script_table
3620 #define ELEMENT uint8_t
3621 #define DEFAULT (uint8_t)~(uint8_t)0
3622 #define xmalloc malloc
3623 #define xrealloc realloc
/* output_scripts (VERSION): writes the file "unictype/scripts.h".  The file
   contains, in order: one uc_interval_t array per script, the scripts[]
   array of uc_script_t entries, five script_header_N defines, and the
   3-level lookup table u_script.  VERSION is printed in the header comment.
   NOTE(review): the short lines of this function (declarations, braces,
   some conditions and assignments) are missing from this copy; the comments
   describe only the visible lines.  */
3627 output_scripts (const char *version)
3629 const char *filename = "unictype/scripts.h";
3631 unsigned int ch, s, i;
3632 struct script_table t;
3633 unsigned int level1_offset, level2_offset, level3_offset;
3637 const char *lowercase_name;
3640 scriptinfo_t scriptinfo[256];
3642 stream = fopen (filename, "w");
3645 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3649 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3650 fprintf (stream, "/* Unicode scripts. */\n");
3651 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Derive, for each script, a copy of its name in which the letters A-Z are
   handled specially (see the range test below); the copy is stored as
   lowercase_name and used to build C identifiers.  */
3654 for (s = 0; s < numscripts; s++)
3656 char *lcp = strdup (scripts[s]);
3659 for (cp = lcp; *cp != '\0'; cp++)
3660 if (*cp >= 'A' && *cp <= 'Z')
3663 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script, by scanning all code points for
   maximal runs that belong to script S.  */
3666 for (s = 0; s < numscripts; s++)
3668 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3669 scriptinfo[s].lowercase_name);
3670 fprintf (stream, "{\n");
3672 for (ch = 0; ch < 0x110000; ch++)
3673 if (unicode_scripts[ch] == s)
3679 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3684 fprintf (stream, ",\n");
/* Two output shapes exist: a single entry flagged { start, 1, 1 }, or a
   pair of entries flagged { ..., 1, 0 } and { ..., 0, 1 }.  */
3686 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3688 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3692 fprintf (stream, "\n");
3693 fprintf (stream, "};\n");
/* Emit the scripts[] array: interval count, interval array and the script
   name with its original spelling.  */
3696 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3697 fprintf (stream, "{\n");
3698 for (s = 0; s < numscripts; s++)
3700 fprintf (stream, " {\n");
3701 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3702 scriptinfo[s].lowercase_name);
3703 fprintf (stream, " script_%s_intervals,\n",
3704 scriptinfo[s].lowercase_name);
3705 fprintf (stream, " \"%s\"\n", scripts[s]);
3706 fprintf (stream, " }");
3707 if (s+1 < numscripts)
3708 fprintf (stream, ",");
3709 fprintf (stream, "\n");
3711 fprintf (stream, "};\n");
/* Build the 3-level table from every code point that has a script.  */
3715 script_table_init (&t);
3717 for (ch = 0; ch < 0x110000; ch++)
3719 unsigned int s = unicode_scripts[ch];
3720 if (s != (uint8_t)~(uint8_t)0)
3721 script_table_add (&t, ch, s);
3724 script_table_finalize (&t);
3726 /* Offsets in t.result, in memory of this process. */
/* The three sums below are byte offsets: a header of five uint32_t values
   is followed by the level 1 entries, then the level 2 entries.  */
3728 5 * sizeof (uint32_t);
3730 5 * sizeof (uint32_t)
3731 + t.level1_size * sizeof (uint32_t);
3733 5 * sizeof (uint32_t)
3734 + t.level1_size * sizeof (uint32_t)
3735 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as defines, then the table structure.  */
3737 for (i = 0; i < 5; i++)
3738 fprintf (stream, "#define script_header_%d %d\n", i,
3739 ((uint32_t *) t.result)[i]);
3740 fprintf (stream, "static const\n");
3741 fprintf (stream, "struct\n");
3742 fprintf (stream, " {\n");
3743 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3744 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3745 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3746 fprintf (stream, " }\n");
3747 fprintf (stream, "u_script =\n");
3748 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; an entry is printed either as -1
   or as its offset relative to level2_offset, in units of uint32_t.  */
3749 fprintf (stream, " {");
3750 if (t.level1_size > 8)
3751 fprintf (stream, "\n ");
3752 for (i = 0; i < t.level1_size; i++)
3755 if (i > 0 && (i % 8) == 0)
3756 fprintf (stream, "\n ");
3757 offset = ((uint32_t *) (t.result + level1_offset))[i];
3759 fprintf (stream, " %5d", -1);
3761 fprintf (stream, " %5zu",
3762 (offset - level2_offset) / sizeof (uint32_t));
3763 if (i+1 < t.level1_size)
3764 fprintf (stream, ",");
3766 if (t.level1_size > 8)
3767 fprintf (stream, "\n ");
3768 fprintf (stream, " },\n");
/* Level 2: same layout; offsets are relative to level3_offset, in units of
   uint8_t.  */
3769 fprintf (stream, " {");
3770 if (t.level2_size << t.q > 8)
3771 fprintf (stream, "\n ");
3772 for (i = 0; i < t.level2_size << t.q; i++)
3775 if (i > 0 && (i % 8) == 0)
3776 fprintf (stream, "\n ");
3777 offset = ((uint32_t *) (t.result + level2_offset))[i];
3779 fprintf (stream, " %5d", -1);
3781 fprintf (stream, " %5zu",
3782 (offset - level3_offset) / sizeof (uint8_t));
3783 if (i+1 < t.level2_size << t.q)
3784 fprintf (stream, ",");
3786 if (t.level2_size << t.q > 8)
3787 fprintf (stream, "\n ");
3788 fprintf (stream, " },\n");
/* Level 3: the raw uint8_t elements, eight per output line.  */
3789 fprintf (stream, " {");
3790 if (t.level3_size << t.p > 8)
3791 fprintf (stream, "\n ");
3792 for (i = 0; i < t.level3_size << t.p; i++)
3794 if (i > 0 && (i % 8) == 0)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3797 if (i+1 < t.level3_size << t.p)
3798 fprintf (stream, ",");
3800 if (t.level3_size << t.p > 8)
3801 fprintf (stream, "\n ");
3802 fprintf (stream, " }\n");
3803 fprintf (stream, "};\n");
/* Both a write error and a failing fclose are reported as a write error.  */
3805 if (ferror (stream) || fclose (stream))
3807 fprintf (stderr, "error writing to '%s'\n", filename);
/* output_scripts_byname (VERSION): writes "unictype/scripts_byname.gperf",
   a gperf input file.  After a fixed set of gperf declarations it lists one
   "name, index" line per entry of scripts[].  VERSION is printed in the
   header comment.
   NOTE(review): declarations, braces and the statements after each error
   message are on lines missing from this copy.  */
3813 output_scripts_byname (const char *version)
3815 const char *filename = "unictype/scripts_byname.gperf";
3819 stream = fopen (filename, "w");
3822 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3826 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3827 fprintf (stream, "/* Unicode scripts. */\n");
3828 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* gperf declarations: record type, function names and table options.  The
   doubled percent signs print a single '%' each.  */
3830 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3831 fprintf (stream, "%%struct-type\n");
3832 fprintf (stream, "%%language=ANSI-C\n");
3833 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3834 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3835 fprintf (stream, "%%readonly-tables\n");
3836 fprintf (stream, "%%global-table\n");
3837 fprintf (stream, "%%define word-array-name script_names\n");
3838 fprintf (stream, "%%%%\n");
/* Keyword section: script name and its index in scripts[].  */
3839 for (s = 0; s < numscripts; s++)
3840 fprintf (stream, "%s, %u\n", scripts[s], s);
/* Both a write error and a failing fclose are reported as a write error.  */
3842 if (ferror (stream) || fclose (stream))
3844 fprintf (stderr, "error writing to '%s'\n", filename);
3849 /* ========================================================================= */
/* One block record: first code point, last code point, and block name.
   NOTE(review): the line carrying the typedef name is missing from this
   copy; the array below uses the type name block_t.  */
3853 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* Block records in file order; the table has room for 256 entries.  */
3855 static block_t blocks[256];
/* Number of entries of blocks[] in use.  */
3856 static unsigned int numblocks;
/* fill_blocks (BLOCKS_FILENAME): reads BLOCKS_FILENAME line by line and
   appends one record per "XXXX..YYYY; Name" line to blocks[].
   NOTE(review): declarations, braces, loop heads, the counter update and
   the statements after each check are on lines missing from this copy.  */
3859 fill_blocks (const char *blocks_filename)
3863 stream = fopen (blocks_filename, "r");
3866 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3873 unsigned int i1, i2;
3874 char padding[200+1];
3875 char blockname[200+1];
/* Lines are read with a limit of 200 characters; empty lines and lines
   starting with '#' are recognized by the test below.  */
3877 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3880 if (buf[0] == '\0' || buf[0] == '#')
/* The block name extends up to a carriage return or the end of the line,
   so it may contain spaces.  */
3883 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3885 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3888 blocks[numblocks].start = i1;
3889 blocks[numblocks].end = i2;
3890 blocks[numblocks].name = strdup (blockname);
3891 /* It must be sorted. */
/* Each block has to start after the end of the previous one.  */
3892 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* Guards the 256-entry capacity of blocks[].  */
3895 if (numblocks == 256)
/* Both a read error and a failing fclose are reported as a read error.  */
3899 if (ferror (stream) || fclose (stream))
3901 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3906 /* Return the smallest block index among the blocks for characters >= ch. */
/* The search runs over blocks[0..numblocks) and compares blocks[mid].end
   with CH.
   NOTE(review): the loop head, both branch bodies, the return statement and
   the opening line of the invariant comment are missing from this copy.  */
3908 block_first_index (unsigned int ch)
3910 /* Binary search. */
3911 unsigned int lo = 0;
3912 unsigned int hi = numblocks;
3914 All blocks[i], i < lo, have blocks[i].end < ch,
3915 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3918 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3919 if (blocks[mid].end < ch)
/* block_last_index (CH): counterpart of block_first_index.  The search runs
   over blocks[0..numblocks) and compares blocks[mid].start with CH.
   NOTE(review): the end of the leading comment, the loop head, both branch
   bodies, the return statement and the opening line of the invariant
   comment are missing from this copy.  */
3927 /* Return the largest block index among the blocks for characters <= ch,
3930 block_last_index (unsigned int ch)
3932 /* Binary search. */
3933 unsigned int lo = 0;
3934 unsigned int hi = numblocks;
3936 All blocks[i], i < lo, have blocks[i].start <= ch,
3937 all blocks[i], i >= hi, have blocks[i].start > ch. */
3940 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3941 if (blocks[mid].start <= ch)
/* output_blocks (VERSION): writes the file "unictype/blocks.h".  The file
   contains the blocks[] array, a first-level index table covering code
   points below THRESHOLD in chunks of 1 << SHIFT code points, and the index
   range of the blocks at or above THRESHOLD.  VERSION is printed in the
   header comment.
   NOTE(review): declarations, braces and the statements after each error
   message are on lines missing from this copy.  */
3950 output_blocks (const char *version)
3952 const char *filename = "unictype/blocks.h";
3953 const unsigned int shift = 8; /* bits to shift away for array access */
3954 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3959 stream = fopen (filename, "w");
3962 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3966 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3967 fprintf (stream, "/* Unicode blocks. */\n");
3968 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* All block records: start, end and name.  */
3971 fprintf (stream, "static const uc_block_t blocks[] =\n");
3972 fprintf (stream, "{\n");
3973 for (i = 0; i < numblocks; i++)
3975 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3976 blocks[i].end, blocks[i].name);
3977 if (i+1 < numblocks)
3978 fprintf (stream, ",");
3979 fprintf (stream, "\n");
3981 fprintf (stream, "};\n");
3982 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3983 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
/* First-level table: two uint8_t values per chunk, namely the result of
   block_first_index for the first code point of the chunk and the result of
   block_last_index for its last code point.  */
3984 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3985 threshold >> shift);
3986 fprintf (stream, "{\n");
3987 for (i1 = 0; i1 < (threshold >> shift); i1++)
3989 unsigned int first_index = block_first_index (i1 << shift);
3990 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3991 fprintf (stream, " %3d, %3d", first_index, last_index);
3992 if (i1+1 < (threshold >> shift))
3993 fprintf (stream, ",");
3994 fprintf (stream, "\n");
3996 fprintf (stream, "};\n");
/* Index range for the code points from THRESHOLD up to 0x10FFFF.  */
3997 fprintf (stream, "#define blocks_upper_first_index %d\n",
3998 block_first_index (threshold));
3999 fprintf (stream, "#define blocks_upper_last_index %d\n",
4000 block_last_index (0x10FFFF));
/* Both a write error and a failing fclose are reported as a write error.  */
4002 if (ferror (stream) || fclose (stream))
4004 fprintf (stderr, "error writing to '%s'\n", filename);
4009 /* ========================================================================= */
4011 /* C and Java syntax. */
/* Identifier syntax category of a character.  The numeric values are the
   ones implied by declaration order.  */
enum
{
  UC_IDENTIFIER_START = 0,     /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID = 1,     /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID = 2,   /* not valid */
  UC_IDENTIFIER_IGNORABLE = 3  /* ignorable (Java only) */
};
/* White-space characters of ISO C 99 section 6.4.(3): space, horizontal
   tab, new-line, vertical tab and form-feed.  Both '\n' and '\r' are
   accepted as new-line.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':  /* space */
    case '\t': /* horizontal tab */
    case '\n': /* new-line */
    case '\r': /* new-line */
    case '\v': /* vertical tab */
    case '\f': /* form-feed */
      return true;
    default:
      return false;
    }
}
4032 /* ISO C 99 section 6.4.2.1 and appendix D. */
/* Classify CH for use in a C identifier.
   The ASCII digits '0'..'9' yield UC_IDENTIFIER_VALID; ASCII letters and '_'
   yield UC_IDENTIFIER_START.  The long chain of range tests that follows
   ends in a single "return UC_IDENTIFIER_START", and anything not matched
   yields UC_IDENTIFIER_INVALID.
   NOTE(review): the chain also contains ranges that look like decimal digit
   blocks (e.g. 0x0660..0x0669, 0x0966..0x096F), so they are classified as
   START rather than VALID.  ISO C 99 6.4.2.1 says the initial character of
   an identifier shall not be a universal character name designating a
   digit - confirm whether START is intended for these ranges.
   NOTE(review): this listing does not show the opening of the chain nor the
   single-code-point tests; do not treat the visible ranges as complete. */
4034 c_ident_category (unsigned int ch)
4036 /* Section 6.4.2.1. */
4037 if (ch >= '0' && ch <= '9')
4038 return UC_IDENTIFIER_VALID;
4039 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4040 return UC_IDENTIFIER_START;
4046 || (ch >= 0x00C0 && ch <= 0x00D6)
4047 || (ch >= 0x00D8 && ch <= 0x00F6)
4048 || (ch >= 0x00F8 && ch <= 0x01F5)
4049 || (ch >= 0x01FA && ch <= 0x0217)
4050 || (ch >= 0x0250 && ch <= 0x02A8)
4051 || (ch >= 0x1E00 && ch <= 0x1E9B)
4052 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4056 || (ch >= 0x0388 && ch <= 0x038A)
4058 || (ch >= 0x038E && ch <= 0x03A1)
4059 || (ch >= 0x03A3 && ch <= 0x03CE)
4060 || (ch >= 0x03D0 && ch <= 0x03D6)
4065 || (ch >= 0x03E2 && ch <= 0x03F3)
4066 || (ch >= 0x1F00 && ch <= 0x1F15)
4067 || (ch >= 0x1F18 && ch <= 0x1F1D)
4068 || (ch >= 0x1F20 && ch <= 0x1F45)
4069 || (ch >= 0x1F48 && ch <= 0x1F4D)
4070 || (ch >= 0x1F50 && ch <= 0x1F57)
4074 || (ch >= 0x1F5F && ch <= 0x1F7D)
4075 || (ch >= 0x1F80 && ch <= 0x1FB4)
4076 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4077 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4078 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4079 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4080 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4081 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4082 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4083 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4085 || (ch >= 0x0401 && ch <= 0x040C)
4086 || (ch >= 0x040E && ch <= 0x044F)
4087 || (ch >= 0x0451 && ch <= 0x045C)
4088 || (ch >= 0x045E && ch <= 0x0481)
4089 || (ch >= 0x0490 && ch <= 0x04C4)
4090 || (ch >= 0x04C7 && ch <= 0x04C8)
4091 || (ch >= 0x04CB && ch <= 0x04CC)
4092 || (ch >= 0x04D0 && ch <= 0x04EB)
4093 || (ch >= 0x04EE && ch <= 0x04F5)
4094 || (ch >= 0x04F8 && ch <= 0x04F9)
4096 || (ch >= 0x0531 && ch <= 0x0556)
4097 || (ch >= 0x0561 && ch <= 0x0587)
4099 || (ch >= 0x05B0 && ch <= 0x05B9)
4100 || (ch >= 0x05BB && ch <= 0x05BD)
4102 || (ch >= 0x05C1 && ch <= 0x05C2)
4103 || (ch >= 0x05D0 && ch <= 0x05EA)
4104 || (ch >= 0x05F0 && ch <= 0x05F2)
4106 || (ch >= 0x0621 && ch <= 0x063A)
4107 || (ch >= 0x0640 && ch <= 0x0652)
4108 || (ch >= 0x0670 && ch <= 0x06B7)
4109 || (ch >= 0x06BA && ch <= 0x06BE)
4110 || (ch >= 0x06C0 && ch <= 0x06CE)
4111 || (ch >= 0x06D0 && ch <= 0x06DC)
4112 || (ch >= 0x06E5 && ch <= 0x06E8)
4113 || (ch >= 0x06EA && ch <= 0x06ED)
4115 || (ch >= 0x0901 && ch <= 0x0903)
4116 || (ch >= 0x0905 && ch <= 0x0939)
4117 || (ch >= 0x093E && ch <= 0x094D)
4118 || (ch >= 0x0950 && ch <= 0x0952)
4119 || (ch >= 0x0958 && ch <= 0x0963)
4121 || (ch >= 0x0981 && ch <= 0x0983)
4122 || (ch >= 0x0985 && ch <= 0x098C)
4123 || (ch >= 0x098F && ch <= 0x0990)
4124 || (ch >= 0x0993 && ch <= 0x09A8)
4125 || (ch >= 0x09AA && ch <= 0x09B0)
4127 || (ch >= 0x09B6 && ch <= 0x09B9)
4128 || (ch >= 0x09BE && ch <= 0x09C4)
4129 || (ch >= 0x09C7 && ch <= 0x09C8)
4130 || (ch >= 0x09CB && ch <= 0x09CD)
4131 || (ch >= 0x09DC && ch <= 0x09DD)
4132 || (ch >= 0x09DF && ch <= 0x09E3)
4133 || (ch >= 0x09F0 && ch <= 0x09F1)
4136 || (ch >= 0x0A05 && ch <= 0x0A0A)
4137 || (ch >= 0x0A0F && ch <= 0x0A10)
4138 || (ch >= 0x0A13 && ch <= 0x0A28)
4139 || (ch >= 0x0A2A && ch <= 0x0A30)
4140 || (ch >= 0x0A32 && ch <= 0x0A33)
4141 || (ch >= 0x0A35 && ch <= 0x0A36)
4142 || (ch >= 0x0A38 && ch <= 0x0A39)
4143 || (ch >= 0x0A3E && ch <= 0x0A42)
4144 || (ch >= 0x0A47 && ch <= 0x0A48)
4145 || (ch >= 0x0A4B && ch <= 0x0A4D)
4146 || (ch >= 0x0A59 && ch <= 0x0A5C)
4150 || (ch >= 0x0A81 && ch <= 0x0A83)
4151 || (ch >= 0x0A85 && ch <= 0x0A8B)
4153 || (ch >= 0x0A8F && ch <= 0x0A91)
4154 || (ch >= 0x0A93 && ch <= 0x0AA8)
4155 || (ch >= 0x0AAA && ch <= 0x0AB0)
4156 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4157 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4158 || (ch >= 0x0ABD && ch <= 0x0AC5)
4159 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4160 || (ch >= 0x0ACB && ch <= 0x0ACD)
4164 || (ch >= 0x0B01 && ch <= 0x0B03)
4165 || (ch >= 0x0B05 && ch <= 0x0B0C)
4166 || (ch >= 0x0B0F && ch <= 0x0B10)
4167 || (ch >= 0x0B13 && ch <= 0x0B28)
4168 || (ch >= 0x0B2A && ch <= 0x0B30)
4169 || (ch >= 0x0B32 && ch <= 0x0B33)
4170 || (ch >= 0x0B36 && ch <= 0x0B39)
4171 || (ch >= 0x0B3E && ch <= 0x0B43)
4172 || (ch >= 0x0B47 && ch <= 0x0B48)
4173 || (ch >= 0x0B4B && ch <= 0x0B4D)
4174 || (ch >= 0x0B5C && ch <= 0x0B5D)
4175 || (ch >= 0x0B5F && ch <= 0x0B61)
4177 || (ch >= 0x0B82 && ch <= 0x0B83)
4178 || (ch >= 0x0B85 && ch <= 0x0B8A)
4179 || (ch >= 0x0B8E && ch <= 0x0B90)
4180 || (ch >= 0x0B92 && ch <= 0x0B95)
4181 || (ch >= 0x0B99 && ch <= 0x0B9A)
4183 || (ch >= 0x0B9E && ch <= 0x0B9F)
4184 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4185 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4186 || (ch >= 0x0BAE && ch <= 0x0BB5)
4187 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4188 || (ch >= 0x0BBE && ch <= 0x0BC2)
4189 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4190 || (ch >= 0x0BCA && ch <= 0x0BCD)
4192 || (ch >= 0x0C01 && ch <= 0x0C03)
4193 || (ch >= 0x0C05 && ch <= 0x0C0C)
4194 || (ch >= 0x0C0E && ch <= 0x0C10)
4195 || (ch >= 0x0C12 && ch <= 0x0C28)
4196 || (ch >= 0x0C2A && ch <= 0x0C33)
4197 || (ch >= 0x0C35 && ch <= 0x0C39)
4198 || (ch >= 0x0C3E && ch <= 0x0C44)
4199 || (ch >= 0x0C46 && ch <= 0x0C48)
4200 || (ch >= 0x0C4A && ch <= 0x0C4D)
4201 || (ch >= 0x0C60 && ch <= 0x0C61)
4203 || (ch >= 0x0C82 && ch <= 0x0C83)
4204 || (ch >= 0x0C85 && ch <= 0x0C8C)
4205 || (ch >= 0x0C8E && ch <= 0x0C90)
4206 || (ch >= 0x0C92 && ch <= 0x0CA8)
4207 || (ch >= 0x0CAA && ch <= 0x0CB3)
4208 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4209 || (ch >= 0x0CBE && ch <= 0x0CC4)
4210 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4211 || (ch >= 0x0CCA && ch <= 0x0CCD)
4213 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4215 || (ch >= 0x0D02 && ch <= 0x0D03)
4216 || (ch >= 0x0D05 && ch <= 0x0D0C)
4217 || (ch >= 0x0D0E && ch <= 0x0D10)
4218 || (ch >= 0x0D12 && ch <= 0x0D28)
4219 || (ch >= 0x0D2A && ch <= 0x0D39)
4220 || (ch >= 0x0D3E && ch <= 0x0D43)
4221 || (ch >= 0x0D46 && ch <= 0x0D48)
4222 || (ch >= 0x0D4A && ch <= 0x0D4D)
4223 || (ch >= 0x0D60 && ch <= 0x0D61)
4225 || (ch >= 0x0E01 && ch <= 0x0E3A)
4226 || (ch >= 0x0E40 && ch <= 0x0E5B)
4228 || (ch >= 0x0E81 && ch <= 0x0E82)
4230 || (ch >= 0x0E87 && ch <= 0x0E88)
4233 || (ch >= 0x0E94 && ch <= 0x0E97)
4234 || (ch >= 0x0E99 && ch <= 0x0E9F)
4235 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4238 || (ch >= 0x0EAA && ch <= 0x0EAB)
4239 || (ch >= 0x0EAD && ch <= 0x0EAE)
4240 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4241 || (ch >= 0x0EBB && ch <= 0x0EBD)
4242 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4244 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4245 || (ch >= 0x0EDC && ch <= 0x0EDD)
4248 || (ch >= 0x0F18 && ch <= 0x0F19)
4252 || (ch >= 0x0F3E && ch <= 0x0F47)
4253 || (ch >= 0x0F49 && ch <= 0x0F69)
4254 || (ch >= 0x0F71 && ch <= 0x0F84)
4255 || (ch >= 0x0F86 && ch <= 0x0F8B)
4256 || (ch >= 0x0F90 && ch <= 0x0F95)
4258 || (ch >= 0x0F99 && ch <= 0x0FAD)
4259 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4262 || (ch >= 0x10A0 && ch <= 0x10C5)
4263 || (ch >= 0x10D0 && ch <= 0x10F6)
4265 || (ch >= 0x3041 && ch <= 0x3093)
4266 || (ch >= 0x309B && ch <= 0x309C)
4268 || (ch >= 0x30A1 && ch <= 0x30F6)
4269 || (ch >= 0x30FB && ch <= 0x30FC)
4271 || (ch >= 0x3105 && ch <= 0x312C)
4272 /* CJK Unified Ideographs */
4273 || (ch >= 0x4E00 && ch <= 0x9FA5)
4275 || (ch >= 0xAC00 && ch <= 0xD7A3)
4277 || (ch >= 0x0660 && ch <= 0x0669)
4278 || (ch >= 0x06F0 && ch <= 0x06F9)
4279 || (ch >= 0x0966 && ch <= 0x096F)
4280 || (ch >= 0x09E6 && ch <= 0x09EF)
4281 || (ch >= 0x0A66 && ch <= 0x0A6F)
4282 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4283 || (ch >= 0x0B66 && ch <= 0x0B6F)
4284 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4285 || (ch >= 0x0C66 && ch <= 0x0C6F)
4286 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4287 || (ch >= 0x0D66 && ch <= 0x0D6F)
4288 || (ch >= 0x0E50 && ch <= 0x0E59)
4289 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4290 || (ch >= 0x0F20 && ch <= 0x0F33)
4291 /* Special characters */
4294 || (ch >= 0x02B0 && ch <= 0x02B8)
4296 || (ch >= 0x02BD && ch <= 0x02C1)
4297 || (ch >= 0x02D0 && ch <= 0x02D1)
4298 || (ch >= 0x02E0 && ch <= 0x02E4)
4304 || (ch >= 0x203F && ch <= 0x2040)
4307 || (ch >= 0x210A && ch <= 0x2113)
4309 || (ch >= 0x2118 && ch <= 0x211D)
4313 || (ch >= 0x212A && ch <= 0x2131)
4314 || (ch >= 0x2133 && ch <= 0x2138)
4315 || (ch >= 0x2160 && ch <= 0x2182)
4316 || (ch >= 0x3005 && ch <= 0x3007)
4317 || (ch >= 0x3021 && ch <= 0x3029)
4319 return UC_IDENTIFIER_START;
4320 return UC_IDENTIFIER_INVALID;
/* White space as defined by The Java Language Specification, 3rd edition,
   §3.6:
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   i.e. space, horizontal tab, form feed and the line terminators.  Unlike
   the C definition, vertical tab is not included.  */
static bool
is_java_whitespace (unsigned int ch)
{
  /* Line terminators.  */
  if (ch == '\n' || ch == '\r')
    return true;
  /* Remaining white space characters.  */
  return ch == ' ' || ch == '\t' || ch == '\f';
}
4332 /* The Java Language Specification, 3rd edition, §3.8.
4333 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4334 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4336 java_ident_category (unsigned int ch)
4338 /* FIXME: Check this against Sun's JDK implementation. */
4339 if (is_category_L (ch) /* = Character.isLetter(ch) */
4340 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4341 || is_category_Sc (ch) /* currency symbol */
4342 || is_category_Pc (ch) /* connector punctuation */
4344 return UC_IDENTIFIER_START;
4345 if (is_category_Nd (ch) /* digit */
4346 || is_category_Mc (ch) /* combining mark */
4347 || is_category_Mn (ch) /* non-spacing mark */
4349 return UC_IDENTIFIER_VALID;
4350 if ((ch >= 0x0000 && ch <= 0x0008)
4351 || (ch >= 0x000E && ch <= 0x001B)
4352 || (ch >= 0x007F && ch <= 0x009F)
4353 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4355 return UC_IDENTIFIER_IGNORABLE;
4356 return UC_IDENTIFIER_INVALID;
4359 /* Construction of sparse 3-level tables. */
/* TABLE supplies the name used below for struct identsyntax_table and the
   identsyntax_table_init / _add / _finalize functions.  ELEMENT is
   presumably the per-character value type and DEFAULT the value of entries
   that are never added (output_ident_category only adds values different
   from UC_IDENTIFIER_INVALID).  xmalloc and xrealloc are mapped onto plain
   malloc and realloc.
   NOTE(review): the inclusion of the table template that consumes these
   macros is not visible in this listing - confirm it follows directly. */
4360 #define TABLE identsyntax_table
4361 #define ELEMENT uint8_t
4362 #define DEFAULT UC_IDENTIFIER_INVALID
4363 #define xmalloc malloc
4364 #define xrealloc realloc
4367 /* Output an identifier syntax categorization in a three-level bitmap. */
/* FILENAME  - path of the C header to generate.
   PREDICATE - classifier called once for every code point below 0x110000;
               every result other than UC_IDENTIFIER_INVALID is stored.
   NAME      - identifier given to the generated static const struct.
   VERSION   - Unicode version string for the generated header comment.
   The generated struct holds three arrays: level1 and level2 contain
   indices into the next level (or -1), and level3 packs eight 2-bit
   categories into each 16-bit word.
   NOTE(review): several short lines of this function (local declarations,
   the table's p/q setup, the tests that choose between printing -1 and an
   index) are not visible in this listing. */
4369 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4373 struct identsyntax_table t;
4374 unsigned int level1_offset, level2_offset, level3_offset;
4376 stream = fopen (filename, "w");
4379 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4383 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4384 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4385 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4390 identsyntax_table_init (&t);
/* Fill the table; entries that are never added keep the table's default. */
4392 for (ch = 0; ch < 0x110000; ch++)
4394 int syntaxcode = predicate (ch);
4395 if (syntaxcode != UC_IDENTIFIER_INVALID)
4396 identsyntax_table_add (&t, ch, syntaxcode);
4399 identsyntax_table_finalize (&t);
4401 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   entries, the level2 entries and then the level3 data. */
4403 5 * sizeof (uint32_t);
4405 5 * sizeof (uint32_t)
4406 + t.level1_size * sizeof (uint32_t);
4408 5 * sizeof (uint32_t)
4409 + t.level1_size * sizeof (uint32_t)
4410 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words are emitted as preprocessor constants. */
4412 for (i = 0; i < 5; i++)
4413 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4414 ((uint32_t *) t.result)[i]);
4415 fprintf (stream, "static const\n");
4416 fprintf (stream, "struct\n");
4417 fprintf (stream, " {\n");
4418 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4419 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4420 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4421 (1 << t.p) * 2 / 16);
4422 fprintf (stream, " }\n");
4423 fprintf (stream, "%s =\n", name);
4424 fprintf (stream, "{\n");
/* level1: byte offsets are converted to uint32_t indices relative to the
   start of level2; eight values per output line. */
4425 fprintf (stream, " {");
4426 if (t.level1_size > 8)
4427 fprintf (stream, "\n ");
4428 for (i = 0; i < t.level1_size; i++)
4431 if (i > 0 && (i % 8) == 0)
4432 fprintf (stream, "\n ");
4433 offset = ((uint32_t *) (t.result + level1_offset))[i];
4435 fprintf (stream, " %5d", -1);
4437 fprintf (stream, " %5zu",
4438 (offset - level2_offset) / sizeof (uint32_t));
4439 if (i+1 < t.level1_size)
4440 fprintf (stream, ",");
4442 if (t.level1_size > 8)
4443 fprintf (stream, "\n ");
4444 fprintf (stream, " },\n");
/* level2: byte offsets are converted to uint8_t indices relative to the
   start of level3. */
4445 fprintf (stream, " {");
4446 if (t.level2_size << t.q > 8)
4447 fprintf (stream, "\n ");
4448 for (i = 0; i < t.level2_size << t.q; i++)
4451 if (i > 0 && (i % 8) == 0)
4452 fprintf (stream, "\n ");
4453 offset = ((uint32_t *) (t.result + level2_offset))[i];
4455 fprintf (stream, " %5d", -1);
4457 fprintf (stream, " %5zu",
4458 (offset - level3_offset) / sizeof (uint8_t));
4459 if (i+1 < t.level2_size << t.q)
4460 fprintf (stream, ",");
4462 if (t.level2_size << t.q > 8)
4463 fprintf (stream, "\n ");
4464 fprintf (stream, " },\n");
4465 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Entry 8*i+k of the in-memory byte array lands in bits 2k..2k+1 of output
   word i. */
4466 fprintf (stream, " {");
4467 if ((t.level3_size << t.p) * 2 / 16 > 8)
4468 fprintf (stream, "\n ");
4469 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4471 if (i > 0 && (i % 8) == 0)
4472 fprintf (stream, "\n ");
4473 fprintf (stream, " 0x%04x",
4474 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4479 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4480 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4481 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4482 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4483 fprintf (stream, ",");
4485 if ((t.level3_size << t.p) * 2 / 16 > 8)
4486 fprintf (stream, "\n ");
4487 fprintf (stream, " }\n");
4488 fprintf (stream, "};\n");
4490 if (ferror (stream) || fclose (stream))
4492 fprintf (stderr, "error writing to '%s'\n", filename);
4498 output_ident_properties (const char *version)
4500 #define PROPERTY(P) \
4501 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4502 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4503 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4504 PROPERTY(c_whitespace)
4505 PROPERTY(java_whitespace)
4508 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4509 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4512 /* ========================================================================= */
4514 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4515 glibc/localedata/locales/i18n file, generated by
4516 glibc/localedata/gen-unicode-ctype.c. */
4518 /* Character mappings. */
4521 to_upper (unsigned int ch)
4523 if (unicode_attributes[ch].name != NULL
4524 && unicode_attributes[ch].upper != NONE)
4525 return unicode_attributes[ch].upper;
4531 to_lower (unsigned int ch)
4533 if (unicode_attributes[ch].name != NULL
4534 && unicode_attributes[ch].lower != NONE)
4535 return unicode_attributes[ch].lower;
4541 to_title (unsigned int ch)
4543 if (unicode_attributes[ch].name != NULL
4544 && unicode_attributes[ch].title != NONE)
4545 return unicode_attributes[ch].title;
4550 /* Character class properties. */
/* Uppercase: CH has a simple lowercase mapping different from itself.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* Lowercase: CH has a simple uppercase mapping different from itself, or CH
   is <U00DF>.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;
  return to_upper (ch) != ch;
}
4567 is_alpha (unsigned int ch)
4569 return (unicode_attributes[ch].name != NULL
4570 && ((unicode_attributes[ch].category[0] == 'L'
4571 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4572 <U0E2F>, <U0E46> should belong to is_punct. */
4573 && (ch != 0x0E2F) && (ch != 0x0E46))
4574 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4575 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4577 || (ch >= 0x0E34 && ch <= 0x0E3A)
4578 || (ch >= 0x0E47 && ch <= 0x0E4E)
4579 /* Avoid warning for <U0345>. */
4581 /* Avoid warnings for <U2160>..<U217F>. */
4582 || (unicode_attributes[ch].category[0] == 'N'
4583 && unicode_attributes[ch].category[1] == 'l')
4584 /* Avoid warnings for <U24B6>..<U24E9>. */
4585 || (unicode_attributes[ch].category[0] == 'S'
4586 && unicode_attributes[ch].category[1] == 'o'
4587 && strstr (unicode_attributes[ch].name, " LETTER ")
4589 /* Consider all the non-ASCII digits as alphabetic.
4590 ISO C 99 forbids us to have them in category "digit",
4591 but we want iswalnum to return true on them. */
4592 || (unicode_attributes[ch].category[0] == 'N'
4593 && unicode_attributes[ch].category[1] == 'd'
4594 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Digit class: exactly the ten ASCII decimal digits.
   SUSV2 gives us some freedom for the "digit" category, but ISO C 99 takes
   it away: iswdigit tests for any wide character that corresponds to a
   decimal-digit character (as defined in 5.2.1), i.e. one of
   the 10 decimal digits 0 1 2 3 4 5 6 7 8 9.
   (A disabled alternative accepted every assigned character of general
   category "Nd".  Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems
   without a zero; <0> would have to be added in front of them by hand.)  */
static bool
is_digit (unsigned int ch)
{
  /* Unsigned subtraction wraps for ch < 0x0030, so a single comparison
     covers both bounds.  */
  return ch - 0x0030 < 10;
}
/* Output digits: the ten ASCII decimal digits <U0030>..<U0039>.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
/* Alphanumeric class: the union of is_alpha and is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
4632 is_blank (unsigned int ch)
4634 return (ch == 0x0009 /* '\t' */
4635 /* Category Zs without mention of "<noBreak>" */
4636 || (unicode_attributes[ch].name != NULL
4637 && unicode_attributes[ch].category[0] == 'Z'
4638 && unicode_attributes[ch].category[1] == 's'
4639 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4643 is_space (unsigned int ch)
4645 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4646 should treat it like a punctuation character, not like a space. */
4647 return (ch == 0x0020 /* ' ' */
4648 || ch == 0x000C /* '\f' */
4649 || ch == 0x000A /* '\n' */
4650 || ch == 0x000D /* '\r' */
4651 || ch == 0x0009 /* '\t' */
4652 || ch == 0x000B /* '\v' */
4653 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4654 || (unicode_attributes[ch].name != NULL
4655 && unicode_attributes[ch].category[0] == 'Z'
4656 && (unicode_attributes[ch].category[1] == 'l'
4657 || unicode_attributes[ch].category[1] == 'p'
4658 || (unicode_attributes[ch].category[1] == 's'
4659 && !strstr (unicode_attributes[ch].decomposition,
4664 is_cntrl (unsigned int ch)
4666 return (unicode_attributes[ch].name != NULL
4667 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4668 /* Categories Zl and Zp */
4669 || (unicode_attributes[ch].category[0] == 'Z'
4670 && (unicode_attributes[ch].category[1] == 'l'
4671 || unicode_attributes[ch].category[1] == 'p'))));
/* Hexadecimal digit class: 0-9, A-F, a-f only.
   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 takes
   it away: iswxdigit tests for any wide character that corresponds to a
   hexadecimal-digit character (as defined in 6.4.4.1), i.e. one of
   0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F.
   (A disabled alternative used is_digit for the decimal part.)  */
static bool
is_xdigit (unsigned int ch)
{
  /* Unsigned subtraction wraps below each range start, so one comparison
     per range is enough.  */
  return (ch - 0x0030 < 10      /* '0'..'9' */
          || ch - 0x0041 < 6    /* 'A'..'F' */
          || ch - 0x0061 < 6);  /* 'a'..'f' */
}
/* Graphic class.  The visible conditions require CH to be assigned (to have
   a name) and not to be named "<control>"; strcmp is used as a truth value,
   i.e. non-zero means "different".
   NOTE(review): the last conjunct of the expression is not visible in this
   listing.  output_tables reports any character that is both space and
   graph, so the missing part presumably excludes is_space characters -
   confirm against the complete source. */
4697 is_graph (unsigned int ch)
4699 return (unicode_attributes[ch].name != NULL
4700 && strcmp (unicode_attributes[ch].name, "<control>")
4705 is_print (unsigned int ch)
4707 return (unicode_attributes[ch].name != NULL
4708 && strcmp (unicode_attributes[ch].name, "<control>")
4709 /* Categories Zl and Zp */
4710 && !(unicode_attributes[ch].name != NULL
4711 && unicode_attributes[ch].category[0] == 'Z'
4712 && (unicode_attributes[ch].category[1] == 'l'
4713 || unicode_attributes[ch].category[1] == 'p')));
/* Punctuation class.
   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.
   (A disabled alternative accepted every assigned character whose general
   category starts with 'P'.)  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  return !is_alpha (ch) && !is_digit (ch);
}
4729 /* Output all properties. */
/* VERSION is passed through to output_predicate.  For a property P, the
   PROPERTY macro below produces three outputs from the predicate is_P:
   unictype/ctype_P.txt (debug_output_predicate),
   ../tests/unictype/test-ctype_P.c (output_predicate_test) and
   unictype/ctype_P.h (output_predicate).
   NOTE(review): the list of PROPERTY(...) invocations is not visible in
   this listing. */
4731 output_old_ctype (const char *version)
4733 #define PROPERTY(P) \
4734 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4735 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4736 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4755 is_combining (unsigned int ch)
4757 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4758 file. In 3.0.1 it was identical to the union of the general categories
4759 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4760 PropList.txt file, so we take the latter definition. */
4761 return (unicode_attributes[ch].name != NULL
4762 && unicode_attributes[ch].category[0] == 'M'
4763 && (unicode_attributes[ch].category[1] == 'n'
4764 || unicode_attributes[ch].category[1] == 'c'
4765 || unicode_attributes[ch].category[1] == 'e'));
4769 is_combining_level3 (unsigned int ch)
4771 return is_combining (ch)
4772 && !(unicode_attributes[ch].combining[0] != '\0'
4773 && unicode_attributes[ch].combining[0] != '0'
4774 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four hex digits for I below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites, so
   copy it before calling ucs_symbol again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded formatting with literal format strings.  */
  if (i < 0x10000)
    snprintf (buf, sizeof buf, "<U%04X>", i);
  else
    snprintf (buf, sizeof buf, "<U%08X>", i);
  return buf;
}
/* Return the UCS symbol range string "LOW..HIGH" for an interval of Unicode
   characters.  The result lives in a static buffer that the next call
   overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];

  /* ucs_symbol returns a pointer to one shared static buffer, so each
     symbol must be copied out before the next one is requested.  */
  sprintf (buf, "%s..", ucs_symbol (low));
  strcat (buf, ucs_symbol (high));
  return buf;
}
4799 /* Output a character class (= property) table. */
/* Writes CLASSNAME to STREAM, followed by the code points below 0x110000
   for which FUNC returns true.  Results of FUNC are first cached in table;
   entries are formatted with ucs_symbol or ucs_symbol_range, separated by
   ";", and a "/" plus newline is written when appending the next entry
   would exceed max_column.
   NOTE(review): table is a 0x110000-byte array (about 1.1 MB) declared
   inside the function; if it has automatic storage this is a very large
   stack frame - consider static storage.
   NOTE(review): the lines that delimit each run of consecutive members are
   not visible in this listing. */
4802 output_charclass (FILE *stream, const char *classname,
4803 bool (*func) (unsigned int))
4805 char table[0x110000];
4807 bool need_semicolon;
4808 const int max_column = 75;
4811 for (i = 0; i < 0x110000; i++)
4812 table[i] = (int) func (i);
4814 fprintf (stream, "%s ", classname);
4815 need_semicolon = false;
4817 for (i = 0; i < 0x110000; )
4823 unsigned int low, high;
4829 while (i < 0x110000 && table[i]);
4833 strcpy (buf, ucs_symbol (low));
4835 strcpy (buf, ucs_symbol_range (low, high));
4839 fprintf (stream, ";");
4843 if (column + strlen (buf) > max_column)
4845 fprintf (stream, "/\n ");
4849 fprintf (stream, "%s", buf);
4850 column += strlen (buf);
4851 need_semicolon = true;
4854 fprintf (stream, "\n");
4857 /* Output a character mapping table. */
/* Writes MAPNAME to STREAM, followed by one entry for every code point I
   below 0x110000 with FUNC (I) != I.  An entry is built from ucs_symbol (i)
   and ucs_symbol (func (i)); entries are separated by ";", and a "/" plus
   newline is written when appending the next entry would exceed max_column.
   NOTE(review): table is a 0x110000-byte array (about 1.1 MB) declared
   inside the function; if it has automatic storage this is a very large
   stack frame - consider static storage, or testing func (i) != i directly
   in the output loop. */
4860 output_charmap (FILE *stream, const char *mapname,
4861 unsigned int (*func) (unsigned int))
4863 char table[0x110000];
4865 bool need_semicolon;
4866 const int max_column = 75;
4869 for (i = 0; i < 0x110000; i++)
4870 table[i] = (func (i) != i);
4872 fprintf (stream, "%s ", mapname);
4873 need_semicolon = false;
4875 for (i = 0; i < 0x110000; i++)
4881 strcat (buf, ucs_symbol (i));
4883 strcat (buf, ucs_symbol (func (i)));
4888 fprintf (stream, ";");
4892 if (column + strlen (buf) > max_column)
4894 fprintf (stream, "/\n ");
4898 fprintf (stream, "%s", buf);
4899 column += strlen (buf);
4900 need_semicolon = true;
4902 fprintf (stream, "\n");
/* Output the width table.  Currently writes nothing to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
4912 /* Output the tables to the given file. */
/* Writes a locale source to FILENAME in three steps: an LC_IDENTIFICATION
   section quoting VERSION and the current date, a verification pass over
   all code points below 0x110000 that reports inconsistencies between the
   character classes (the messages with a visible destination go to
   stderr), and an LC_CTYPE section with the character classes and case
   maps. */
4915 output_tables (const char *filename, const char *version)
4920 stream = fopen (filename, "w");
4923 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4927 fprintf (stream, "escape_char /\n");
4928 fprintf (stream, "comment_char %%\n");
4929 fprintf (stream, "\n");
4930 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4932 fprintf (stream, "\n");
4934 fprintf (stream, "LC_IDENTIFICATION\n");
4935 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4936 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4937 fprintf (stream, "address \"\"\n");
4938 fprintf (stream, "contact \"\"\n");
4939 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4940 fprintf (stream, "tel \"\"\n");
4941 fprintf (stream, "fax \"\"\n");
4942 fprintf (stream, "language \"\"\n");
4943 fprintf (stream, "territory \"Earth\"\n");
4944 fprintf (stream, "revision \"%s\"\n", version);
/* The date is formatted as YYYY-MM-DD from gmtime.
   NOTE(review): the gmtime result is passed to strftime without a NULL
   check. */
4949 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4950 fprintf (stream, "date \"%s\"\n", date);
4952 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4953 fprintf (stream, "END LC_IDENTIFICATION\n");
4954 fprintf (stream, "\n");
4956 /* Verifications. */
4957 for (ch = 0; ch < 0x110000; ch++)
4959 /* toupper restriction: "Only characters specified for the keywords
4960 lower and upper shall be specified." */
4961 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4963 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4964 ucs_symbol (ch), ch, to_upper (ch));
4966 /* tolower restriction: "Only characters specified for the keywords
4967 lower and upper shall be specified." */
4968 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4970 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4971 ucs_symbol (ch), ch, to_lower (ch));
4973 /* alpha restriction: "Characters classified as either upper or lower
4974 shall automatically belong to this class." */
4975 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4976 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4978 /* alpha restriction: "No character specified for the keywords cntrl,
4979 digit, punct or space shall be specified." */
4980 if (is_alpha (ch) && is_cntrl (ch))
4981 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4982 if (is_alpha (ch) && is_digit (ch))
4983 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4984 if (is_alpha (ch) && is_punct (ch))
4985 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4986 if (is_alpha (ch) && is_space (ch))
4987 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4989 /* space restriction: "No character specified for the keywords upper,
4990 lower, alpha, digit, graph or xdigit shall be specified."
4991 upper, lower, alpha already checked above. */
4992 if (is_space (ch) && is_digit (ch))
4993 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4994 if (is_space (ch) && is_graph (ch))
4995 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4996 if (is_space (ch) && is_xdigit (ch))
4997 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4999 /* cntrl restriction: "No character specified for the keywords upper,
5000 lower, alpha, digit, punct, graph, print or xdigit shall be
5001 specified." upper, lower, alpha already checked above. */
5002 if (is_cntrl (ch) && is_digit (ch))
5003 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_punct (ch))
5005 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5006 if (is_cntrl (ch) && is_graph (ch))
5007 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5008 if (is_cntrl (ch) && is_print (ch))
5009 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5010 if (is_cntrl (ch) && is_xdigit (ch))
5011 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5013 /* punct restriction: "No character specified for the keywords upper,
5014 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5015 be specified." upper, lower, alpha, cntrl already checked above. */
5016 if (is_punct (ch) && is_digit (ch))
5017 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5018 if (is_punct (ch) && is_xdigit (ch))
5019 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5020 if (is_punct (ch) && (ch == 0x0020))
5021 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5023 /* graph restriction: "No character specified for the keyword cntrl
5024 shall be specified." Already checked above. */
5026 /* print restriction: "No character specified for the keyword cntrl
5027 shall be specified." Already checked above. */
5029 /* graph - print relation: differ only in the <space> character.
5030 How is this possible if there are more than one space character?!
5031 I think susv2/xbd/locale.html should speak of "space characters",
5032 not "space character". */
5033 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5035 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5036 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5038 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The classes and maps are written in this fixed order. */
5041 fprintf (stream, "LC_CTYPE\n");
5042 output_charclass (stream, "upper", is_upper);
5043 output_charclass (stream, "lower", is_lower);
5044 output_charclass (stream, "alpha", is_alpha);
5045 output_charclass (stream, "digit", is_digit);
5046 output_charclass (stream, "outdigit", is_outdigit);
5047 output_charclass (stream, "blank", is_blank);
5048 output_charclass (stream, "space", is_space);
5049 output_charclass (stream, "cntrl", is_cntrl);
5050 output_charclass (stream, "punct", is_punct);
5051 output_charclass (stream, "xdigit", is_xdigit);
5052 output_charclass (stream, "graph", is_graph);
5053 output_charclass (stream, "print", is_print);
5054 output_charclass (stream, "class \"combining\";", is_combining);
5055 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5056 output_charmap (stream, "toupper", to_upper);
5057 output_charmap (stream, "tolower", to_lower);
5058 output_charmap (stream, "map \"totitle\";", to_title);
5059 output_widthmap (stream);
5060 fprintf (stream, "END LC_CTYPE\n");
5062 if (ferror (stream) || fclose (stream))
5064 fprintf (stderr, "error writing to '%s'\n", filename);
5071 /* ========================================================================= */
/* East Asian width property of every code point, as read from the
   EastAsianWidth.txt file.  An entry is NULL for an unassigned character,
   otherwise one of the strings "N", "A", "H", "W", "F", "Na".  */
const char *unicode_width[0x110000];
5077 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
/* WIDTH_FILENAME is the path of that file.  Every entry is first set to
   "N" for assigned characters (those with a name) and to NULL otherwise.
   Each data line is then split into three fields; field0 holds a
   hexadecimal code point or a range written "first..last", and a copy of
   field1 is stored as the width.
   NOTE(review): the strdup results are stored without a NULL check.
   NOTE(review): no visible line checks the parsed code points against the
   0x110000 array bound - confirm the input is trusted.
   NOTE(review): the line loop and the loop over a range are not visible in
   this listing. */
5080 fill_width (const char *width_filename)
5084 char field0[FIELDLEN];
5085 char field1[FIELDLEN];
5086 char field2[FIELDLEN];
5089 for (i = 0; i < 0x110000; i++)
5090 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5092 stream = fopen (width_filename, "r");
5095 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Consume the remainder of the current line. */
5110 do c = getc (stream); while (c != EOF && c != '\n');
5114 n = getfield (stream, field0, ';');
5115 n += getfield (stream, field1, ' ');
5116 n += getfield (stream, field2, '\n');
5121 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5124 i = strtoul (field0, NULL, 16);
5125 if (strstr (field0, "..") != NULL)
5127 /* Deal with a range. */
5128 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5130 unicode_width[i] = strdup (field1);
5134 /* Single character line. */
5135 unicode_width[i] = strdup (field1);
5138 if (ferror (stream) || fclose (stream))
5140 fprintf (stderr, "error reading from '%s'\n", width_filename);
5145 /* ========================================================================= */
5147 /* Line breaking classification. */
/* Line breaking classes, listed in numeric order.
   Not represented here: CR (carriage return) and LF (line feed), because
   they are DOSisms; NL (next line), because it is equivalent to LBP_BK;
   SG (surrogates), because they are not characters.  */
enum
{
  LBP_WJ =  0, /* word joiner */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */

  /* Values >= 24 are resolved at run time. */
  LBP_BK = 24, /* mandatory break */
  LBP_CM = 25, /* attached characters and combining marks */
  LBP_ZW = 26, /* zero width space */
  LBP_SP = 27, /* space */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5190 /* Returns the line breaking classification for ch, as a bit mask. */
/* Bit (1 << LBP_xx) is set for each class that applies to ch.  The clauses
   below run in order and several of them test bits set by earlier clauses,
   so their order matters.  output_lbp() expects the final mask to contain
   exactly one bit.  */
5192 get_lbp (unsigned int ch)
5196 if (unicode_attributes[ch].name != NULL)
5198 /* mandatory break */
5199 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5200 || ch == 0x000C /* form feed */
5201 || ch == 0x000B /* line tabulation */
5202 || ch == 0x2028 /* LINE SEPARATOR */
5203 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5204 attr |= 1 << LBP_BK;
/* word joiner */
5206 if (ch == 0x2060 /* WORD JOINER */
5207 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5208 attr |= 1 << LBP_WJ;
5210 /* zero width space */
5211 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5212 attr |= 1 << LBP_ZW;
5214 /* non-breaking (glue) */
5215 if (ch == 0x00A0 /* NO-BREAK SPACE */
5216 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5217 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5218 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5219 || ch == 0x2007 /* FIGURE SPACE */
5220 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5221 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5222 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5223 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5224 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5225 attr |= 1 << LBP_GL;
/* space */
5228 if (ch == 0x0020 /* SPACE */)
5229 attr |= 1 << LBP_SP;
5231 /* break opportunity before and after */
5232 if (ch == 0x2014 /* EM DASH */)
5233 attr |= 1 << LBP_B2;
5235 /* break opportunity after */
5236 if (ch == 0x1680 /* OGHAM SPACE MARK */
5237 || ch == 0x2000 /* EN QUAD */
5238 || ch == 0x2001 /* EM QUAD */
5239 || ch == 0x2002 /* EN SPACE */
5240 || ch == 0x2003 /* EM SPACE */
5241 || ch == 0x2004 /* THREE-PER-EM SPACE */
5242 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5243 || ch == 0x2006 /* SIX-PER-EM SPACE */
5244 || ch == 0x2008 /* PUNCTUATION SPACE */
5245 || ch == 0x2009 /* THIN SPACE */
5246 || ch == 0x200A /* HAIR SPACE */
5247 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5248 || ch == 0x0009 /* tab */
5249 || ch == 0x00AD /* SOFT HYPHEN */
5250 || ch == 0x058A /* ARMENIAN HYPHEN */
5251 || ch == 0x2010 /* HYPHEN */
5252 || ch == 0x2012 /* FIGURE DASH */
5253 || ch == 0x2013 /* EN DASH */
5254 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5255 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5256 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5257 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5258 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5259 || ch == 0x2027 /* HYPHENATION POINT */
5260 || ch == 0x007C /* VERTICAL LINE */
5261 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5262 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5263 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5264 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5265 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5266 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5267 || ch == 0x205A /* TWO DOT PUNCTUATION */
5268 || ch == 0x205B /* FOUR DOT MARK */
5269 || ch == 0x205D /* TRICOLON */
5270 || ch == 0x205E /* VERTICAL FOUR DOTS */
5271 || ch == 0x2E19 /* PALM BRANCH */
5272 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5273 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5274 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5275 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5276 || ch == 0x2E30 /* RING POINT */
5277 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5278 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5279 || ch == 0x10102 /* AEGEAN CHECK MARK */
5280 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5281 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5282 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5283 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5284 || ch == 0x0964 /* DEVANAGARI DANDA */
5285 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5286 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5287 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5288 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5289 || ch == 0x104B /* MYANMAR SIGN SECTION */
5290 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5291 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5292 || ch == 0x17D4 /* KHMER SIGN KHAN */
5293 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5294 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5295 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5296 || ch == 0xA8CE /* SAURASHTRA DANDA */
5297 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5298 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5299 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5300 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5301 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5302 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5303 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5304 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5305 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5306 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5307 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5308 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5309 || ch == 0x1804 /* MONGOLIAN COLON */
5310 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5311 || ch == 0x1B5A /* BALINESE PANTI */
5312 || ch == 0x1B5B /* BALINESE PAMADA */
5313 || ch == 0x1B5C /* BALINESE WINDU */
5314 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5315 || ch == 0x1B60 /* BALINESE PAMENENG */
5316 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5317 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5318 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5319 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5320 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5321 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5322 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5323 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5324 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5325 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5326 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5327 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5328 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5329 || ch == 0xA60D /* VAI COMMA */
5330 || ch == 0xA60F /* VAI QUESTION MARK */
5331 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5332 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5333 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5334 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5335 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5336 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5337 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5338 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5339 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5340 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5341 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5342 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5343 attr |= 1 << LBP_BA;
5345 /* break opportunity before */
5346 if (ch == 0x00B4 /* ACUTE ACCENT */
5347 || ch == 0x1FFD /* GREEK OXIA */
5348 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5349 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5350 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5351 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5352 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5353 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5354 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5355 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5356 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5357 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5358 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5359 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5360 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5361 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5362 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5363 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5364 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5365 attr |= 1 << LBP_BB;
/* hyphen */
5368 if (ch == 0x002D /* HYPHEN-MINUS */)
5369 attr |= 1 << LBP_HY;
5371 /* contingent break opportunity */
5372 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5373 attr |= 1 << LBP_CB;
5375 /* closing punctuation */
5376 if ((unicode_attributes[ch].category[0] == 'P'
5377 && unicode_attributes[ch].category[1] == 'e')
5378 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5379 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5380 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5381 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5382 || ch == 0xFE50 /* SMALL COMMA */
5383 || ch == 0xFE52 /* SMALL FULL STOP */
5384 || ch == 0xFF0C /* FULLWIDTH COMMA */
5385 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5386 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5387 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5388 attr |= 1 << LBP_CL;
5390 /* exclamation/interrogation */
5391 if (ch == 0x0021 /* EXCLAMATION MARK */
5392 || ch == 0x003F /* QUESTION MARK */
5393 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5394 || ch == 0x061B /* ARABIC SEMICOLON */
5395 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5396 || ch == 0x061F /* ARABIC QUESTION MARK */
5397 || ch == 0x06D4 /* ARABIC FULL STOP */
5398 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5399 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5400 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5401 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5402 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5403 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5404 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5405 || ch == 0x1802 /* MONGOLIAN COMMA */
5406 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5407 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5408 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5409 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5410 || ch == 0x1945 /* LIMBU QUESTION MARK */
5411 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5412 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5413 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5414 || ch == 0x2CFE /* COPTIC FULL STOP */
5415 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5417 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5419 || ch == 0xA60E /* VAI FULL STOP */
5420 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5421 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5422 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5423 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5424 || ch == 0xFE56 /* SMALL QUESTION MARK */
5425 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5426 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5427 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5428 attr |= 1 << LBP_EX;
/* inseparable */
5431 if (ch == 0x2024 /* ONE DOT LEADER */
5432 || ch == 0x2025 /* TWO DOT LEADER */
5433 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5434 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5435 attr |= 1 << LBP_IN;
/* non starter */
5438 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5439 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5440 || ch == 0x203D /* INTERROBANG */
5441 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5442 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5443 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5444 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5445 || ch == 0x301C /* WAVE DASH */
5446 || ch == 0x303C /* MASU MARK */
5447 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5448 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5449 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5450 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5451 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5452 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5453 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5454 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5455 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5456 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5457 || ch == 0xA015 /* YI SYLLABLE WU */
5458 || ch == 0xFE54 /* SMALL SEMICOLON */
5459 || ch == 0xFE55 /* SMALL COLON */
5460 || ch == 0xFF1A /* FULLWIDTH COLON */
5461 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5462 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5463 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5464 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5465 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5466 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5467 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5468 attr |= 1 << LBP_NS;
5470 /* opening punctuation */
5471 if ((unicode_attributes[ch].category[0] == 'P'
5472 && unicode_attributes[ch].category[1] == 's')
5473 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5474 || ch == 0x00BF /* INVERTED QUESTION MARK */
5475 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5476 attr |= 1 << LBP_OP;
5478 /* ambiguous quotation */
5479 if ((unicode_attributes[ch].category[0] == 'P'
5480 && (unicode_attributes[ch].category[1] == 'f'
5481 || unicode_attributes[ch].category[1] == 'i'))
5482 || ch == 0x0022 /* QUOTATION MARK */
5483 || ch == 0x0027 /* APOSTROPHE */
5484 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5485 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5486 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5487 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5488 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5489 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5490 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5491 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5492 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5493 || ch == 0x2E0B /* RAISED SQUARE */)
5494 attr |= 1 << LBP_QU;
5496 /* infix separator (numeric) */
5497 if (ch == 0x002C /* COMMA */
5498 || ch == 0x002E /* FULL STOP */
5499 || ch == 0x003A /* COLON */
5500 || ch == 0x003B /* SEMICOLON */
5501 || ch == 0x037E /* GREEK QUESTION MARK */
5502 || ch == 0x0589 /* ARMENIAN FULL STOP */
5503 || ch == 0x060C /* ARABIC COMMA */
5504 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5505 || ch == 0x07F8 /* NKO COMMA */
5506 || ch == 0x2044 /* FRACTION SLASH */
5507 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5508 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5509 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5510 attr |= 1 << LBP_IS;
/* numeric.  Decimal digits whose name contains "FULLWIDTH" are excluded
   here; 0xFF10 .. 0xFF19 are listed in the ideographic clause below.  */
5513 if ((unicode_attributes[ch].category[0] == 'N'
5514 && unicode_attributes[ch].category[1] == 'd'
5515 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5516 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5517 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5518 attr |= 1 << LBP_NU;
5520 /* postfix (numeric) */
5521 if (ch == 0x0025 /* PERCENT SIGN */
5522 || ch == 0x00A2 /* CENT SIGN */
5523 || ch == 0x00B0 /* DEGREE SIGN */
5524 || ch == 0x060B /* AFGHANI SIGN */
5525 || ch == 0x066A /* ARABIC PERCENT SIGN */
5526 || ch == 0x2030 /* PER MILLE SIGN */
5527 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5528 || ch == 0x2032 /* PRIME */
5529 || ch == 0x2033 /* DOUBLE PRIME */
5530 || ch == 0x2034 /* TRIPLE PRIME */
5531 || ch == 0x2035 /* REVERSED PRIME */
5532 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5533 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5534 || ch == 0x20A7 /* PESETA SIGN */
5535 || ch == 0x2103 /* DEGREE CELSIUS */
5536 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5537 || ch == 0xFDFC /* RIAL SIGN */
5538 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5539 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5540 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5541 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5542 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5543 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5544 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5545 attr |= 1 << LBP_PO;
5547 /* prefix (numeric) */
5548 if ((unicode_attributes[ch].category[0] == 'S'
5549 && unicode_attributes[ch].category[1] == 'c')
5550 || ch == 0x002B /* PLUS SIGN */
5551 || ch == 0x005C /* REVERSE SOLIDUS */
5552 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5553 || ch == 0x2116 /* NUMERO SIGN */
5554 || ch == 0x2212 /* MINUS SIGN */
5555 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* A code point already classified as postfix (PO) above is not also marked
   as prefix (PR); presumably this is for Sc characters that are listed
   explicitly in the postfix clause.  */
5556 if (!(attr & (1 << LBP_PO)))
5557 attr |= 1 << LBP_PR;
5559 /* symbols allowing breaks */
5560 if (ch == 0x002F /* SOLIDUS */)
5561 attr |= 1 << LBP_SY;
/* Hangul LV syllable: every 28th code point of 0xAC00 .. 0xD7A3 */
5563 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5564 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: the remaining code points of that range */
5566 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5567 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5569 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5570 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5572 if (ch >= 0x1160 && ch <= 0x11A2)
5573 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5575 if (ch >= 0x11A8 && ch <= 0x11F9)
5576 attr |= 1 << LBP_JT;
5578 /* complex context (South East Asian) */
/* Two conditions must hold: the category (or explicit code point) test,
   and membership in one of the four code point ranges at the end.  */
5579 if (((unicode_attributes[ch].category[0] == 'C'
5580 && unicode_attributes[ch].category[1] == 'f')
5581 || (unicode_attributes[ch].category[0] == 'L'
5582 && (unicode_attributes[ch].category[1] == 'm'
5583 || unicode_attributes[ch].category[1] == 'o'))
5584 || (unicode_attributes[ch].category[0] == 'M'
5585 && (unicode_attributes[ch].category[1] == 'c'
5586 || unicode_attributes[ch].category[1] == 'n'))
5587 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5588 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5589 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5590 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5591 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5592 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5593 || (ch >= 0x1000 && ch <= 0x109F)
5594 || (ch >= 0x1780 && ch <= 0x17FF)
5595 || (ch >= 0x1950 && ch <= 0x19DF)))
5596 attr |= 1 << LBP_SA;
5598 /* attached characters and combining marks */
5599 if ((unicode_attributes[ch].category[0] == 'M'
5600 && (unicode_attributes[ch].category[1] == 'c'
5601 || unicode_attributes[ch].category[1] == 'e'
5602 || unicode_attributes[ch].category[1] == 'n'))
5603 || (unicode_attributes[ch].category[0] == 'C'
5604 && (unicode_attributes[ch].category[1] == 'c'
5605 || unicode_attributes[ch].category[1] == 'f')))
/* CM is only assigned when none of BK, BA, GL, SA, WJ, ZW was set above.  */
5606 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5607 attr |= 1 << LBP_CM;
/* ideographic */
5610 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5611 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5612 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5613 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5614 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5615 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5616 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5617 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5618 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5619 || ch == 0xFE62 /* SMALL PLUS SIGN */
5620 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5621 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5622 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5623 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5624 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5625 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5626 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5627 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5628 || (ch >= 0x3000 && ch <= 0x33FF
5629 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5630 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5631 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5632 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5633 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5634 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5635 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5636 || ch == 0xFE45 /* SESAME DOT */
5637 || ch == 0xFE46 /* WHITE SESAME DOT */
5638 || ch == 0xFE49 /* DASHED OVERLINE */
5639 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5640 || ch == 0xFE4B /* WAVY OVERLINE */
5641 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5642 || ch == 0xFE4D /* DASHED LOW LINE */
5643 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5644 || ch == 0xFE4F /* WAVY LOW LINE */
5645 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5646 || ch == 0xFE58 /* SMALL EM DASH */
5647 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5648 || ch == 0xFE60 /* SMALL AMPERSAND */
5649 || ch == 0xFE61 /* SMALL ASTERISK */
5650 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5651 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5652 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5653 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5654 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5655 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5656 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5657 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5658 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5659 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5660 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5661 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5662 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5663 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5664 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5665 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5666 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5667 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5668 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5669 || ch == 0xFF5E /* FULLWIDTH TILDE */
5670 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5671 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5672 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* NS and CM, when already set, take precedence over ID/AI.  */
5673 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5675 /* ambiguous (ideograph) ? */
5676 if ((unicode_width[ch] != NULL
5677 && unicode_width[ch][0] == 'A'
5679 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5680 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5681 attr |= 1 << LBP_AI;
5683 attr |= 1 << LBP_ID;
5686 /* ordinary alphabetic and symbol characters */
5687 if ((unicode_attributes[ch].category[0] == 'L'
5688 && (unicode_attributes[ch].category[1] == 'u'
5689 || unicode_attributes[ch].category[1] == 'l'
5690 || unicode_attributes[ch].category[1] == 't'
5691 || unicode_attributes[ch].category[1] == 'm'
5692 || unicode_attributes[ch].category[1] == 'o'))
5693 || (unicode_attributes[ch].category[0] == 'S'
5694 && (unicode_attributes[ch].category[1] == 'm'
5695 || unicode_attributes[ch].category[1] == 'k'
5696 || unicode_attributes[ch].category[1] == 'o'))
5697 || (unicode_attributes[ch].category[0] == 'N'
5698 && (unicode_attributes[ch].category[1] == 'l'
5699 || unicode_attributes[ch].category[1] == 'o'))
5700 || (unicode_attributes[ch].category[0] == 'P'
5701 && (unicode_attributes[ch].category[1] == 'c'
5702 || unicode_attributes[ch].category[1] == 'd'
5703 || unicode_attributes[ch].category[1] == 'o'))
5704 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5705 || ch == 0x0601 /* ARABIC SIGN SANAH */
5706 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5707 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5708 || ch == 0x06DD /* ARABIC END OF AYAH */
5709 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5710 || ch == 0x2061 /* FUNCTION APPLICATION */
5711 || ch == 0x2062 /* INVISIBLE TIMES */
5712 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5713 || ch == 0x2064 /* INVISIBLE PLUS */)
/* AL/AI acts as a fallback: it is considered only when none of the more
   specific classes named in the mask below has been assigned.  */
5714 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5716 /* ambiguous (alphabetic) ? */
5717 if ((unicode_width[ch] != NULL
5718 && unicode_width[ch][0] == 'A'
5720 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5721 && ch != 0x2022 /* BULLET */
5722 && ch != 0x203E /* OVERLINE */
5723 && ch != 0x2126 /* OHM SIGN */
5724 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5725 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5726 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5727 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5728 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5729 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5730 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5731 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5733 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5734 || ch == 0x00A7 /* SECTION SIGN */
5735 || ch == 0x00A8 /* DIAERESIS */
5736 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5737 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5738 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5739 || ch == 0x00B6 /* PILCROW SIGN */
5740 || ch == 0x00B7 /* MIDDLE DOT */
5741 || ch == 0x00B8 /* CEDILLA */
5742 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5743 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5744 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5745 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5746 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5747 || ch == 0x00BF /* INVERTED QUESTION MARK */
5748 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5749 || ch == 0x00F7 /* DIVISION SIGN */
5750 || ch == 0x02C7 /* CARON */
5751 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5752 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5753 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5754 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5755 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5756 || ch == 0x02D8 /* BREVE */
5757 || ch == 0x02D9 /* DOT ABOVE */
5758 || ch == 0x02DA /* RING ABOVE */
5759 || ch == 0x02DB /* OGONEK */
5760 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5762 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5763 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5764 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5765 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5766 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5767 || ch == 0x2616 /* WHITE SHOGI PIECE */
5768 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5769 attr |= 1 << LBP_AI;
5771 attr |= 1 << LBP_AL;
/* Drop a CM bit that an earlier clause may have set.  */
5772 attr &= ~(1 << LBP_CM);
/* unknown.  NOTE(review): LBP_XX is 31, so 1 << LBP_XX shifts into the sign
   bit when int is 32 bits wide (undefined behavior for a signed left shift
   in ISO C); an unsigned constant such as 1u would avoid this.  */
5778 attr |= 1 << LBP_XX;
5783 /* Output the line breaking properties in a human readable format. */
/* For each code point whose get_lbp() mask differs from the lone XX bit,
   prints the code point as 0x%04X to STREAM, followed by the enumerator
   name of every bit that is set in the mask.  */
5785 debug_output_lbp (FILE *stream)
5789 for (i = 0; i < 0x110000; i++)
5791 int attr = get_lbp (i);
5792 if (attr != 1 << LBP_XX)
5794 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints a space and the stringified enumerator name (#bit) when
   that bit is set in attr.  */
5795 #define PRINT_BIT(attr,bit) \
5796 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5797 PRINT_BIT(attr,LBP_BK);
5798 PRINT_BIT(attr,LBP_CM);
5799 PRINT_BIT(attr,LBP_WJ);
5800 PRINT_BIT(attr,LBP_ZW);
5801 PRINT_BIT(attr,LBP_GL);
5802 PRINT_BIT(attr,LBP_SP);
5803 PRINT_BIT(attr,LBP_B2);
5804 PRINT_BIT(attr,LBP_BA);
5805 PRINT_BIT(attr,LBP_BB);
5806 PRINT_BIT(attr,LBP_HY);
5807 PRINT_BIT(attr,LBP_CB);
5808 PRINT_BIT(attr,LBP_CL);
5809 PRINT_BIT(attr,LBP_EX);
5810 PRINT_BIT(attr,LBP_IN);
5811 PRINT_BIT(attr,LBP_NS);
5812 PRINT_BIT(attr,LBP_OP);
5813 PRINT_BIT(attr,LBP_QU);
5814 PRINT_BIT(attr,LBP_IS);
5815 PRINT_BIT(attr,LBP_NU);
5816 PRINT_BIT(attr,LBP_PO);
5817 PRINT_BIT(attr,LBP_PR);
5818 PRINT_BIT(attr,LBP_SY);
5819 PRINT_BIT(attr,LBP_AI);
5820 PRINT_BIT(attr,LBP_AL);
5821 PRINT_BIT(attr,LBP_H2);
5822 PRINT_BIT(attr,LBP_H3);
5823 PRINT_BIT(attr,LBP_ID);
5824 PRINT_BIT(attr,LBP_JL);
5825 PRINT_BIT(attr,LBP_JV);
5826 PRINT_BIT(attr,LBP_JT);
5827 PRINT_BIT(attr,LBP_SA);
5828 PRINT_BIT(attr,LBP_XX);
5830 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_lbp() to the file FILENAME,
   which is opened for writing.  Failure to open the file and failure to
   write or close it are each reported on stderr.  */
5836 debug_output_lbrk_tables (const char *filename)
5840 stream = fopen (filename, "w");
5843 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5847 debug_output_lbp (stream);
/* fclose() is checked as well, since it can fail when buffered output is
   flushed.  */
5849 if (ferror (stream) || fclose (stream))
5851 fprintf (stderr, "error writing to '%s'\n", filename);
5856 /* The line breaking property from the LineBreak.txt file. */
/* Indexed by code point.  Unlike the result of get_lbp(), each entry holds a
   single LBP_xx enumerator value, not a bit mask; fill_org_lbp() initializes
   every entry to LBP_XX.  */
5857 int unicode_org_lbp[0x110000];
5859 /* Stores in unicode_org_lbp[] the line breaking property from the
5860 LineBreak.txt file. */
/* Every entry starts out as LBP_XX.  On each data line the first field is a
   hexadecimal code point or a "first..last" range and the second field is
   the property name, which is mapped to an LBP_xx enumerator.  Failure to
   open the file, a short line, an unknown property name and a read error
   are each reported on stderr.  */
5862 fill_org_lbp (const char *linebreak_filename)
5866 char field0[FIELDLEN];
5867 char field1[FIELDLEN];
5868 char field2[FIELDLEN];
5871 for (i = 0; i < 0x110000; i++)
5872 unicode_org_lbp[i] = LBP_XX;
5874 stream = fopen (linebreak_filename, "r");
5877 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
5893 do c = getc (stream); while (c != EOF && c != '\n');
5897 n = getfield (stream, field0, ';');
5898 n += getfield (stream, field1, ' ');
5899 n += getfield (stream, field2, '\n');
5904 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit + 4 skips the first four characters of the stringified enumerator
   name, i.e. the "LBP_" prefix, so TRY(LBP_xx) compares field1 with "xx".  */
5908 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property names without an enumerator of their own: LF, CR and NL are
   folded into LBP_BK, and SG into LBP_XX.  */
5943 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5944 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5945 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5946 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5949 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5950 field1, linebreak_filename, lineno);
5953 i = strtoul (field0, NULL, 16);
5954 if (strstr (field0, "..") != NULL)
5956 /* Deal with a range. */
5957 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5959 unicode_org_lbp[i] = value;
5963 /* Single character line. */
5964 unicode_org_lbp[i] = value;
5967 if (ferror (stream) || fclose (stream))
5969 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5974 /* Output the line breaking properties in a human readable format. */
/* Prints entries of unicode_org_lbp[] to STREAM: the code point as 0x%04X
   followed by the name of the enumerator stored for it.  */
5976 debug_output_org_lbp (FILE *stream)
5980 for (i = 0; i < 0x110000; i++)
5982 int attr = unicode_org_lbp[i];
5985 fprintf (stream, "0x%04X", i);
/* unicode_org_lbp[] holds one enumerator value per code point, so PRINT_BIT
   is an equality test here, not a bit test.  */
5986 #define PRINT_BIT(attr,bit) \
5987 if (attr == bit) fprintf (stream, " " #bit);
5988 PRINT_BIT(attr,LBP_BK);
5989 PRINT_BIT(attr,LBP_CM);
5990 PRINT_BIT(attr,LBP_WJ);
5991 PRINT_BIT(attr,LBP_ZW);
5992 PRINT_BIT(attr,LBP_GL);
5993 PRINT_BIT(attr,LBP_SP);
5994 PRINT_BIT(attr,LBP_B2);
5995 PRINT_BIT(attr,LBP_BA);
5996 PRINT_BIT(attr,LBP_BB);
5997 PRINT_BIT(attr,LBP_HY);
5998 PRINT_BIT(attr,LBP_CB);
5999 PRINT_BIT(attr,LBP_CL);
6000 PRINT_BIT(attr,LBP_EX);
6001 PRINT_BIT(attr,LBP_IN);
6002 PRINT_BIT(attr,LBP_NS);
6003 PRINT_BIT(attr,LBP_OP);
6004 PRINT_BIT(attr,LBP_QU);
6005 PRINT_BIT(attr,LBP_IS);
6006 PRINT_BIT(attr,LBP_NU);
6007 PRINT_BIT(attr,LBP_PO);
6008 PRINT_BIT(attr,LBP_PR);
6009 PRINT_BIT(attr,LBP_SY);
6010 PRINT_BIT(attr,LBP_AI);
6011 PRINT_BIT(attr,LBP_AL);
6012 PRINT_BIT(attr,LBP_H2);
6013 PRINT_BIT(attr,LBP_H3);
6014 PRINT_BIT(attr,LBP_ID);
6015 PRINT_BIT(attr,LBP_JL);
6016 PRINT_BIT(attr,LBP_JV);
6017 PRINT_BIT(attr,LBP_JT);
6018 PRINT_BIT(attr,LBP_SA);
6019 PRINT_BIT(attr,LBP_XX);
6021 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_org_lbp to the file FILENAME.
   The file is opened for writing; a message goes to stderr when it cannot
   be opened or when the stream reports an error or fails to close. */
6027 debug_output_org_lbrk_tables (const char *filename)
6031 stream = fopen (filename, "w");
6034 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6038 debug_output_org_lbp (stream);
/* fclose is checked too, so a failure while flushing is reported. */
6040 if (ferror (stream) || fclose (stream))
6042 fprintf (stderr, "error writing to '%s'\n", filename);
6047 /* Construction of sparse 3-level tables. */
/* Parameters for the lbp_table type used by output_lbp: elements are
   unsigned char, the default element is LBP_XX, and xmalloc/xrealloc are
   mapped to plain malloc/realloc.
   NOTE(review): presumably consumed by a table template included right
   after these definitions; that include is not visible in this listing. */
6048 #define TABLE lbp_table
6049 #define ELEMENT unsigned char
6050 #define DEFAULT LBP_XX
6051 #define xmalloc malloc
6052 #define xrealloc realloc
/* Builds a 3-level lookup table of line breaking properties for all code
   points and prints it as C source.  STREAM1 receives the
   lbrkprop_header_N macros, the lbrkprop_t type and the extern declaration
   of unilbrkprop; STREAM2 receives the initialized unilbrkprop object. */
6056 output_lbp (FILE *stream1, FILE *stream2)
6060 unsigned int level1_offset, level2_offset, level3_offset;
6064 lbp_table_init (&t);
/* get_lbp returns a bit mask.  Code points whose mask is exactly
   1 << LBP_XX are left out of the table; for the others the index of the
   set bit is what gets stored. */
6066 for (i = 0; i < 0x110000; i++)
6068 int attr = get_lbp (i);
6070 /* Now attr should contain exactly one bit. */
6071 if (attr == 0 || ((attr & (attr - 1)) != 0))
6074 if (attr != 1 << LBP_XX)
6076 unsigned int log2_attr;
6077 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6079 lbp_table_add (&t, i, log2_attr);
6083 lbp_table_finalize (&t);
/* The three sums below are byte offsets into t.result: past a header of
   five uint32_t words, past level 1, and past level 2.
   NOTE(review): presumably assigned to level1_offset, level2_offset and
   level3_offset; the assignment lines are missing from this listing. */
6086 5 * sizeof (uint32_t);
6088 5 * sizeof (uint32_t)
6089 + t.level1_size * sizeof (uint32_t);
6091 5 * sizeof (uint32_t)
6092 + t.level1_size * sizeof (uint32_t)
6093 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words of t.result as macros. */
6095 for (i = 0; i < 5; i++)
6096 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6097 ((uint32_t *) t.result)[i]);
6098 fprintf (stream1, "\n");
6099 fprintf (stream1, "typedef struct\n");
6100 fprintf (stream1, " {\n");
6101 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6102 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6103 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6104 fprintf (stream1, " }\n");
6105 fprintf (stream1, "lbrkprop_t;\n");
6106 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6108 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6109 fprintf (stream2, "{\n");
6110 fprintf (stream2, " {");
/* Level 1, eight entries per output line.  An entry is printed either as
   -1 or as an index into level2, i.e. the byte offset rebased to
   level2_offset and divided by the entry size (the condition selecting
   between the two is on a line not shown here). */
6111 if (t.level1_size > 8)
6112 fprintf (stream2, "\n ");
6113 for (i = 0; i < t.level1_size; i++)
6116 if (i > 0 && (i % 8) == 0)
6117 fprintf (stream2, "\n ");
6118 offset = ((uint32_t *) (t.result + level1_offset))[i];
6120 fprintf (stream2, " %5d", -1);
6122 fprintf (stream2, " %5zu",
6123 (offset - level2_offset) / sizeof (uint32_t));
6124 if (i+1 < t.level1_size)
6125 fprintf (stream2, ",");
6127 if (t.level1_size > 8)
6128 fprintf (stream2, "\n ");
6129 fprintf (stream2, " },\n");
6130 fprintf (stream2, " {");
/* Level 2, same layout; indices are relative to level3_offset and count
   unsigned char elements. */
6131 if (t.level2_size << t.q > 8)
6132 fprintf (stream2, "\n ");
6133 for (i = 0; i < t.level2_size << t.q; i++)
6136 if (i > 0 && (i % 8) == 0)
6137 fprintf (stream2, "\n ");
6138 offset = ((uint32_t *) (t.result + level2_offset))[i];
6140 fprintf (stream2, " %5d", -1);
6142 fprintf (stream2, " %5zu",
6143 (offset - level3_offset) / sizeof (unsigned char));
6144 if (i+1 < t.level2_size << t.q)
6145 fprintf (stream2, ",");
6147 if (t.level2_size << t.q > 8)
6148 fprintf (stream2, "\n ");
6149 fprintf (stream2, " },\n");
6150 fprintf (stream2, " {");
/* Level 3: each stored value is printed symbolically; the CASE macro maps
   a constant to its own spelling through stringization. */
6151 if (t.level3_size << t.p > 8)
6152 fprintf (stream2, "\n ");
6153 for (i = 0; i < t.level3_size << t.p; i++)
6155 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6156 const char *value_string;
6159 #define CASE(x) case x: value_string = #x; break;
6196 if (i > 0 && (i % 8) == 0)
6197 fprintf (stream2, "\n ");
6198 fprintf (stream2, " %s%s", value_string,
6199 (i+1 < t.level3_size << t.p ? "," : ""));
6201 if (t.level3_size << t.p > 8)
6202 fprintf (stream2, "\n ");
6203 fprintf (stream2, " }\n");
6204 fprintf (stream2, "};\n");
/* Creates the two line-break table files.  Both FILENAME1 and FILENAME2 are
   opened for writing and given the same banner and GPL notice; then
   output_lbp writes the declarations to the first stream and the table
   data to the second.  VERSION is the Unicode version put in the banner.
   Open and write failures are reported on stderr.
   NOTE(review): the banner names the generator "gen-lbrk", while the other
   table writers in this file say "gen-uni-tables" -- confirm whether the
   old name is intentional. */
6208 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6210 const char *filenames[2];
6214 filenames[0] = filename1;
6215 filenames[1] = filename2;
/* Open both outputs before anything is written. */
6217 for (i = 0; i < 2; i++)
6219 streams[i] = fopen (filenames[i], "w");
6220 if (streams[i] == NULL)
6222 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical banner and license text for both files. */
6227 for (i = 0; i < 2; i++)
6229 FILE *stream = streams[i];
6231 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6232 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6233 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6235 fprintf (stream, "\n");
6237 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6238 still carries the GPL header), and it's gnulib-tool which replaces the
6239 GPL header with an LGPL header. */
6240 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6241 fprintf (stream, "\n");
6242 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6243 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6244 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6245 fprintf (stream, " (at your option) any later version.\n");
6246 fprintf (stream, "\n");
6247 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6248 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6249 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6250 fprintf (stream, " GNU General Public License for more details.\n");
6251 fprintf (stream, "\n");
6252 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6253 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6254 fprintf (stream, "\n");
6257 output_lbp (streams[0], streams[1]);
/* Check each stream for write errors and close it. */
6259 for (i = 0; i < 2; i++)
6261 if (ferror (streams[i]) || fclose (streams[i]))
6263 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6269 /* ========================================================================= */
6271 /* Word break property. */
6273 /* Possible values of the Word_Break property. */
/* get_wbp uses these values as bit positions (1 << WBP_x) and output_wbp
   stores them as table elements.  Only the last enumerator is visible in
   this listing; the other WBP_* constants are on lines not shown. */
6288 WBP_EXTENDNUMLET = 7
6291 /* Returns the word breaking property for ch, as a bit mask. */
/* Each test below ORs 1 << WBP_x into attr.  The tests consult
   unicode_attributes[], unicode_properties[], unicode_scripts[] and
   get_lbp, plus explicit code point lists.
   NOTE(review): some continuation lines of the conditions are missing from
   this listing, so the summaries below cover only the visible parts. */
6293 get_wbp (unsigned int ch)
/* Guard: the code point has a name in unicode_attributes[]. */
6297 if (unicode_attributes[ch].name != NULL)
6300 attr |= 1 << WBP_CR;
6303 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (among the visible tests). */
6305 if (ch == 0x000B || ch == 0x000C
6307 || ch == 0x2028 || ch == 0x2029)
6308 attr |= 1 << WBP_NEWLINE;
/* Extend: Grapheme_Extend property bit set, or general category Mc. */
6310 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6311 || (unicode_attributes[ch].category != NULL
6312 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6313 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D. */
6315 if (unicode_attributes[ch].category != NULL
6316 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6317 && ch != 0x200C && ch != 0x200D)
6318 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana", U+3031..U+3035, and a few listed code
   points. */
6320 if ((unicode_scripts[ch] < numscripts
6321 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6322 || (ch >= 0x3031 && ch <= 0x3035)
6323 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6325 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic, and not ideographic, not already Katakana, not
   line-break class SA, not script "Hiragana", not already Extend. */
6327 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6329 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6330 && (attr & (1 << WBP_KATAKANA)) == 0
6331 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6332 && !(unicode_scripts[ch] < numscripts
6333 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6334 && (attr & (1 << WBP_EXTEND)) == 0)
6335 attr |= 1 << WBP_ALETTER;
/* MidNumLet: explicit list of apostrophe- and full-stop-like code points. */
6337 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6338 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6339 attr |= 1 << WBP_MIDNUMLET;
/* MidLetter: explicit list of code points. */
6341 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6342 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6343 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line-break class IS or a listed code point, minus U+003A,
   U+FE13 and U+002E, which are classified above. */
6345 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6346 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6348 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6349 attr |= 1 << WBP_MIDNUM;
/* Numeric: based on line-break class NU. */
6351 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6353 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc. */
6355 if (unicode_attributes[ch].category != NULL
6356 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6357 attr |= 1 << WBP_EXTENDNUMLET;
/* Other; the condition guarding this assignment is on a line not shown. */
6362 attr |= 1 << WBP_OTHER;
6367 /* Output the word break property in a human readable format. */
/* For every code point whose get_wbp mask differs from 1 << WBP_OTHER,
   writes the code point as 0x%04X followed by the name of each bit set in
   the mask.  The value is tested bit by bit, so a code point carrying
   several bits gets several names on its line. */
6369 debug_output_wbp (FILE *stream)
6373 for (i = 0; i < 0x110000; i++)
6375 int attr = get_wbp (i);
6376 if (attr != 1 << WBP_OTHER)
6378 fprintf (stream, "0x%04X", i);
6379 if (attr & (1 << WBP_CR))
6380 fprintf (stream, " CR");
6381 if (attr & (1 << WBP_LF))
6382 fprintf (stream, " LF");
6383 if (attr & (1 << WBP_NEWLINE))
6384 fprintf (stream, " Newline");
6385 if (attr & (1 << WBP_EXTEND))
6386 fprintf (stream, " Extend");
6387 if (attr & (1 << WBP_FORMAT))
6388 fprintf (stream, " Format");
6389 if (attr & (1 << WBP_KATAKANA))
6390 fprintf (stream, " Katakana");
6391 if (attr & (1 << WBP_ALETTER))
6392 fprintf (stream, " ALetter");
6393 if (attr & (1 << WBP_MIDNUMLET))
6394 fprintf (stream, " MidNumLet");
6395 if (attr & (1 << WBP_MIDLETTER))
6396 fprintf (stream, " MidLetter");
6397 if (attr & (1 << WBP_MIDNUM))
6398 fprintf (stream, " MidNum");
6399 if (attr & (1 << WBP_NUMERIC))
6400 fprintf (stream, " Numeric");
6401 if (attr & (1 << WBP_EXTENDNUMLET))
6402 fprintf (stream, " ExtendNumLet");
6403 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_wbp to the file FILENAME.
   A message goes to stderr when the file cannot be opened for writing or
   when the stream reports an error or fails to close. */
6409 debug_output_wbrk_tables (const char *filename)
6413 stream = fopen (filename, "w");
6416 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6420 debug_output_wbp (stream);
/* fclose is checked too, so a failure while flushing is reported. */
6422 if (ferror (stream) || fclose (stream))
6424 fprintf (stderr, "error writing to '%s'\n", filename);
6429 /* The word break property from the WordBreakProperty.txt file. */
/* One entry per code point (0 .. 0x10FFFF).  fill_org_wbp sets every entry
   to WBP_OTHER and then overwrites the ranges listed in the file with
   plain WBP_* values (not bit masks). */
6430 int unicode_org_wbp[0x110000];
6432 /* Stores in unicode_org_wbp[] the word break property from the
6433 WordBreakProperty.txt file. */
/* Every entry starts as WBP_OTHER.  The file is then read line by line
   (at most 200 characters per read); empty lines and lines starting with
   '#' are tested for.  A data line is parsed either as a range
   "XXXX..YYYY ; Name" or as a single code point "XXXX ; Name", the name
   is mapped to a WBP_* value, and that value is stored for i1 .. i2.
   Open, parse, unknown-name and read failures are reported on stderr. */
6435 fill_org_wbp (const char *wordbreakproperty_filename)
6440 for (i = 0; i < 0x110000; i++)
6441 unicode_org_wbp[i] = WBP_OTHER;
6443 stream = fopen (wordbreakproperty_filename, "r");
6446 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6453 unsigned int i1, i2;
6454 char padding[200+1];
6455 char propname[200+1];
/* Read one line without its newline; the trailing "\n" directive also
   consumes any following whitespace. */
6458 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
6461 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then fall back to the single code point form. */
6464 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6466 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6468 fprintf (stderr, "parse error in '%s'\n",
6469 wordbreakproperty_filename);
/* PROP expands to an if/else link, so the invocations below form one chain
   whose final else branch is the unknown-name report. */
6474 #define PROP(name,value) \
6475 if (strcmp (propname, name) == 0) propvalue = value; else
6478 PROP ("Newline", WBP_NEWLINE)
6479 PROP ("Extend", WBP_EXTEND)
6480 PROP ("Format", WBP_FORMAT)
6481 PROP ("Katakana", WBP_KATAKANA)
6482 PROP ("ALetter", WBP_ALETTER)
6483 PROP ("MidNumLet", WBP_MIDNUMLET)
6484 PROP ("MidLetter", WBP_MIDLETTER)
6485 PROP ("MidNum", WBP_MIDNUM)
6486 PROP ("Numeric", WBP_NUMERIC)
6487 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6490 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6491 wordbreakproperty_filename);
/* Range sanity check before the array is written. */
6494 if (!(i1 <= i2 && i2 < 0x110000))
6497 for (i = i1; i <= i2; i++)
6498 unicode_org_wbp[i] = propvalue;
6501 if (ferror (stream) || fclose (stream))
6503 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6508 /* Output the word break property in a human readable format. */
/* For every code point whose unicode_org_wbp[] entry differs from
   WBP_OTHER, writes the code point as 0x%04X and the property name to
   STREAM.  The entry is a single value, so one name is printed; " ??" is
   the text of the final else branch for a value matching no known name. */
6510 debug_output_org_wbp (FILE *stream)
6514 for (i = 0; i < 0x110000; i++)
6516 int propvalue = unicode_org_wbp[i];
6517 if (propvalue != WBP_OTHER)
6519 fprintf (stream, "0x%04X", i);
/* PROP expands to an if/else link; the invocations below form one chain. */
6520 #define PROP(name,value) \
6521 if (propvalue == value) fprintf (stream, " " name); else
6524 PROP ("Newline", WBP_NEWLINE)
6525 PROP ("Extend", WBP_EXTEND)
6526 PROP ("Format", WBP_FORMAT)
6527 PROP ("Katakana", WBP_KATAKANA)
6528 PROP ("ALetter", WBP_ALETTER)
6529 PROP ("MidNumLet", WBP_MIDNUMLET)
6530 PROP ("MidLetter", WBP_MIDLETTER)
6531 PROP ("MidNum", WBP_MIDNUM)
6532 PROP ("Numeric", WBP_NUMERIC)
6533 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6535 fprintf (stream, " ??");
6536 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_org_wbp to the file FILENAME.
   A message goes to stderr when the file cannot be opened for writing or
   when the stream reports an error or fails to close. */
6542 debug_output_org_wbrk_tables (const char *filename)
6546 stream = fopen (filename, "w");
6549 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6553 debug_output_org_wbp (stream);
/* fclose is checked too, so a failure while flushing is reported. */
6555 if (ferror (stream) || fclose (stream))
6557 fprintf (stderr, "error writing to '%s'\n", filename);
6562 /* Construction of sparse 3-level tables. */
/* Parameters for the wbp_table type used by output_wbp: elements are
   unsigned char, the default element is WBP_OTHER, and xmalloc/xrealloc
   are mapped to plain malloc/realloc.
   NOTE(review): presumably consumed by a table template included right
   after these definitions; that include is not visible in this listing. */
6563 #define TABLE wbp_table
6564 #define ELEMENT unsigned char
6565 #define DEFAULT WBP_OTHER
6566 #define xmalloc malloc
6567 #define xrealloc realloc
/* Builds a 3-level lookup table of word break properties for all code
   points and prints it to STREAM as C source: the wbrkprop_header_N
   macros, the wbrkprop_t type and the static initialized uniwbrkprop
   object. */
6571 output_wbp (FILE *stream)
6575 unsigned int level1_offset, level2_offset, level3_offset;
6579 wbp_table_init (&t);
/* get_wbp returns a bit mask.  Code points whose mask is exactly
   1 << WBP_OTHER are left out of the table; for the others the index of
   the set bit is what gets stored. */
6581 for (i = 0; i < 0x110000; i++)
6583 int attr = get_wbp (i);
6585 /* Now attr should contain exactly one bit. */
6586 if (attr == 0 || ((attr & (attr - 1)) != 0))
6589 if (attr != 1 << WBP_OTHER)
6591 unsigned int log2_attr;
6592 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6594 wbp_table_add (&t, i, log2_attr);
6598 wbp_table_finalize (&t);
/* The three sums below are byte offsets into t.result: past a header of
   five uint32_t words, past level 1, and past level 2.
   NOTE(review): presumably assigned to level1_offset, level2_offset and
   level3_offset; the assignment lines are missing from this listing. */
6601 5 * sizeof (uint32_t);
6603 5 * sizeof (uint32_t)
6604 + t.level1_size * sizeof (uint32_t);
6606 5 * sizeof (uint32_t)
6607 + t.level1_size * sizeof (uint32_t)
6608 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words of t.result as macros. */
6610 for (i = 0; i < 5; i++)
6611 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6612 ((uint32_t *) t.result)[i]);
6613 fprintf (stream, "\n");
6614 fprintf (stream, "typedef struct\n");
6615 fprintf (stream, " {\n");
6616 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6617 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6618 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6619 fprintf (stream, " }\n");
6620 fprintf (stream, "wbrkprop_t;\n");
6621 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6622 fprintf (stream, "{\n");
6623 fprintf (stream, " {");
/* Level 1, eight entries per output line.  An entry is printed either as
   -1 or as an index into level2 (the condition selecting between the two
   is on a line not shown here). */
6624 if (t.level1_size > 8)
6625 fprintf (stream, "\n ");
6626 for (i = 0; i < t.level1_size; i++)
6629 if (i > 0 && (i % 8) == 0)
6630 fprintf (stream, "\n ");
6631 offset = ((uint32_t *) (t.result + level1_offset))[i];
6633 fprintf (stream, " %5d", -1);
6635 fprintf (stream, " %5zu",
6636 (offset - level2_offset) / sizeof (uint32_t));
6637 if (i+1 < t.level1_size)
6638 fprintf (stream, ",");
6640 if (t.level1_size > 8)
6641 fprintf (stream, "\n ");
6642 fprintf (stream, " },\n");
6643 fprintf (stream, " {");
/* Level 2, same layout; indices are relative to level3_offset and count
   unsigned char elements. */
6644 if (t.level2_size << t.q > 8)
6645 fprintf (stream, "\n ");
6646 for (i = 0; i < t.level2_size << t.q; i++)
6649 if (i > 0 && (i % 8) == 0)
6650 fprintf (stream, "\n ");
6651 offset = ((uint32_t *) (t.result + level2_offset))[i];
6653 fprintf (stream, " %5d", -1);
6655 fprintf (stream, " %5zu",
6656 (offset - level3_offset) / sizeof (unsigned char));
6657 if (i+1 < t.level2_size << t.q)
6658 fprintf (stream, ",");
6660 if (t.level2_size << t.q > 8)
6661 fprintf (stream, "\n ");
6662 fprintf (stream, " },\n");
6663 fprintf (stream, " {");
/* Level 3: values are printed symbolically via CASE, four per output line
   (the other levels use eight). */
6664 if (t.level3_size << t.p > 4)
6665 fprintf (stream, "\n ");
6666 for (i = 0; i < t.level3_size << t.p; i++)
6668 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6669 const char *value_string;
6672 #define CASE(x) case x: value_string = #x; break;
6681 CASE(WBP_MIDNUMLET);
6682 CASE(WBP_MIDLETTER);
6685 CASE(WBP_EXTENDNUMLET);
6690 if (i > 0 && (i % 4) == 0)
6691 fprintf (stream, "\n ");
6692 fprintf (stream, " %s%s", value_string,
6693 (i+1 < t.level3_size << t.p ? "," : ""));
6695 if (t.level3_size << t.p > 4)
6696 fprintf (stream, "\n ");
6697 fprintf (stream, " }\n");
6698 fprintf (stream, "};\n");
/* Creates the word-break table file FILENAME: a banner naming the Unicode
   VERSION, a GPL notice, and then the table written by output_wbp.
   Open and write failures are reported on stderr.
   The banner describes the contents as word breaking properties; it used
   to repeat the line-breaking wording of output_lbrk_tables. */
6702 output_wbrk_tables (const char *filename, const char *version)
6706 stream = fopen (filename, "w");
6709 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6713 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6714 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
6715 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6717 fprintf (stream, "\n");
6719 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6720 still carries the GPL header), and it's gnulib-tool which replaces the
6721 GPL header with an LGPL header. */
6722 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
6723 fprintf (stream, "\n");
6724 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6725 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6726 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6727 fprintf (stream, " (at your option) any later version.\n");
6728 fprintf (stream, "\n");
6729 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6730 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6731 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6732 fprintf (stream, " GNU General Public License for more details.\n");
6733 fprintf (stream, "\n");
6734 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6735 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6736 fprintf (stream, "\n");
6738 output_wbp (stream);
/* fclose is checked too, so a failure while flushing is reported. */
6740 if (ferror (stream) || fclose (stream))
6742 fprintf (stderr, "error writing to '%s'\n", filename);
6747 /* ========================================================================= */
6749 /* Grapheme break property. */
6751 /* Possible values of the Grapheme_Cluster_Break property. */
/* These values are stored in unicode_org_gbp[] and packed two per byte
   (four bits each) by output_gbp_table.  Only one enumerator is visible
   in this listing; the other GBP_* constants are on lines not shown. */
6760 GBP_SPACINGMARK = 6,
6768 /* Construction of sparse 3-level tables. */
/* Parameters for the gbp_table type used by output_gbp_table: elements are
   unsigned char, the default element is GBP_OTHER, and xmalloc/xrealloc
   are mapped to plain malloc/realloc.
   NOTE(review): presumably consumed by a table template included right
   after these definitions; that include is not visible in this listing. */
6769 #define TABLE gbp_table
6770 #define ELEMENT unsigned char
6771 #define DEFAULT GBP_OTHER
6772 #define xmalloc malloc
6773 #define xrealloc realloc
6776 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* One entry per code point (0 .. 0x10FFFF).  fill_org_gbp sets every entry
   to GBP_OTHER and then overwrites the ranges listed in the file with
   GBP_* values. */
6777 int unicode_org_gbp[0x110000];
6779 /* Output the unit test data for the grapheme break property. */
/* Writes to FILENAME a banner, a GPL notice and a comma-separated list of
   "{ 0x%04X, NAME }" entries derived from unicode_org_gbp[].  Open and
   write failures are reported on stderr. */
6781 output_gbp_test (const char *filename)
6787 stream = fopen (filename, "w");
6790 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6794 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6795 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
6796 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
6797 fprintf (stream, "\n");
6798 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6799 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6800 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6801 fprintf (stream, " (at your option) any later version.\n");
6802 fprintf (stream, "\n");
6803 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6804 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6805 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6806 fprintf (stream, " GNU General Public License for more details.\n");
6807 fprintf (stream, "\n");
6808 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6809 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6810 fprintf (stream, "\n");
/* Runs of consecutive code points with the same property are detected by
   looking ahead at ch + 1; each entry printed carries ch + 1 and the
   property name obtained through the CASE stringization macro. */
6813 for (ch = 0; ch < 0x110000; ch++)
6815 int gbp = unicode_org_gbp[ch];
6816 const char *gbp_string;
6818 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
6823 #define CASE(x) case x: gbp_string = #x; break;
6830 CASE (GBP_SPACINGMARK)
6842 fprintf (stream, ",\n");
6843 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
6847 fprintf (stream, "\n");
6849 if (ferror (stream) || fclose (stream))
6851 fprintf (stderr, "error writing to '%s'\n", filename);
6856 /* Output the per-character grapheme break property table. */
/* Writes to FILENAME a banner naming the Unicode VERSION, the
   gbrkprop_header_N macros and a static unigbrkprop structure holding a
   3-level table built from unicode_org_gbp[].  Level 3 is emitted with
   two 4-bit values packed into each byte.  Open and write failures are
   reported on stderr. */
6858 output_gbp_table (const char *filename, const char *version)
6863 unsigned int level1_offset, level2_offset, level3_offset;
6865 stream = fopen (filename, "w");
6868 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6872 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6873 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
6874 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6879 gbp_table_init (&t);
/* Every code point is added, including those whose value is GBP_OTHER. */
6881 for (ch = 0; ch < 0x110000; ch++)
6882 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
6884 gbp_table_finalize (&t);
6886 /* Offsets in t.result, in memory of this process. */
6888 5 * sizeof (uint32_t);
6890 5 * sizeof (uint32_t)
6891 + t.level1_size * sizeof (uint32_t);
6893 5 * sizeof (uint32_t)
6894 + t.level1_size * sizeof (uint32_t)
6895 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words of t.result as macros. */
6897 for (i = 0; i < 5; i++)
6898 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
6899 ((uint32_t *) t.result)[i]);
6900 fprintf (stream, "static const\n");
6901 fprintf (stream, "struct\n");
6902 fprintf (stream, " {\n");
6903 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6904 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6905 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
6906 t.level3_size, t.p);
6907 fprintf (stream, " }\n");
6908 fprintf (stream, "unigbrkprop =\n");
6909 fprintf (stream, "{\n");
6910 fprintf (stream, " {");
/* Level 1, eight entries per output line, printed as -1 or as an index
   into level2 (the selecting condition is on a line not shown here). */
6911 if (t.level1_size > 8)
6912 fprintf (stream, "\n ");
6913 for (i = 0; i < t.level1_size; i++)
6916 if (i > 0 && (i % 8) == 0)
6917 fprintf (stream, "\n ");
6918 offset = ((uint32_t *) (t.result + level1_offset))[i];
6920 fprintf (stream, " %5d", -1);
6922 fprintf (stream, " %5zu",
6923 (offset - level2_offset) / sizeof (uint32_t));
6924 if (i+1 < t.level1_size)
6925 fprintf (stream, ",");
6927 if (t.level1_size > 8)
6928 fprintf (stream, "\n ");
6929 fprintf (stream, " },\n");
6930 fprintf (stream, " {");
/* Level 2: indices into level3 are halved because level3 is emitted with
   two elements per byte. */
6931 if (t.level2_size << t.q > 8)
6932 fprintf (stream, "\n ");
6933 for (i = 0; i < t.level2_size << t.q; i++)
6936 if (i > 0 && (i % 8) == 0)
6937 fprintf (stream, "\n ");
6938 offset = ((uint32_t *) (t.result + level2_offset))[i];
6940 fprintf (stream, " %5d", -1);
6942 fprintf (stream, " %5zu",
6943 (offset - level3_offset) / sizeof (uint8_t) / 2);
6944 if (i+1 < t.level2_size << t.q)
6945 fprintf (stream, ",");
6947 if (t.level2_size << t.q > 8)
6948 fprintf (stream, "\n ");
6949 fprintf (stream, " },\n");
6950 fprintf (stream, " {");
/* Level 3: elements 2*i and 2*i+1 are combined into one byte, the even
   element in the low four bits and the odd element in the high four. */
6951 if (t.level3_size << t.p > 8)
6952 fprintf (stream, "\n ");
6953 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
6955 unsigned char *p = (unsigned char *) (t.result + level3_offset);
6956 unsigned char value0 = p[i * 2];
6957 unsigned char value1 = p[i * 2 + 1];
6958 if (i > 0 && (i % 8) == 0)
6959 fprintf (stream, "\n ");
6960 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
6961 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
6963 if (t.level3_size << t.p > 8)
6964 fprintf (stream, "\n ");
6965 fprintf (stream, " }\n");
6966 fprintf (stream, "};\n");
6968 if (ferror (stream) || fclose (stream))
6970 fprintf (stderr, "error writing to '%s'\n", filename);
6975 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
6976 GraphemeBreakProperty.txt file. */
/* Every entry starts as GBP_OTHER.  The file is then read line by line
   (at most 200 characters per read); empty lines and lines starting with
   '#' are tested for.  A data line is parsed either as a range
   "XXXX..YYYY ; Name" or as a single code point "XXXX ; Name", the name
   is mapped to a GBP_* value, and that value is stored for i1 .. i2.
   Open, parse, unknown-name and read failures are reported on stderr;
   the unknown-name message also carries the line number. */
6978 fill_org_gbp (const char *graphemebreakproperty_filename)
6984 for (i = 0; i < 0x110000; i++)
6985 unicode_org_gbp[i] = GBP_OTHER;
6987 stream = fopen (graphemebreakproperty_filename, "r");
6990 fprintf (stderr, "error during fopen of '%s'\n",
6991 graphemebreakproperty_filename);
6998 unsigned int i1, i2;
6999 char padding[200+1];
7000 char propname[200+1];
/* Read one line without its newline; the trailing "\n" directive also
   consumes any following whitespace. */
7004 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7007 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then fall back to the single code point form. */
7010 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7012 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7014 fprintf (stderr, "parse error in '%s'\n",
7015 graphemebreakproperty_filename);
/* PROP expands to an if/else link, so the invocations below form one chain
   whose final else branch is the unknown-name report. */
7020 #define PROP(name,value) \
7021 if (strcmp (propname, name) == 0) propvalue = value; else
7024 PROP ("Control", GBP_CONTROL)
7025 PROP ("Extend", GBP_EXTEND)
7026 PROP ("Prepend", GBP_PREPEND)
7027 PROP ("SpacingMark", GBP_SPACINGMARK)
7032 PROP ("LVT", GBP_LVT)
7035 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
7036 graphemebreakproperty_filename, lineno);
/* Range sanity check before the array is written. */
7039 if (!(i1 <= i2 && i2 < 0x110000))
7042 for (i = i1; i <= i2; i++)
7043 unicode_org_gbp[i] = propvalue;
7045 if (ferror (stream) || fclose (stream))
7047 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
7052 /* ========================================================================= */
7054 /* Maximum number of characters into which a single Unicode character can be decomposed. */
7056 #define MAX_DECOMP_LENGTH 18
/* Decomposition types returned by get_decomposition.  Each compatibility
   type corresponds to the <tag> that prefixes the decomposition field;
   UC_DECOMP_CANONICAL is used when there is no tag. */
7060 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
7061 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
7062 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
7063 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
7064 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
7065 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
7066 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
7067 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
7068 UC_DECOMP_SUPER, /* <super> A superscript form. */
7069 UC_DECOMP_SUB, /* <sub> A subscript form. */
7070 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
7071 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
7072 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
7073 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
7074 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
7075 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
7076 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7079 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7080 decompositions). Return the type, or -1 for none. */
/* CH indexes unicode_attributes[]; its decomposition string is parsed as an
   optional <tag> followed by space-separated hexadecimal code points.
   The code points are stored in DECOMPOSED (at most MAX_DECOMP_LENGTH).
   LENGTHP is presumably where the count is returned; the store is on a
   line not shown in this listing. */
7082 get_decomposition (unsigned int ch,
7083 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7085 const char *decomposition = unicode_attributes[ch].decomposition;
7087 if (decomposition != NULL && decomposition[0] != '\0')
/* Without a <tag> the decomposition is canonical. */
7089 int type = UC_DECOMP_CANONICAL;
7090 unsigned int length;
7093 if (decomposition[0] == '<')
/* typelen covers the tag including both angle brackets. */
7098 rangle = strchr (decomposition + 1, '>');
7101 typelen = rangle + 1 - decomposition;
7102 #define TYPE(t1,t2) \
7103 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7106 TYPE ("<font>", UC_DECOMP_FONT)
7107 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7108 TYPE ("<initial>", UC_DECOMP_INITIAL)
7109 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7110 TYPE ("<final>", UC_DECOMP_FINAL)
7111 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7112 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7113 TYPE ("<super>", UC_DECOMP_SUPER)
7114 TYPE ("<sub>", UC_DECOMP_SUB)
7115 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7116 TYPE ("<wide>", UC_DECOMP_WIDE)
7117 TYPE ("<narrow>", UC_DECOMP_NARROW)
7118 TYPE ("<small>", UC_DECOMP_SMALL)
7119 TYPE ("<square>", UC_DECOMP_SQUARE)
7120 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7121 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* Precision (%.*s) limits the output to the typelen characters of the tag;
   a bare field width (%*s) would print the whole remaining string. */
7123 fprintf (stderr, "unknown decomposition type %.*s\n", (int)typelen, decomposition);
/* Continue after the tag and one optional separating space. */
7127 decomposition = rangle + 1;
7128 if (decomposition[0] == ' ')
/* Parse hexadecimal code points; strtoul consuming nothing is tested to
   detect the end of the list. */
7131 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7133 decomposed[length] = strtoul (decomposition, &endptr, 16);
7134 if (endptr == decomposition)
7136 decomposition = endptr;
7137 if (decomposition[0] == ' ')
7140 if (*decomposition != '\0')
7141 /* MAX_DECOMP_LENGTH is too small. */
7151 /* Construction of sparse 3-level tables. */
/* Parameters for the decomp_table type used by output_decomposition:
   elements are uint16_t, the default element is (uint16_t)(-1), and
   xmalloc/xrealloc are mapped to plain malloc/realloc.
   NOTE(review): presumably consumed by a table template included right
   after these definitions; that include is not visible in this listing. */
7152 #define TABLE decomp_table
7153 #define ELEMENT uint16_t
7154 #define DEFAULT (uint16_t)(-1)
7155 #define xmalloc malloc
7156 #define xrealloc realloc
/* Emits the decomposition data as C source.  STREAM1 receives the extern
   declarations, the decomp_header_N macros and the decomp_index_table_t
   type; STREAM2 receives the byte array gl_uninorm_decomp_chars_table and
   the initialized 3-level index table gl_uninorm_decomp_index_table. */
7160 output_decomposition (FILE *stream1, FILE *stream2)
7162 struct decomp_table t;
7163 unsigned int level1_offset, level2_offset, level3_offset;
7164 unsigned int offset;
7170 decomp_table_init (&t);
7172 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7173 fprintf (stream1, "\n");
7174 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
/* For each code point the index element combines an offset below 1 << 15
   with bit 15, which is set when the type is not UC_DECOMP_CANONICAL. */
7177 for (ch = 0; ch < 0x110000; ch++)
7179 unsigned int length;
7180 unsigned int decomposed[MAX_DECOMP_LENGTH];
7181 int type = get_decomposition (ch, &length, decomposed);
7185 if (!(offset < (1 << 15)))
7187 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7189 /* Produce length 3-bytes entries. */
7191 /* We would need a special representation of zero-length entries. */
/* 3-byte layout: bit 23 is set when another element follows, the type is
   placed at bit 18 and up on the first element, and the code point (checked
   to be below 1 << 18) fills the remaining bits. */
7193 for (i = 0; i < length; i++)
7196 fprintf (stream2, ",");
7197 if ((offset % 4) == 0)
7198 fprintf (stream2, "\n ");
7199 if (!(decomposed[i] < (1 << 18)))
7201 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7202 (((i+1 < length ? (1 << 23) : 0)
7203 | (i == 0 ? (type << 18) : 0)
7204 | decomposed[i]) >> 16) & 0xff,
7205 (decomposed[i] >> 8) & 0xff,
7206 decomposed[i] & 0xff);
7212 fprintf (stream2, "\n};\n");
7213 fprintf (stream2, "\n");
7215 decomp_table_finalize (&t);
/* The three sums below are byte offsets into t.result: past a header of
   five uint32_t words, past level 1, and past level 2.
   NOTE(review): presumably assigned to level1_offset, level2_offset and
   level3_offset; the assignment lines are missing from this listing. */
7218 5 * sizeof (uint32_t);
7220 5 * sizeof (uint32_t)
7221 + t.level1_size * sizeof (uint32_t);
7223 5 * sizeof (uint32_t)
7224 + t.level1_size * sizeof (uint32_t)
7225 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words of t.result as macros. */
7227 for (i = 0; i < 5; i++)
7228 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7229 ((uint32_t *) t.result)[i]);
7230 fprintf (stream1, "\n");
7231 fprintf (stream1, "typedef struct\n");
7232 fprintf (stream1, " {\n");
7233 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7234 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7235 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7236 fprintf (stream1, " }\n");
7237 fprintf (stream1, "decomp_index_table_t;\n");
7238 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
7239 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7240 fprintf (stream2, "{\n");
7241 fprintf (stream2, " {");
/* Level 1, eight entries per output line, printed as -1 or as an index
   into level2 (the selecting condition is on a line not shown here). */
7242 if (t.level1_size > 8)
7243 fprintf (stream2, "\n ");
7244 for (i = 0; i < t.level1_size; i++)
7247 if (i > 0 && (i % 8) == 0)
7248 fprintf (stream2, "\n ");
7249 offset = ((uint32_t *) (t.result + level1_offset))[i];
7251 fprintf (stream2, " %5d", -1);
7253 fprintf (stream2, " %5zu",
7254 (offset - level2_offset) / sizeof (uint32_t));
7255 if (i+1 < t.level1_size)
7256 fprintf (stream2, ",");
7258 if (t.level1_size > 8)
7259 fprintf (stream2, "\n ");
7260 fprintf (stream2, " },\n");
7261 fprintf (stream2, " {");
/* Level 2, same layout; indices are relative to level3_offset and count
   uint16_t elements. */
7262 if (t.level2_size << t.q > 8)
7263 fprintf (stream2, "\n ");
7264 for (i = 0; i < t.level2_size << t.q; i++)
7267 if (i > 0 && (i % 8) == 0)
7268 fprintf (stream2, "\n ");
7269 offset = ((uint32_t *) (t.result + level2_offset))[i];
7271 fprintf (stream2, " %5d", -1);
7273 fprintf (stream2, " %5zu",
7274 (offset - level3_offset) / sizeof (uint16_t));
7275 if (i+1 < t.level2_size << t.q)
7276 fprintf (stream2, ",");
7278 if (t.level2_size << t.q > 8)
7279 fprintf (stream2, "\n ");
7280 fprintf (stream2, " },\n");
7281 fprintf (stream2, " {");
/* Level 3: the default element (uint16_t)(-1) is printed as -1, every
   other value as its decimal number. */
7282 if (t.level3_size << t.p > 8)
7283 fprintf (stream2, "\n ");
7284 for (i = 0; i < t.level3_size << t.p; i++)
7286 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7287 if (i > 0 && (i % 8) == 0)
7288 fprintf (stream2, "\n ");
7289 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7290 if (i+1 < t.level3_size << t.p)
7291 fprintf (stream2, ",");
7293 if (t.level3_size << t.p > 8)
7294 fprintf (stream2, "\n ");
7295 fprintf (stream2, " }\n");
7296 fprintf (stream2, "};\n");
/* Create the two decomposition table files FILENAME1 and FILENAME2 for
   Unicode VERSION.
   Both files are opened first, then each receives the same three-line
   "generated automatically" banner, then output_decomposition writes the
   table contents to the pair of streams.
   A failure to open or to write either file is reported on stderr and ends
   the program with exit status 1.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  enum { N_OUTPUTS = 2 };
  const char *const paths[N_OUTPUTS] = { filename1, filename2 };
  FILE *outputs[N_OUTPUTS];
  int k;

  /* Open every output before anything is written.  */
  for (k = 0; k < N_OUTPUTS; k++)
    {
      outputs[k] = fopen (paths[k], "w");
      if (outputs[k] != NULL)
        continue;
      fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
      exit (1);
    }

  /* The banner is identical in both files.  */
  for (k = 0; k < N_OUTPUTS; k++)
    {
      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", outputs[k]);
      fputs ("/* Decomposition of Unicode characters. */\n", outputs[k]);
      fprintf (outputs[k],
               "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", outputs[k]);
    }

  output_decomposition (outputs[0], outputs[1]);

  /* Close the outputs; fclose is only attempted on a stream that has no
     pending write error.  */
  for (k = 0; k < N_OUTPUTS; k++)
    {
      if (!ferror (outputs[k]) && fclose (outputs[k]) == 0)
        continue;
      fprintf (stderr, "error writing to '%s'\n", paths[k]);
      exit (1);
    }
}
/* The "excluded from composition" property from the CompositionExclusions.txt
   file: unicode_composition_exclusions[ch] is 1 if the code point ch is
   listed in that file, 0 otherwise.  */
char unicode_composition_exclusions[0x110000];

/* Fill unicode_composition_exclusions[] from the file named
   COMPOSITIONEXCLUSIONS_FILENAME.
   The file is read line by line.  Blank lines and lines whose first
   non-blank character is '#' are skipped; every other line must start with
   a hexadecimal code point, whose array entry is set to 1.
   Lines are read with fgets rather than with a "%200[^\n]\n" scanf format,
   so that a leading blank line does not end the scan prematurely and the
   tail of an over-long line is never mistaken for a new line.
   An unreadable file or an unparsable line is reported on stderr and ends
   the program with exit status 1; a code point >= 0x110000 aborts.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  char buf[200+1];
  unsigned int ch;

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    unicode_composition_exclusions[ch] = 0;

  while (fgets (buf, sizeof (buf), stream) != NULL)
    {
      const char *p = buf;
      unsigned int code;

      /* If the line did not fit into buf, drop its remainder.  The code
         point, if any, is at the beginning of the line; what follows is
         commentary.  */
      if (strchr (buf, '\n') == NULL)
        {
          int c;

          while ((c = getc (stream)) != EOF && c != '\n')
            ;
        }

      while (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n')
        p++;
      if (*p == '\0' || *p == '#')
        continue;

      if (sscanf (p, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      /* A value outside the Unicode code space cannot be stored.  */
      if (!(code < 0x110000))
        abort ();

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
/* Debugging aid: write the canonical composition candidates to FILENAME,
   one tab-separated line each, using the format
   "0x%04X\t0x%04X\t0x%04X\t%s\n".
   Every code point below 0x110000 is passed to get_decomposition.  A code
   point is kept when the decomposition type is UC_DECOMP_CANONICAL, the
   first part has canonical combining class "0", and the code point is not
   flagged in unicode_composition_exclusions[].  The last output field is the
   canonical combining class string of the second part.
   NOTE(review): the test that restricts this to binary decompositions, and
   the first three fprintf arguments (presumably code1, code2, combined), sit
   on lines that are not visible in this chunk -- confirm against the file.
   Failure to open or to write FILENAME is reported on stderr.  */
7391 debug_output_composition_tables (const char *filename)
7396 stream = fopen (filename, "w");
7399 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7403 for (ch = 0; ch < 0x110000; ch++)
7405 unsigned int length;
7406 unsigned int decomposed[MAX_DECOMP_LENGTH];
7407 int type = get_decomposition (ch, &length, decomposed);
7409 if (type == UC_DECOMP_CANONICAL
7410 /* Consider only binary decompositions.
7411 Exclude singleton decompositions. */
7414 unsigned int code1 = decomposed[0];
7415 unsigned int code2 = decomposed[1];
7416 unsigned int combined = ch;
7418 /* Exclude decompositions where the first part is not a starter,
7419 i.e. is not of canonical combining class 0. */
7420 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7421 /* Exclude characters listed in CompositionExclusions.txt. */
7422 && !unicode_composition_exclusions[combined])
7424 /* The combined character must now also be a starter.
7426 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7429 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7433 unicode_attributes[code2].combining);
7438 if (ferror (stream) || fclose (stream))
7440 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write to FILENAME a gperf input file describing the canonical composition
   of Unicode characters, for Unicode VERSION.
   The output consists of a banner, a GPL header, the gperf declarations
   (key struct composition_rule with slot "codes", hash function
   gl_uninorm_compose_hash, lookup function gl_uninorm_compose_lookup) and,
   after the "%%" separator, one keyword line per composition pair.
   The pairs are selected with the same visible conditions as in
   debug_output_composition_tables: decomposition type UC_DECOMP_CANONICAL,
   first part of canonical combining class "0", code point not flagged in
   unicode_composition_exclusions[].
   Each keyword is a 4-byte string: code1 then code2, each as two bytes with
   the high byte first.  It is followed by a "0x%04x" value.
   NOTE(review): the argument of that last conversion (presumably combined),
   the binary-decomposition length test, and the action taken when code1,
   code2 or combined is not below 0x10000 sit on lines that are not visible
   in this chunk.
   NOTE(review): the banner says "gen-uni-tables" while the other banners
   written by this program say "gen-uni-tables.c".
   Failure to open or to write FILENAME is reported on stderr.  */
7446 output_composition_tables (const char *filename, const char *version)
7451 stream = fopen (filename, "w");
7454 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7458 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7459 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7460 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7462 fprintf (stream, "\n");
7464 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7465 still carries the GPL header), and it's gnulib-tool which replaces the
7466 GPL header with an LGPL header. */
7467 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7468 fprintf (stream, "\n");
7469 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7470 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7471 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7472 fprintf (stream, " (at your option) any later version.\n");
7473 fprintf (stream, "\n");
7474 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7475 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7476 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7477 fprintf (stream, " GNU General Public License for more details.\n");
7478 fprintf (stream, "\n");
7479 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7480 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7481 fprintf (stream, "\n");
7483 /* The composition table is a set of mappings (code1, code2) -> combined,
7485 367 values for code1 (from 0x003C to 0x30FD),
7486 54 values for code2 (from 0x0300 to 0x309A).
7487 For a fixed code1, there are from 1 to 19 possible values for code2.
7488 For a fixed code2, there are from 1 to 117 possible values for code1.
7489 This is a very sparse matrix.
7491 We want an O(1) hash lookup.
7493 We could implement the hash lookup by mapping (code1, code2) to a linear
7494 combination mul1*code1 + mul2*code2, which is then used as an index into
7495 a 3-level table. But this leads to a table of size 37 KB.
7497 We use gperf to implement the hash lookup, giving it the 928 sets of
7498 4 bytes (code1, code2) as input. gperf generates a hash table of size
7499 1527, which is quite good (60% filled). It requires an auxiliary table
7500 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
7502 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7503 fprintf (stream, "%%struct-type\n");
7504 fprintf (stream, "%%language=ANSI-C\n");
7505 fprintf (stream, "%%define slot-name codes\n");
7506 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7507 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7508 fprintf (stream, "%%compare-lengths\n");
7509 fprintf (stream, "%%compare-strncmp\n");
7510 fprintf (stream, "%%readonly-tables\n");
7511 fprintf (stream, "%%omit-struct-type\n");
7512 fprintf (stream, "%%%%\n");
7514 for (ch = 0; ch < 0x110000; ch++)
7516 unsigned int length;
7517 unsigned int decomposed[MAX_DECOMP_LENGTH];
7518 int type = get_decomposition (ch, &length, decomposed);
7520 if (type == UC_DECOMP_CANONICAL
7521 /* Consider only binary decompositions.
7522 Exclude singleton decompositions. */
7525 unsigned int code1 = decomposed[0];
7526 unsigned int code2 = decomposed[1];
7527 unsigned int combined = ch;
7529 /* Exclude decompositions where the first part is not a starter,
7530 i.e. is not of canonical combining class 0. */
7531 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7532 /* Exclude characters listed in CompositionExclusions.txt. */
7533 && !unicode_composition_exclusions[combined])
7535 /* The combined character must now also be a starter.
7537 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7540 if (!(code1 < 0x10000))
7542 if (!(code2 < 0x10000))
7544 if (!(combined < 0x10000))
7547 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7548 (code1 >> 8) & 0xff, code1 & 0xff,
7549 (code2 >> 8) & 0xff, code2 & 0xff,
7555 if (ferror (stream) || fclose (stream))
7557 fprintf (stderr, "error writing to '%s'\n", filename);
7562 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the C source file to create.
   FUNCTION_NAME is the name of the function under test; it is emitted as the
   body of the MAP(c) macro of the generated test.
   FUNC is the reference mapping; every code point ch below 0x110000 with
   FUNC (ch) != ch produces one "{ ch, FUNC (ch) }" entry.  The entries are
   separated by ",\n" and sit between the inclusions of
   "test-mapping-part1.h" and "test-mapping-part2.h".
   VERSION is the Unicode version mentioned in the banner.
   The banner names gen-uni-tables.c, the program that actually generates the
   file, like the other banners written by this program.
   Failure to open or to write FILENAME is reported on stderr and ends the
   program with exit status 1.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character mapping functions.\n");
  fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit only the code points that are actually changed by the mapping.
     The separator is written before each entry but the first, so that the
     list carries no trailing comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7633 /* Construction of sparse 3-level tables. */
/* The table type is named mapping_table and its elements are int32_t.
   NOTE(review): these macros are presumably consumed by a table template
   that is included right after them; the include line is not visible in this
   chunk.  */
7634 #define TABLE mapping_table
7635 #define ELEMENT int32_t
/* xmalloc and xrealloc are mapped onto plain malloc and realloc here, so
   allocations made through these names are not checked for failure.  */
7637 #define xmalloc malloc
7638 #define xrealloc realloc
7641 /* Output a simple character mapping table to the given file. */
/* FILENAME is the file to create, FUNC the mapping to tabulate, VERSION the
   Unicode version mentioned in the banner.
   For every code point ch below 0x110000 the value (int) FUNC (ch) - (int) ch,
   i.e. the distance to the mapped code point, is added to a mapping_table.
   After mapping_table_finalize, t.result is read as: 5 uint32_t header words
   (emitted as the macros mapping_header_0 .. mapping_header_4), then
   t.level1_size uint32_t level 1 entries, then (t.level2_size << t.q)
   uint32_t level 2 entries, then the int32_t level 3 values.
   The generated code is the initializer of a static const struct named
   u_mapping with the members level1, level2 and level3, printed 8 values per
   line.  Level 1 and level 2 entries are byte offsets into t.result; they are
   printed as element indices relative to the start of the next level.
   NOTE(review): an entry is presumably printed as -1 when its stored offset
   is 0; the condition, like the settings of t.p and t.q, sits on lines that
   are not visible in this chunk.
   NOTE(review): the banner names gen-case.c, whereas other banners written
   by this program name gen-uni-tables.c.
   NOTE(review): the generated struct declares level2 as short while its
   values are printed with %5zu -- confirm the indices always fit.
   Failure to open or to write FILENAME is reported on stderr.  */
7644 output_simple_mapping (const char *filename,
7645 unsigned int (*func) (unsigned int),
7646 const char *version)
7650 struct mapping_table t;
7651 unsigned int level1_offset, level2_offset, level3_offset;
7653 stream = fopen (filename, "w");
7656 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7660 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7661 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7662 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7667 mapping_table_init (&t);
7669 for (ch = 0; ch < 0x110000; ch++)
7671 int value = (int) func (ch) - (int) ch;
7673 mapping_table_add (&t, ch, value);
7676 mapping_table_finalize (&t);
7678 /* Offsets in t.result, in memory of this process. */
7680 5 * sizeof (uint32_t);
7682 5 * sizeof (uint32_t)
7683 + t.level1_size * sizeof (uint32_t);
7685 5 * sizeof (uint32_t)
7686 + t.level1_size * sizeof (uint32_t)
7687 + (t.level2_size << t.q) * sizeof (uint32_t);
7689 for (i = 0; i < 5; i++)
7690 fprintf (stream, "#define mapping_header_%d %d\n", i,
7691 ((uint32_t *) t.result)[i]);
7692 fprintf (stream, "static const\n");
7693 fprintf (stream, "struct\n");
7694 fprintf (stream, " {\n");
7695 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7696 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7697 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7698 fprintf (stream, " }\n");
7699 fprintf (stream, "u_mapping =\n");
7700 fprintf (stream, "{\n");
7701 fprintf (stream, " {");
7702 if (t.level1_size > 8)
7703 fprintf (stream, "\n ");
7704 for (i = 0; i < t.level1_size; i++)
7707 if (i > 0 && (i % 8) == 0)
7708 fprintf (stream, "\n ");
7709 offset = ((uint32_t *) (t.result + level1_offset))[i];
7711 fprintf (stream, " %5d", -1);
7713 fprintf (stream, " %5zu",
7714 (offset - level2_offset) / sizeof (uint32_t));
7715 if (i+1 < t.level1_size)
7716 fprintf (stream, ",");
7718 if (t.level1_size > 8)
7719 fprintf (stream, "\n ");
7720 fprintf (stream, " },\n");
7721 fprintf (stream, " {");
7722 if (t.level2_size << t.q > 8)
7723 fprintf (stream, "\n ");
7724 for (i = 0; i < t.level2_size << t.q; i++)
7727 if (i > 0 && (i % 8) == 0)
7728 fprintf (stream, "\n ");
7729 offset = ((uint32_t *) (t.result + level2_offset))[i];
7731 fprintf (stream, " %5d", -1);
7733 fprintf (stream, " %5zu",
7734 (offset - level3_offset) / sizeof (int32_t));
7735 if (i+1 < t.level2_size << t.q)
7736 fprintf (stream, ",");
7738 if (t.level2_size << t.q > 8)
7739 fprintf (stream, "\n ");
7740 fprintf (stream, " },\n");
7741 fprintf (stream, " {");
7742 if (t.level3_size << t.p > 8)
7743 fprintf (stream, "\n ");
7744 for (i = 0; i < t.level3_size << t.p; i++)
7746 if (i > 0 && (i % 8) == 0)
7747 fprintf (stream, "\n ");
7748 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7749 if (i+1 < t.level3_size << t.p)
7750 fprintf (stream, ",");
7752 if (t.level3_size << t.p > 8)
7753 fprintf (stream, "\n ");
7754 fprintf (stream, " }\n");
7755 fprintf (stream, "};\n");
7757 if (ferror (stream) || fclose (stream))
7759 fprintf (stderr, "error writing to '%s'\n", filename);
7764 /* ========================================================================= */
7766 /* A special casing context.
7767 A context is negated through x -> -x. */
7772 SCC_AFTER_SOFT_DOTTED,
7778 /* A special casing rule. */
7779 struct special_casing_rule
/* Lowercase, titlecase and uppercase mappings of the rule's code point: up
   to 3 code points each.  The parser fills unused trailing slots with 0.  */
7782 unsigned int lower_mapping[3];
7783 unsigned int title_mapping[3];
7784 unsigned int upper_mapping[3];
/* Case folding result, up to 3 code points.  It is filled in by
   redistribute_casefolding_rules, not by the special casing parser.  */
7785 unsigned int casefold_mapping[3];
/* Two-letter language code the rule is restricted to, or NULL when the rule
   has no language restriction.  */
7786 const char *language;
/* The special casing rules: a growable array of pointers to rules.
   num_casing_rules is the number of entries in use,
   allocated_casing_rules the number of entries casing_rules has room for.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules.
   The array stores the pointer itself; the rule is not copied.
   When the array is full, its capacity is doubled, with a minimum of 16
   entries.  If that reallocation fails, a message is printed on stderr and
   the program ends with exit status 1.  The array pointer and the capacity
   are only updated after a successful reallocation.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
7810 /* Stores in casing_rules the special casing rules found in
7811 specialcasing_filename. */
/* The rule array is reset to empty first.  The file is then read with the
   fscanf format "%200[^\n]\n"; lines that are empty or start with '#' are
   skipped.  The remaining lines are scanned field by field, the fields being
   terminated by ';':
     - the code point, in hexadecimal,
     - the lower mapping, the title mapping and the upper mapping, each a
       list of up to 3 hexadecimal code points separated by spaces (unused
       slots are 0),
     - optionally a language (a word of exactly 2 characters) and/or a
       context word.
   The recognized context words are Final_Sigma, After_Soft_Dotted,
   More_Above, Before_Dot and After_I; the code also tests for a "Not_"
   prefix, and a negated context is stored as the negative of the context
   value.  Without a context word the context is SCC_ALWAYS.
   Each line yields one malloc'ed struct special_casing_rule, passed to
   add_casing_rule.  The visible store statements set code, language,
   context and the lower, title and upper mappings, but not
   casefold_mapping.
   NOTE(review): the results of the malloc calls are not checked on the
   visible lines, and the code point is not range checked here.
   NOTE(review): with this fscanf format, a line longer than 200 characters
   would be split into several pieces.
   Open, parse and read errors are reported on stderr.  */
7813 fill_casing_rules (const char *specialcasing_filename)
7817 stream = fopen (specialcasing_filename, "r");
7820 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
7824 casing_rules = NULL;
7825 num_casing_rules = 0;
7826 allocated_casing_rules = 0;
7836 unsigned int lower_mapping[3];
7837 unsigned int title_mapping[3];
7838 unsigned int upper_mapping[3];
7842 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7845 if (buf[0] == '\0' || buf[0] == '#')
7850 code = strtoul (scanptr, &endptr, 16);
7851 if (endptr == scanptr)
7853 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7857 if (*scanptr != ';')
7859 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7864 /* Scan lower mapping. */
7865 for (i = 0; i < 3; i++)
7866 lower_mapping[i] = 0;
7867 for (i = 0; i < 3; i++)
7869 while (*scanptr == ' ')
7871 if (*scanptr == ';')
7873 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
7874 if (endptr == scanptr)
7876 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7881 if (*scanptr != ';')
7883 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7888 /* Scan title mapping. */
7889 for (i = 0; i < 3; i++)
7890 title_mapping[i] = 0;
7891 for (i = 0; i < 3; i++)
7893 while (*scanptr == ' ')
7895 if (*scanptr == ';')
7897 title_mapping[i] = strtoul (scanptr, &endptr, 16);
7898 if (endptr == scanptr)
7900 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7905 if (*scanptr != ';')
7907 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7912 /* Scan upper mapping. */
7913 for (i = 0; i < 3; i++)
7914 upper_mapping[i] = 0;
7915 for (i = 0; i < 3; i++)
7917 while (*scanptr == ' ')
7919 if (*scanptr == ';')
7921 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
7922 if (endptr == scanptr)
7924 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7929 if (*scanptr != ';')
7931 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7936 /* Scan language and context. */
7938 context = SCC_ALWAYS;
7939 while (*scanptr == ' ')
7941 if (*scanptr != '\0' && *scanptr != '#')
7943 const char *word_begin = scanptr;
7944 const char *word_end;
7946 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7950 while (*scanptr == ' ')
7953 if (word_end - word_begin == 2)
7955 language = (char *) malloc ((word_end - word_begin) + 1);
7956 memcpy (language, word_begin, 2);
7957 language[word_end - word_begin] = '\0';
7958 word_begin = word_end = NULL;
7960 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7962 word_begin = scanptr;
7963 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7969 if (word_end > word_begin)
7971 bool negate = false;
7973 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
7978 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
7979 context = SCC_FINAL_SIGMA;
7980 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
7981 context = SCC_AFTER_SOFT_DOTTED;
7982 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
7983 context = SCC_MORE_ABOVE;
7984 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
7985 context = SCC_BEFORE_DOT;
7986 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
7987 context = SCC_AFTER_I;
7990 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
7994 context = - context;
7997 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7999 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8004 /* Store the rule. */
8006 struct special_casing_rule *new_rule =
8007 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8008 new_rule->code = code;
8009 new_rule->language = language;
8010 new_rule->context = context;
8011 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
8012 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
8013 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
8015 add_casing_rule (new_rule);
8019 if (ferror (stream) || fclose (stream))
8021 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
8026 /* A casefolding rule. */
8027 struct casefold_rule
/* Folded result: up to 3 code points.  redistribute_casefolding_rules treats
   a rule whose mapping[1] is 0 as mapping to a single code point.  */
8030 unsigned int mapping[3];
/* NULL for a rule that applies to all languages, otherwise the language the
   rule is restricted to ("tr" or "az" as assigned by
   fill_casefolding_rules).  */
8031 const char *language;
8034 /* The casefolding rules. */
8035 struct casefold_rule **casefolding_rules;
8036 unsigned int num_casefolding_rules;
8037 unsigned int allocated_casefolding_rules;
8039 /* Stores in casefolding_rules the case folding rules found in
8040 casefolding_filename. */
/* The rule array is reset to empty first.  The file is then read with the
   fscanf format "%200[^\n]\n"; lines that are empty or start with '#' are
   skipped.  The remaining lines are scanned as ';'-terminated fields:
     - the code point, in hexadecimal,
     - a type letter, one of 'C', 'F', 'S', 'T',
     - the casefold mapping, up to 3 hexadecimal code points separated by
       spaces.
   According to the comment below, rules of type 'S' are ignored in favour of
   the rules of type 'F'.  A rule is stored once per entry of a language
   list: the list { "tr", "az" } (the Turkish language list), or the list
   { NULL }, which stands for all languages.
   NOTE(review): the comment below ties the Turkish list to type 'T'; the
   selecting condition itself is on lines not visible in this chunk.
   Each stored rule is a malloc'ed struct casefold_rule appended to
   casefolding_rules, whose capacity is doubled when full, with a minimum of
   16 entries.
   NOTE(review): the results of malloc and realloc are not checked on the
   visible lines.
   Open, parse and read errors are reported on stderr.  */
8042 fill_casefolding_rules (const char *casefolding_filename)
8046 stream = fopen (casefolding_filename, "r");
8049 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
8053 casefolding_rules = NULL;
8054 num_casefolding_rules = 0;
8055 allocated_casefolding_rules = 0;
8066 unsigned int mapping[3];
8068 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8071 if (buf[0] == '\0' || buf[0] == '#')
8076 code = strtoul (scanptr, &endptr, 16);
8077 if (endptr == scanptr)
8079 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8083 if (*scanptr != ';')
8085 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8091 while (*scanptr == ' ')
8096 case 'C': case 'F': case 'S': case 'T':
8100 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8104 if (*scanptr != ';')
8106 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8111 /* Scan casefold mapping. */
8112 for (i = 0; i < 3; i++)
8114 for (i = 0; i < 3; i++)
8116 while (*scanptr == ' ')
8118 if (*scanptr == ';')
8120 mapping[i] = strtoul (scanptr, &endptr, 16);
8121 if (endptr == scanptr)
8123 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8128 if (*scanptr != ';')
8130 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
8135 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
8138 const char * const *languages;
8139 unsigned int languages_count;
8141 /* Type 'T' indicates that the rule is applicable to Turkish
8145 static const char * const turkish_languages[] = { "tr", "az" };
8146 languages = turkish_languages;
8147 languages_count = 2;
8151 static const char * const all_languages[] = { NULL };
8152 languages = all_languages;
8153 languages_count = 1;
8156 for (i = 0; i < languages_count; i++)
8158 /* Store a new rule. */
8159 struct casefold_rule *new_rule =
8160 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
8161 new_rule->code = code;
8162 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
8163 new_rule->language = languages[i];
8165 if (num_casefolding_rules == allocated_casefolding_rules)
8167 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
8168 if (allocated_casefolding_rules < 16)
8169 allocated_casefolding_rules = 16;
8171 (struct casefold_rule **)
8172 realloc (casefolding_rules,
8173 allocated_casefolding_rules * sizeof (struct casefold_rule *));
8175 casefolding_rules[num_casefolding_rules++] = new_rule;
8180 if (ferror (stream) || fclose (stream))
8182 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   unicode_casefold[ch] is the case folded form of the code point ch; the
   array is filled by redistribute_casefolding_rules.  */
unsigned int unicode_casefold[0x110000];

/* Return the single-character case folding of CH.
   A value outside the Unicode code space (CH >= 0x110000) has no table
   entry and is returned unchanged.  Callers pass code points that were read
   from data files without a range check, so this avoids reading beyond the
   end of the table.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < 0x110000))
    return ch;
  return unicode_casefold[ch];
}
8196 /* Redistribute the casefolding_rules:
8197 - Rules that map to a single character, language independently, are stored
8198 in unicode_casefold.
8199 - Other rules are merged into casing_rules. */
/* The work is done in three passes.
   1. unicode_casefold[] is reset to the identity mapping.  Then every
      casefolding rule with language == NULL and mapping[1] == 0 overrides
      one entry with its mapping[0].
      NOTE(review): the line that sets the index (presumably to
      cfrule->code) is not visible in this chunk.
   2. Every existing special casing rule gets the casefold mapping
      { to_casefold (code), 0, 0 }.
   3. Every other casefolding rule is merged into casing_rules.  Its mapping
      is copied into the casefold_mapping of each casing rule that has the
      same code and a matching language (any language when the casefolding
      rule has none).  If no casing rule with the same code, the same
      language and the context SCC_ALWAYS is found, a new rule is malloc'ed
      with that code, language and context.  Its lower, title and upper
      mappings are copied from the rule for the same code with no language
      and context SCC_ALWAYS, if there is one; otherwise they are the single
      code points returned by to_lower, to_title and to_upper.  The new rule
      is appended with add_casing_rule.
   NOTE(review): the result of malloc is not checked on the visible lines.  */
8201 redistribute_casefolding_rules (void)
8203 unsigned int ch, i, j;
8205 /* Fill unicode_casefold[]. */
8206 for (ch = 0; ch < 0x110000; ch++)
8207 unicode_casefold[ch] = ch;
8208 for (i = 0; i < num_casefolding_rules; i++)
8210 struct casefold_rule *cfrule = casefolding_rules[i];
8212 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8215 if (!(ch < 0x110000))
8217 unicode_casefold[ch] = cfrule->mapping[0];
8221 /* Extend the special casing rules by filling in their casefold_mapping[]
8223 for (j = 0; j < num_casing_rules; j++)
8225 struct special_casing_rule *rule = casing_rules[j];
8228 rule->casefold_mapping[0] = to_casefold (rule->code);
8229 for (k = 1; k < 3; k++)
8230 rule->casefold_mapping[k] = 0;
8233 /* Now merge the other casefolding rules into casing_rules. */
8234 for (i = 0; i < num_casefolding_rules; i++)
8236 struct casefold_rule *cfrule = casefolding_rules[i];
8238 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8240 /* Find a rule that applies to the same code, same language, and it
8241 has context SCC_ALWAYS. At the same time, update all rules that
8242 have the same code and same or more specific language. */
8243 struct special_casing_rule *found_rule = NULL;
8245 for (j = 0; j < num_casing_rules; j++)
8247 struct special_casing_rule *rule = casing_rules[j];
8249 if (rule->code == cfrule->code
8250 && (cfrule->language == NULL
8251 || (rule->language != NULL
8252 && strcmp (rule->language, cfrule->language) == 0)))
8254 memcpy (rule->casefold_mapping, cfrule->mapping,
8255 sizeof (rule->casefold_mapping));
8257 if ((cfrule->language == NULL
8258 ? rule->language == NULL
8259 : rule->language != NULL
8260 && strcmp (rule->language, cfrule->language) == 0)
8261 && rule->context == SCC_ALWAYS)
8269 if (found_rule == NULL)
8271 /* Create a new rule. */
8272 struct special_casing_rule *new_rule =
8273 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8275 /* Try to find a rule that applies to the same code, no language
8276 restriction, and with context SCC_ALWAYS. */
8277 for (j = 0; j < num_casing_rules; j++)
8279 struct special_casing_rule *rule = casing_rules[j];
8281 if (rule->code == cfrule->code
8282 && rule->context == SCC_ALWAYS
8283 && rule->language == NULL)
8291 new_rule->code = cfrule->code;
8292 new_rule->language = cfrule->language;
8293 new_rule->context = SCC_ALWAYS;
8294 if (found_rule != NULL)
8296 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8297 sizeof (new_rule->lower_mapping));
8298 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8299 sizeof (new_rule->title_mapping));
8300 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8301 sizeof (new_rule->upper_mapping));
8307 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8308 for (k = 1; k < 3; k++)
8309 new_rule->lower_mapping[k] = 0;
8310 new_rule->title_mapping[0] = to_title (cfrule->code);
8311 for (k = 1; k < 3; k++)
8312 new_rule->title_mapping[k] = 0;
8313 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8314 for (k = 1; k < 3; k++)
8315 new_rule->upper_mapping[k] = 0;
8317 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8318 sizeof (new_rule->casefold_mapping));
8320 add_casing_rule (new_rule);
8327 compare_casing_rules (const void *a, const void *b)
8329 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8330 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8331 unsigned int a_code = a_rule->code;
8332 unsigned int b_code = b_rule->code;
8334 if (a_code < b_code)
8336 if (a_code > b_code)
8339 /* Sort the more specific rules before the more general ones. */
8340 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8341 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8345 sort_casing_rules (void)
8347 /* Sort the rules 1. by code, 2. by specificity. */
8348 if (num_casing_rules > 1)
8349 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8350 compare_casing_rules);
8353 /* Output the special casing rules. */
/* Writes to FILENAME a gperf input file for Unicode VERSION: a banner, the
   gperf declarations (key struct special_casing_rule with slot "code", hash
   function gl_unicase_special_hash, lookup function
   gl_unicase_special_lookup) and, after the "%%" separator, one keyword line
   per entry of casing_rules, in array order.
   Each keyword is a 3-byte string: the high byte of the code, the low byte
   of the code, and the value of minor.
   NOTE(review): minor is maintained on lines that are not visible in this
   chunk; the visible test compares the code with that of the previous rule,
   so it presumably numbers consecutive rules that share a code.
   The keyword is followed by these fields:
     - 1 if the next rule has the same code, else 0,
     - the context name, preceded by "-" for a negated context,
     - the language as two characters, or two '\0' characters for none,
     - the upper, lower, title and casefold mappings, in this order, each as
       3 values printed as "0x%04X" or as 0.
   The code and every mapping value must be below 0x10000; otherwise a
   message naming the rule is printed on stderr.
   Failure to open or to write FILENAME is reported on stderr.  */
8355 output_casing_rules (const char *filename, const char *version)
8361 stream = fopen (filename, "w");
8364 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8368 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8369 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8370 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8372 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8373 fprintf (stream, "%%struct-type\n");
8374 fprintf (stream, "%%language=ANSI-C\n");
8375 fprintf (stream, "%%define slot-name code\n");
8376 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8377 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8378 fprintf (stream, "%%compare-lengths\n");
8379 fprintf (stream, "%%compare-strncmp\n");
8380 fprintf (stream, "%%readonly-tables\n");
8381 fprintf (stream, "%%omit-struct-type\n");
8382 fprintf (stream, "%%%%\n");
8385 for (i = 0; i < num_casing_rules; i++)
8387 struct special_casing_rule *rule = casing_rules[i];
8390 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8395 if (!(rule->code < 0x10000))
8397 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8401 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8402 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8404 fprintf (stream, "%d, ",
8405 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8407 context = rule->context;
8410 fprintf (stream, "-");
8411 context = - context;
8414 fprintf (stream, " ");
8418 fprintf (stream, "SCC_ALWAYS ");
8420 case SCC_FINAL_SIGMA:
8421 fprintf (stream, "SCC_FINAL_SIGMA ");
8423 case SCC_AFTER_SOFT_DOTTED:
8424 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8426 case SCC_MORE_ABOVE:
8427 fprintf (stream, "SCC_MORE_ABOVE ");
8429 case SCC_BEFORE_DOT:
8430 fprintf (stream, "SCC_BEFORE_DOT ");
8433 fprintf (stream, "SCC_AFTER_I ");
8438 fprintf (stream, ", ");
8440 if (rule->language != NULL)
8442 if (strlen (rule->language) != 2)
8444 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8447 fprintf (stream, "{ '\\0', '\\0' }, ");
8449 fprintf (stream, "{ ");
8450 for (j = 0; j < 3; j++)
8453 fprintf (stream, ", ");
8454 if (!(rule->upper_mapping[j] < 0x10000))
8456 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8459 if (rule->upper_mapping[j] != 0)
8460 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8462 fprintf (stream, " 0");
8464 fprintf (stream, " }, { ");
8465 for (j = 0; j < 3; j++)
8468 fprintf (stream, ", ");
8469 if (!(rule->lower_mapping[j] < 0x10000))
8471 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8474 if (rule->lower_mapping[j] != 0)
8475 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8477 fprintf (stream, " 0");
8479 fprintf (stream, " }, { ");
8480 for (j = 0; j < 3; j++)
8483 fprintf (stream, ", ");
8484 if (!(rule->title_mapping[j] < 0x10000))
8486 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8489 if (rule->title_mapping[j] != 0)
8490 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8492 fprintf (stream, " 0");
8494 fprintf (stream, " }, { ");
8495 for (j = 0; j < 3; j++)
8498 fprintf (stream, ", ");
8499 if (!(rule->casefold_mapping[j] < 0x10000))
8501 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8504 if (rule->casefold_mapping[j] != 0)
8505 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8507 fprintf (stream, " 0");
8509 fprintf (stream, " }\n");
8512 if (ferror (stream) || fclose (stream))
8514 fprintf (stderr, "error writing to '%s'\n", filename);
8519 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.
   Return true if CH is "cased" in this sense.  The three criteria are tested
   in the order lowercase, uppercase, titlecase letter, stopping at the first
   one that holds.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  return is_category_Lt (ch);
}
8533 /* Quoting the Unicode standard:
8534 Definition: A character is defined to be "case-ignorable" if it has the
8535 value MidLetter {or the value MidNumLet} for the Word_Break property or
8536 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8537 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8538 The text marked in braces was added in Unicode 5.1.0, see
8539 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8540 Definition of case-ignorable". */
8541 /* Since this predicate is only used for the "Before C" and "After C"
8542 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8543 This simplifies the evaluation of the regular expressions
8544 \p{cased} (\p{case-ignorable})* C
8546 C (\p{case-ignorable})* \p{cased}
8549 is_case_ignorable (unsigned int ch)
8551 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8552 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8553 || is_category_Mn (ch)
8554 || is_category_Me (ch)
8555 || is_category_Cf (ch)
8556 || is_category_Lm (ch)
8557 || is_category_Sk (ch))
8561 /* ------------------------------------------------------------------------- */
8563 /* Output all case related properties. */
/* For each property, the helper macro PROPERTY(FN,P) expands to three calls
   that all use the predicate is_P:
     - debug_output_predicate, writing "unicase/FN.txt",
     - output_predicate_test, writing "../tests/unicase/test-FN.c" with the
       tested expression "uc_is_P (c)",
     - output_predicate, writing "unicase/FN.h" with the table name
       "u_casing_property_P", the title "Casing Properties" and VERSION.
   It is applied to the predicates is_cased (file name part "cased") and
   is_case_ignorable (file name part "ignorable").  */
8565 output_casing_properties (const char *version)
8567 #define PROPERTY(FN,P) \
8568 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
8569 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
8570 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
8571 PROPERTY(cased, cased)
8572 PROPERTY(ignorable, case_ignorable)
8576 /* ========================================================================= */
/* Program entry point.
   The usage message names 14 operands: 13 input data files followed by a
   version string.  argv[1] through argv[13] are stored as the input file
   names and handed to the fill_* functions; afterwards the output_*
   functions are called with fixed relative output paths, most of them also
   receiving VERSION.
   NOTE(review): the return type, the argument-count check, the assignment
   of `version' and the return statement are not visible in this listing;
   confirm them against the complete file.  */
8579 main (int argc, char * argv[])
8581 const char *unicodedata_filename;
8582 const char *proplist_filename;
8583 const char *derivedproplist_filename;
8584 const char *scripts_filename;
8585 const char *blocks_filename;
8586 const char *proplist30_filename;
8587 const char *eastasianwidth_filename;
8588 const char *linebreak_filename;
8589 const char *wordbreakproperty_filename;
8590 const char *graphemebreakproperty_filename;
8591 const char *compositionexclusions_filename;
8592 const char *specialcasing_filename;
8593 const char *casefolding_filename;
8594 const char *version;
/* Usage message, written to stderr.
   NOTE(review): the condition guarding it and the argument matching %s are
   not visible in this listing.  */
8598 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
/* Input file names, in the same order as the usage message lists them.  */
8603 unicodedata_filename = argv[1];
8604 proplist_filename = argv[2];
8605 derivedproplist_filename = argv[3];
8606 scripts_filename = argv[4];
8607 blocks_filename = argv[5];
8608 proplist30_filename = argv[6];
8609 eastasianwidth_filename = argv[7];
8610 linebreak_filename = argv[8];
8611 wordbreakproperty_filename = argv[9];
8612 graphemebreakproperty_filename = argv[10];
8613 compositionexclusions_filename = argv[11];
8614 specialcasing_filename = argv[12];
8615 casefolding_filename = argv[13];
/* NOTE(review): `version' is passed to the output_* calls below, but its
   assignment (presumably from the 14th operand) is not visible here.  */
/* Hand each input file name to its fill_* function.  clear_properties ()
   is called before the first fill_properties (); fill_properties () is
   called twice, once for the PropList file name and once for the
   DerivedCoreProperties file name.  */
8618 fill_attributes (unicodedata_filename);
8619 clear_properties ();
8620 fill_properties (proplist_filename);
8621 fill_properties (derivedproplist_filename);
8622 fill_properties30 (proplist30_filename);
8623 fill_scripts (scripts_filename);
8624 fill_blocks (blocks_filename);
8625 fill_width (eastasianwidth_filename);
8626 fill_org_lbp (linebreak_filename);
8627 fill_org_wbp (wordbreakproperty_filename);
8628 fill_org_gbp (graphemebreakproperty_filename);
8629 fill_composition_exclusions (compositionexclusions_filename);
8630 fill_casing_rules (specialcasing_filename);
8631 fill_casefolding_rules (casefolding_filename);
/* Two argument-less calls made after all fill_* calls and before any
   output_* call.  */
8632 redistribute_casefolding_rules ();
8633 sort_casing_rules ();
/* Outputs whose paths are given here go to unictype/ and
   ../tests/unictype/; the calls that take only VERSION name no path at
   this level.  */
8635 output_categories (version);
8636 output_category ("unictype/categ_of.h", version);
8637 output_combclass ("unictype/combining.h", version);
8638 output_bidi_category ("unictype/bidi_of.h", version);
8639 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
8640 output_decimal_digit ("unictype/decdigit.h", version);
8641 output_digit_test ("../tests/unictype/test-digit.h", version);
8642 output_digit ("unictype/digit.h", version);
8643 output_numeric_test ("../tests/unictype/test-numeric.h", version);
8644 output_numeric ("unictype/numeric.h", version);
8645 output_mirror ("unictype/mirror.h", version);
8646 output_properties (version);
8647 output_scripts (version);
8648 output_scripts_byname (version);
8649 output_blocks (version);
8650 output_ident_properties (version);
8651 output_old_ctype (version);
/* Line break tables, written under unilbrk/.  The two debug_* calls take
   only a .txt path and no version.  */
8653 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
8654 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
8655 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
/* Word break tables, written under uniwbrk/.  */
8657 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
8658 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
8659 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
/* Grapheme break property: a test header under ../tests/unigbrk/ and a
   table under unigbrk/.  */
8661 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
8662 output_gbp_table ("unigbrk/gbrkprop.h", version);
/* Decomposition and composition tables, written under uninorm/.  */
8664 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
8665 debug_output_composition_tables ("uninorm/composition.txt");
8666 output_composition_tables ("uninorm/composition-table.gperf", version);
/* Simple case mappings (to_upper, to_lower, to_title, to_casefold), the
   casing rules table and the casing properties, written under unicase/
   and ../tests/unicase/.  No test file is requested for to_casefold.  */
8668 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
8669 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
8670 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
8671 output_simple_mapping ("unicase/toupper.h", to_upper, version);
8672 output_simple_mapping ("unicase/tolower.h", to_lower, version);
8673 output_simple_mapping ("unicase/totitle.h", to_title, version);
8674 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
8675 output_casing_rules ("unicase/special-casing-table.gperf", version);
8676 output_casing_properties (version);
8682 * For Emacs M-x compile
8684 * compile-command: "
8685 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
8687 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
8688 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
8689 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
8690 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
8691 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
8692 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
8693 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
8694 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
8695 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
8696 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
8697 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
8698 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
8699 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \