1 /* Generate Unicode conforming character classification tables and
2 Line Break Properties tables from a UnicodeData file.
3 Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
21 /usr/local/share/Unidata/PropList.txt \
22 /usr/local/share/Unidata/DerivedCoreProperties.txt \
23 /usr/local/share/Unidata/Scripts.txt \
24 /usr/local/share/Unidata/Blocks.txt \
25 /usr/local/share/Unidata/PropList-3.0.1.txt \
26 /usr/local/share/Unidata/EastAsianWidth.txt \
27 /usr/local/share/Unidata/LineBreak.txt \
38 /* ========================================================================= */
40 /* Reading UnicodeData.txt. */
43 /* This structure represents one line in the UnicodeData.txt file. */
/* fill_attribute populates it: the name is always a strdup'ed copy, the other
   string members are "" when the corresponding field is empty, and the three
   case mappings are NONE when their field is empty.  Elsewhere in this file a
   NULL name is used to mean "no entry was loaded for this code point". */
44 struct unicode_attribute
46 const char *name; /* Character name */
47 const char *category; /* General category */
48 const char *combining; /* Canonical combining class */
49 const char *bidi; /* Bidirectional category */
50 const char *decomposition; /* Character decomposition mapping */
51 const char *decdigit; /* Decimal digit value */
52 const char *digit; /* Digit value */
53 const char *numeric; /* Numeric value */
54 bool mirrored; /* True when the mirrored field begins with 'Y' */
55 const char *oldname; /* Old Unicode 1.0 name */
56 const char *comment; /* Comment */
57 unsigned int upper; /* Uppercase mapping */
58 unsigned int lower; /* Lowercase mapping */
59 unsigned int title; /* Titlecase mapping */
62 /* Missing fields are represented with "" for strings, and NONE for integers. */
/* All-ones sentinel stored in the unsigned int case-mapping members when the
   corresponding field is empty. */
64 #define NONE (~(unsigned int)0)
66 /* The entire contents of the UnicodeData.txt file. */
/* Indexed by code point; 0x110000 slots cover 0 .. 0x10FFFF. */
67 struct unicode_attribute unicode_attributes [0x110000];
69 /* Stores in unicode_attributes[i] the values from the given fields. */
/* I is the code point and FIELD1 .. FIELD14 are the text of the remaining
   columns of one record.  "index too large" is reported on stderr for an
   out-of-range I (the guarding condition is not visible in this view).
   Records whose category is "Cs" are ignored, as the comment below says.
   Strings are copied with strdup, except that an empty field other than the
   name is stored as the literal "".  The case mappings are parsed as
   hexadecimal numbers, or set to NONE when the field is empty.
   NOTE(review): no check of the strdup results is visible here.  Because a
   NULL name is how the rest of the file recognizes a missing entry, a failed
   strdup of the name would make the code point look unassigned. */
71 fill_attribute (unsigned int i,
72 const char *field1, const char *field2,
73 const char *field3, const char *field4,
74 const char *field5, const char *field6,
75 const char *field7, const char *field8,
76 const char *field9, const char *field10,
77 const char *field11, const char *field12,
78 const char *field13, const char *field14)
80 struct unicode_attribute * uni;
84 fprintf (stderr, "index too large\n");
87 if (strcmp (field2, "Cs") == 0)
88 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
90 uni = &unicode_attributes[i];
91 /* Copy the strings. */
92 uni->name = strdup (field1);
93 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
94 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
95 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
96 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
97 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
98 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
99 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* Only the first character of the mirrored field is inspected. */
100 uni->mirrored = (field9[0] == 'Y');
101 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
102 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* Case mappings are hexadecimal code points; empty means "no mapping". */
103 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
104 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
105 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
108 /* Maximum length of a field in the UnicodeData.txt file. */
111 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
112 Reads up to (but excluding) DELIM.
113 Returns 1 when a field was successfully read, otherwise 0. */
/* The loop stops at end-of-file or as soon as DELIM has been read, so DELIM
   is consumed from STREAM but never handled by the loop body.  The size of
   BUFFER is not passed in; the limit comes from the FIELDLEN macro. */
115 getfield (FILE *stream, char *buffer, int delim)
120 for (; (c = getc (stream)), (c != EOF && c != delim); )
122 /* The original unicode.org UnicodeData.txt file happens to have
123 CR/LF line terminators. Silently convert to LF. */
127 /* Put c into the buffer. */
/* count is incremented before the comparison, so the message below is
   printed once a field reaches FIELDLEN - 1 characters. */
128 if (++count >= FIELDLEN - 1)
130 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
143 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
/* Loads UNICODEDATA_FILENAME into unicode_attributes[].  Every name is first
   reset to NULL, so code points that never receive an entry stay
   recognizable.  Each record is read as 15 fields, field0 .. field14: the
   first 14 end at ';' and the last one at a newline.  N accumulates the
   getfield return values.  field0 is the code point in hexadecimal.  A record
   whose name ends in ", First>" opens a range; the next record is then read
   too and its name must have the form "<..., Last>".  Problems are reported
   on stderr. */
146 fill_attributes (const char *unicodedata_filename)
/* One FIELDLEN-sized buffer per column of a record. */
150 char field0[FIELDLEN];
151 char field1[FIELDLEN];
152 char field2[FIELDLEN];
153 char field3[FIELDLEN];
154 char field4[FIELDLEN];
155 char field5[FIELDLEN];
156 char field6[FIELDLEN];
157 char field7[FIELDLEN];
158 char field8[FIELDLEN];
159 char field9[FIELDLEN];
160 char field10[FIELDLEN];
161 char field11[FIELDLEN];
162 char field12[FIELDLEN];
163 char field13[FIELDLEN];
164 char field14[FIELDLEN];
/* Mark every code point as "no entry loaded". */
167 for (i = 0; i < 0x110000; i++)
168 unicode_attributes[i].name = NULL;
170 stream = fopen (unicodedata_filename, "r");
173 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* Read one record; n ends up as the number of fields read. */
182 n = getfield (stream, field0, ';');
183 n += getfield (stream, field1, ';');
184 n += getfield (stream, field2, ';');
185 n += getfield (stream, field3, ';');
186 n += getfield (stream, field4, ';');
187 n += getfield (stream, field5, ';');
188 n += getfield (stream, field6, ';');
189 n += getfield (stream, field7, ';');
190 n += getfield (stream, field8, ';');
191 n += getfield (stream, field9, ';');
192 n += getfield (stream, field10, ';');
193 n += getfield (stream, field11, ';');
194 n += getfield (stream, field12, ';');
195 n += getfield (stream, field13, ';');
196 n += getfield (stream, field14, '\n');
201 fprintf (stderr, "short line in '%s':%d\n",
202 unicodedata_filename, lineno);
/* The first column is the code point, written in hexadecimal. */
205 i = strtoul (field0, NULL, 16);
/* The length test keeps the suffix comparison inside the string. */
207 && strlen (field1) >= 9
208 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
210 /* Deal with a range. */
/* The record that closes the range overwrites the field buffers. */
212 n = getfield (stream, field0, ';');
213 n += getfield (stream, field1, ';');
214 n += getfield (stream, field2, ';');
215 n += getfield (stream, field3, ';');
216 n += getfield (stream, field4, ';');
217 n += getfield (stream, field5, ';');
218 n += getfield (stream, field6, ';');
219 n += getfield (stream, field7, ';');
220 n += getfield (stream, field8, ';');
221 n += getfield (stream, field9, ';');
222 n += getfield (stream, field10, ';');
223 n += getfield (stream, field11, ';');
224 n += getfield (stream, field12, ';');
225 n += getfield (stream, field13, ';');
226 n += getfield (stream, field14, '\n');
229 fprintf (stderr, "missing end range in '%s':%d\n",
230 unicodedata_filename, lineno);
/* The closing record must be named "<..., Last>". */
233 if (!(field1[0] == '<'
234 && strlen (field1) >= 8
235 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
237 fprintf (stderr, "missing end range in '%s':%d\n",
238 unicodedata_filename, lineno);
/* Cut off the ", Last>" suffix; field1+1 below skips the leading '<'. */
241 field1[strlen (field1) - 7] = '\0';
/* j is the code point of the closing record. */
242 j = strtoul (field0, NULL, 16);
244 fill_attribute (i, field1+1, field2, field3, field4, field5,
245 field6, field7, field8, field9, field10,
246 field11, field12, field13, field14);
250 /* Single character line */
251 fill_attribute (i, field1, field2, field3, field4, field5,
252 field6, field7, field8, field9, field10,
253 field11, field12, field13, field14);
/* A read error and a failing fclose are reported the same way. */
256 if (ferror (stream) || fclose (stream))
258 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
263 /* ========================================================================= */
265 /* General category. */
266 /* See Unicode 3.0 book, section 4.5. */
/* True when CH has a loaded entry and its category string starts with 'L'.
   CH indexes unicode_attributes directly, without a range check. */
270 is_category_L (unsigned int ch)
272 return (unicode_attributes[ch].name != NULL
273 && unicode_attributes[ch].category[0] == 'L');
/* True when CH has a loaded entry and its category string starts with "Lu". */
277 is_category_Lu (unsigned int ch)
279 return (unicode_attributes[ch].name != NULL
280 && unicode_attributes[ch].category[0] == 'L'
281 && unicode_attributes[ch].category[1] == 'u');
/* True when CH has a loaded entry and its category string starts with "Ll". */
285 is_category_Ll (unsigned int ch)
287 return (unicode_attributes[ch].name != NULL
288 && unicode_attributes[ch].category[0] == 'L'
289 && unicode_attributes[ch].category[1] == 'l');
/* True when CH has a loaded entry and its category string starts with "Lt". */
293 is_category_Lt (unsigned int ch)
295 return (unicode_attributes[ch].name != NULL
296 && unicode_attributes[ch].category[0] == 'L'
297 && unicode_attributes[ch].category[1] == 't');
/* True when CH has a loaded entry and its category string starts with "Lm". */
301 is_category_Lm (unsigned int ch)
303 return (unicode_attributes[ch].name != NULL
304 && unicode_attributes[ch].category[0] == 'L'
305 && unicode_attributes[ch].category[1] == 'm');
/* True when CH has a loaded entry and its category string starts with "Lo". */
309 is_category_Lo (unsigned int ch)
311 return (unicode_attributes[ch].name != NULL
312 && unicode_attributes[ch].category[0] == 'L'
313 && unicode_attributes[ch].category[1] == 'o');
/* True when CH has a loaded entry and its category string starts with 'M'. */
317 is_category_M (unsigned int ch)
319 return (unicode_attributes[ch].name != NULL
320 && unicode_attributes[ch].category[0] == 'M');
/* True when CH has a loaded entry and its category string starts with "Mn". */
324 is_category_Mn (unsigned int ch)
326 return (unicode_attributes[ch].name != NULL
327 && unicode_attributes[ch].category[0] == 'M'
328 && unicode_attributes[ch].category[1] == 'n');
/* True when CH has a loaded entry and its category string starts with "Mc". */
332 is_category_Mc (unsigned int ch)
334 return (unicode_attributes[ch].name != NULL
335 && unicode_attributes[ch].category[0] == 'M'
336 && unicode_attributes[ch].category[1] == 'c');
/* True when CH has a loaded entry and its category string starts with "Me". */
340 is_category_Me (unsigned int ch)
342 return (unicode_attributes[ch].name != NULL
343 && unicode_attributes[ch].category[0] == 'M'
344 && unicode_attributes[ch].category[1] == 'e');
/* True when CH has a loaded entry and its category string starts with 'N'. */
348 is_category_N (unsigned int ch)
350 return (unicode_attributes[ch].name != NULL
351 && unicode_attributes[ch].category[0] == 'N');
/* True when CH has a loaded entry and its category string starts with "Nd". */
355 is_category_Nd (unsigned int ch)
357 return (unicode_attributes[ch].name != NULL
358 && unicode_attributes[ch].category[0] == 'N'
359 && unicode_attributes[ch].category[1] == 'd');
/* True when CH has a loaded entry and its category string starts with "Nl". */
363 is_category_Nl (unsigned int ch)
365 return (unicode_attributes[ch].name != NULL
366 && unicode_attributes[ch].category[0] == 'N'
367 && unicode_attributes[ch].category[1] == 'l');
/* True when CH has a loaded entry and its category string starts with "No". */
371 is_category_No (unsigned int ch)
373 return (unicode_attributes[ch].name != NULL
374 && unicode_attributes[ch].category[0] == 'N'
375 && unicode_attributes[ch].category[1] == 'o');
/* True when CH has a loaded entry and its category string starts with 'P'. */
379 is_category_P (unsigned int ch)
381 return (unicode_attributes[ch].name != NULL
382 && unicode_attributes[ch].category[0] == 'P');
/* True when CH has a loaded entry and its category string starts with "Pc". */
386 is_category_Pc (unsigned int ch)
388 return (unicode_attributes[ch].name != NULL
389 && unicode_attributes[ch].category[0] == 'P'
390 && unicode_attributes[ch].category[1] == 'c');
/* True when CH has a loaded entry and its category string starts with "Pd". */
394 is_category_Pd (unsigned int ch)
396 return (unicode_attributes[ch].name != NULL
397 && unicode_attributes[ch].category[0] == 'P'
398 && unicode_attributes[ch].category[1] == 'd');
/* True when CH has a loaded entry and its category string starts with "Ps". */
402 is_category_Ps (unsigned int ch)
404 return (unicode_attributes[ch].name != NULL
405 && unicode_attributes[ch].category[0] == 'P'
406 && unicode_attributes[ch].category[1] == 's');
/* True when CH has a loaded entry and its category string starts with "Pe". */
410 is_category_Pe (unsigned int ch)
412 return (unicode_attributes[ch].name != NULL
413 && unicode_attributes[ch].category[0] == 'P'
414 && unicode_attributes[ch].category[1] == 'e');
/* True when CH has a loaded entry and its category string starts with "Pi". */
418 is_category_Pi (unsigned int ch)
420 return (unicode_attributes[ch].name != NULL
421 && unicode_attributes[ch].category[0] == 'P'
422 && unicode_attributes[ch].category[1] == 'i');
/* True when CH has a loaded entry and its category string starts with "Pf". */
426 is_category_Pf (unsigned int ch)
428 return (unicode_attributes[ch].name != NULL
429 && unicode_attributes[ch].category[0] == 'P'
430 && unicode_attributes[ch].category[1] == 'f');
/* True when CH has a loaded entry and its category string starts with "Po". */
434 is_category_Po (unsigned int ch)
436 return (unicode_attributes[ch].name != NULL
437 && unicode_attributes[ch].category[0] == 'P'
438 && unicode_attributes[ch].category[1] == 'o');
/* True when CH has a loaded entry and its category string starts with 'S'. */
442 is_category_S (unsigned int ch)
444 return (unicode_attributes[ch].name != NULL
445 && unicode_attributes[ch].category[0] == 'S');
/* True when CH has a loaded entry and its category string starts with "Sm". */
449 is_category_Sm (unsigned int ch)
451 return (unicode_attributes[ch].name != NULL
452 && unicode_attributes[ch].category[0] == 'S'
453 && unicode_attributes[ch].category[1] == 'm');
/* True when CH has a loaded entry and its category string starts with "Sc". */
457 is_category_Sc (unsigned int ch)
459 return (unicode_attributes[ch].name != NULL
460 && unicode_attributes[ch].category[0] == 'S'
461 && unicode_attributes[ch].category[1] == 'c');
/* True when CH has a loaded entry and its category string starts with "Sk". */
465 is_category_Sk (unsigned int ch)
467 return (unicode_attributes[ch].name != NULL
468 && unicode_attributes[ch].category[0] == 'S'
469 && unicode_attributes[ch].category[1] == 'k');
/* True when CH has a loaded entry and its category string starts with "So". */
473 is_category_So (unsigned int ch)
475 return (unicode_attributes[ch].name != NULL
476 && unicode_attributes[ch].category[0] == 'S'
477 && unicode_attributes[ch].category[1] == 'o');
/* True when CH has a loaded entry and its category string starts with 'Z'. */
481 is_category_Z (unsigned int ch)
483 return (unicode_attributes[ch].name != NULL
484 && unicode_attributes[ch].category[0] == 'Z');
/* True when CH has a loaded entry and its category string starts with "Zs". */
488 is_category_Zs (unsigned int ch)
490 return (unicode_attributes[ch].name != NULL
491 && unicode_attributes[ch].category[0] == 'Z'
492 && unicode_attributes[ch].category[1] == 's');
/* True when CH has a loaded entry and its category string starts with "Zl". */
496 is_category_Zl (unsigned int ch)
498 return (unicode_attributes[ch].name != NULL
499 && unicode_attributes[ch].category[0] == 'Z'
500 && unicode_attributes[ch].category[1] == 'l');
/* True when CH has a loaded entry and its category string starts with "Zp". */
504 is_category_Zp (unsigned int ch)
506 return (unicode_attributes[ch].name != NULL
507 && unicode_attributes[ch].category[0] == 'Z'
508 && unicode_attributes[ch].category[1] == 'p');
/* True when CH has no loaded entry, or when its category string starts with
   'C'.  Unlike the other one-letter predicates, a missing entry counts as a
   match here. */
512 is_category_C (unsigned int ch)
514 return (unicode_attributes[ch].name == NULL
515 || unicode_attributes[ch].category[0] == 'C');
/* True when CH has a loaded entry and its category string starts with "Cc". */
519 is_category_Cc (unsigned int ch)
521 return (unicode_attributes[ch].name != NULL
522 && unicode_attributes[ch].category[0] == 'C'
523 && unicode_attributes[ch].category[1] == 'c');
/* True when CH has a loaded entry and its category string starts with "Cf". */
527 is_category_Cf (unsigned int ch)
529 return (unicode_attributes[ch].name != NULL
530 && unicode_attributes[ch].category[0] == 'C'
531 && unicode_attributes[ch].category[1] == 'f');
/* True for CH in 0xD800 .. 0xDFFF.  This is decided from the code point value
   alone; unicode_attributes is not consulted. */
535 is_category_Cs (unsigned int ch)
537 return (ch >= 0xd800 && ch < 0xe000);
/* True when CH has a loaded entry and its category string starts with "Co". */
541 is_category_Co (unsigned int ch)
543 return (unicode_attributes[ch].name != NULL
544 && unicode_attributes[ch].category[0] == 'C'
545 && unicode_attributes[ch].category[1] == 'o');
/* True when CH has no loaded entry and lies outside 0xD800 .. 0xDFFF, the
   range that is_category_Cs claims. */
549 is_category_Cn (unsigned int ch)
551 return (unicode_attributes[ch].name == NULL
552 && !(ch >= 0xd800 && ch < 0xe000));
555 /* Output a boolean property in a human readable format. */
/* Writes to FILENAME the code points accepted by PREDICATE.  The variant
   under "#if 0" lists one code point per line.  The other variant extends a
   starting code point over every directly following code point that
   PREDICATE also accepts, and can print the result as a FIRST..LAST range.
   Failure to open the file, a stream error, or a failing fclose is reported
   on stderr. */
557 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
562 stream = fopen (filename, "w");
565 fprintf (stderr, "cannot open '%s' for writing\n", filename);
569 #if 0 /* This yields huge text output. */
570 for (ch = 0; ch < 0x110000; ch++)
573 fprintf (stream, "0x%04X\n", ch);
576 for (ch = 0; ch < 0x110000; ch++)
/* Remember where the run starts. */
579 unsigned int first = ch;
/* Look ahead without stepping past the last code point. */
582 while (ch + 1 < 0x110000 && predicate (ch + 1))
586 fprintf (stream, "0x%04X..0x%04X\n", first, last);
588 fprintf (stream, "0x%04X\n", ch);
592 if (ferror (stream) || fclose (stream))
594 fprintf (stderr, "error writing to '%s'\n", filename);
599 /* Output the unit test for a boolean property. */
/* Writes a generated C source file to FILENAME: a banner with a licence
   notice, an include of test-predicate-part1.h, brace-enclosed
   "first, last" pairs for runs of code points accepted by PREDICATE, a
   definition of the PREDICATE(c) macro as EXPRESSION, and an include of
   test-predicate-part2.h.
   NOTE(review): the copyright year in the emitted banner is the literal
   2007. */
601 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
607 stream = fopen (filename, "w");
610 fprintf (stderr, "cannot open '%s' for writing\n", filename);
614 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
615 fprintf (stream, "/* Test the Unicode character type functions.\n");
616 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
617 fprintf (stream, "\n");
618 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
619 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
620 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
621 fprintf (stream, " (at your option) any later version.\n");
622 fprintf (stream, "\n");
623 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
624 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
625 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
626 fprintf (stream, " GNU General Public License for more details.\n");
627 fprintf (stream, "\n");
628 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
629 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
630 fprintf (stream, "\n");
631 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
632 fprintf (stream, "\n");
635 for (ch = 0; ch < 0x110000; ch++)
/* Remember where the run starts. */
638 unsigned int first = ch;
/* Look ahead without stepping past the last code point. */
641 while (ch + 1 < 0x110000 && predicate (ch + 1))
/* Separator between two emitted pairs. */
645 fprintf (stream, ",\n");
646 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
650 fprintf (stream, "\n");
652 fprintf (stream, "\n");
653 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
654 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
656 if (ferror (stream) || fclose (stream))
658 fprintf (stderr, "error writing to '%s'\n", filename);
663 /* Construction of sparse 3-level tables. */
/* Parameters for the included header.  Code further down uses
   struct predicate_table and predicate_table_init / _add / _finalize, which
   presumably come from 3levelbit.h expanded with this TABLE name (the header
   is not part of this view).
   NOTE(review): xmalloc and xrealloc are mapped to plain malloc and realloc,
   so whatever out-of-memory handling the header expects from them is not
   provided here. */
664 #define TABLE predicate_table
665 #define xmalloc malloc
666 #define xrealloc realloc
667 #include "3levelbit.h"
669 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C initializer for an anonymous struct object called
   NAME, with members header, level1, level2 and level3.  COMMENT goes into
   the banner; VERSION is presumably what fills the "for Unicode %s" banner
   line (its argument line is not visible here).  The table is built with
   predicate_table_init / _add / _finalize and then read back from t.result.
   NOTE(review): the size arguments printed with "%5zd" are unsigned
   expressions if the size members are size_t, as the "%zu" uses suggest, and
   the uint32_t values printed with "%d" are unsigned too; confirm the
   intended formats. */
671 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
675 struct predicate_table t;
676 unsigned int level1_offset, level2_offset, level3_offset;
678 stream = fopen (filename, "w");
681 fprintf (stderr, "cannot open '%s' for writing\n", filename);
685 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
686 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
687 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
692 predicate_table_init (&t);
/* The test that decides which code points get added is not visible in this
   view; presumably it is PREDICATE. */
694 for (ch = 0; ch < 0x110000; ch++)
696 predicate_table_add (&t, ch);
698 predicate_table_finalize (&t);
700 /* Offsets in t.result, in memory of this process. */
/* Layout assumed by the three sums below: five uint32_t header words, then
   level1_size words, then level2_size << q words, then the level3 data. */
702 5 * sizeof (uint32_t);
704 5 * sizeof (uint32_t)
705 + t.level1_size * sizeof (uint32_t);
707 5 * sizeof (uint32_t)
708 + t.level1_size * sizeof (uint32_t)
709 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words are emitted as preprocessor constants. */
711 for (i = 0; i < 5; i++)
713 fprintf (stream, "#define header_%d %d\n", i,
714 ((uint32_t *) t.result)[i]);
716 fprintf (stream, "static const\n");
717 fprintf (stream, "struct\n");
718 fprintf (stream, " {\n");
719 fprintf (stream, " int header[1];\n");
720 fprintf (stream, " int level1[%zu];\n", t.level1_size);
721 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
722 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
723 fprintf (stream, " }\n");
724 fprintf (stream, "%s =\n", name);
725 fprintf (stream, "{\n");
/* Only header word 1 is stored in the emitted object. */
726 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
727 fprintf (stream, " {");
728 if (t.level1_size > 1)
729 fprintf (stream, "\n ");
730 for (i = 0; i < t.level1_size; i++)
/* NOTE(review): i % 1 is always 0, so every entry after the first starts a
   new output line. */
733 if (i > 0 && (i % 1) == 0)
734 fprintf (stream, "\n ");
735 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or an index expression is printed; the condition choosing
   between the two is not visible in this view. */
737 fprintf (stream, " %5d", -1);
739 fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
740 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
741 if (i+1 < t.level1_size)
742 fprintf (stream, ",");
744 if (t.level1_size > 1)
745 fprintf (stream, "\n ");
746 fprintf (stream, " },\n");
747 fprintf (stream, " {");
748 if (t.level2_size << t.q > 1)
749 fprintf (stream, "\n ");
750 for (i = 0; i < t.level2_size << t.q; i++)
753 if (i > 0 && (i % 1) == 0)
754 fprintf (stream, "\n ");
755 offset = ((uint32_t *) (t.result + level2_offset))[i];
757 fprintf (stream, " %5d", -1);
759 fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
760 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
761 if (i+1 < t.level2_size << t.q)
762 fprintf (stream, ",");
764 if (t.level2_size << t.q > 1)
765 fprintf (stream, "\n ");
766 fprintf (stream, " },\n");
767 fprintf (stream, " {");
768 if (t.level3_size << t.p > 4)
769 fprintf (stream, "\n ");
/* The level3 words are printed in hexadecimal, four per output line. */
770 for (i = 0; i < t.level3_size << t.p; i++)
772 if (i > 0 && (i % 4) == 0)
773 fprintf (stream, "\n ");
774 fprintf (stream, " 0x%08X",
775 ((uint32_t *) (t.result + level3_offset))[i]);
776 if (i+1 < t.level3_size << t.p)
777 fprintf (stream, ",");
779 if (t.level3_size << t.p > 4)
780 fprintf (stream, "\n ");
781 fprintf (stream, " }\n");
782 fprintf (stream, "};\n");
784 if (ferror (stream) || fclose (stream))
786 fprintf (stderr, "error writing to '%s'\n", filename);
791 /* Output all categories. */
/* VERSION is passed through to output_predicate.  The CATEGORY(C) helper
   macro produces three files for category C: a readable listing
   unictype/categ_C.txt, a unit test ../tests/unictype/test-categ_C.c and a
   table unictype/categ_C.h, all driven by is_category_C.  The macro
   invocations themselves are not visible in this view. */
793 output_categories (const char *version)
795 #define CATEGORY(C) \
796 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
797 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
798 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Every two-letter category has one
   bit of its own, and every one-letter category is the bitwise OR of its
   two-letter subcategories. */
841 UC_CATEGORY_MASK_L = 0x0000001f,
842 UC_CATEGORY_MASK_Lu = 0x00000001,
843 UC_CATEGORY_MASK_Ll = 0x00000002,
844 UC_CATEGORY_MASK_Lt = 0x00000004,
845 UC_CATEGORY_MASK_Lm = 0x00000008,
846 UC_CATEGORY_MASK_Lo = 0x00000010,
847 UC_CATEGORY_MASK_M = 0x000000e0,
848 UC_CATEGORY_MASK_Mn = 0x00000020,
849 UC_CATEGORY_MASK_Mc = 0x00000040,
850 UC_CATEGORY_MASK_Me = 0x00000080,
851 UC_CATEGORY_MASK_N = 0x00000700,
852 UC_CATEGORY_MASK_Nd = 0x00000100,
853 UC_CATEGORY_MASK_Nl = 0x00000200,
854 UC_CATEGORY_MASK_No = 0x00000400,
855 UC_CATEGORY_MASK_P = 0x0003f800,
856 UC_CATEGORY_MASK_Pc = 0x00000800,
857 UC_CATEGORY_MASK_Pd = 0x00001000,
858 UC_CATEGORY_MASK_Ps = 0x00002000,
859 UC_CATEGORY_MASK_Pe = 0x00004000,
860 UC_CATEGORY_MASK_Pi = 0x00008000,
861 UC_CATEGORY_MASK_Pf = 0x00010000,
862 UC_CATEGORY_MASK_Po = 0x00020000,
863 UC_CATEGORY_MASK_S = 0x003c0000,
864 UC_CATEGORY_MASK_Sm = 0x00040000,
865 UC_CATEGORY_MASK_Sc = 0x00080000,
866 UC_CATEGORY_MASK_Sk = 0x00100000,
867 UC_CATEGORY_MASK_So = 0x00200000,
868 UC_CATEGORY_MASK_Z = 0x01c00000,
869 UC_CATEGORY_MASK_Zs = 0x00400000,
870 UC_CATEGORY_MASK_Zl = 0x00800000,
871 UC_CATEGORY_MASK_Zp = 0x01000000,
872 UC_CATEGORY_MASK_C = 0x3e000000,
873 UC_CATEGORY_MASK_Cc = 0x02000000,
874 UC_CATEGORY_MASK_Cf = 0x04000000,
875 UC_CATEGORY_MASK_Cs = 0x08000000,
876 UC_CATEGORY_MASK_Co = 0x10000000,
877 UC_CATEGORY_MASK_Cn = 0x20000000
/* Maps a general category name of one or two characters to the matching
   UC_CATEGORY_MASK_* value.  A one-character name yields the mask of the
   whole class, a two-character name the single bit of that subcategory.
   The outer case labels and the handling of invalid names are not visible in
   this view. */
881 general_category_byname (const char *category_name)
/* Accept only names that are exactly one or two characters long. */
883 if (category_name[0] != '\0'
884 && (category_name[1] == '\0' || category_name[2] == '\0'))
885 switch (category_name[0])
888 switch (category_name[1])
890 case '\0': return UC_CATEGORY_MASK_L;
891 case 'u': return UC_CATEGORY_MASK_Lu;
892 case 'l': return UC_CATEGORY_MASK_Ll;
893 case 't': return UC_CATEGORY_MASK_Lt;
894 case 'm': return UC_CATEGORY_MASK_Lm;
895 case 'o': return UC_CATEGORY_MASK_Lo;
899 switch (category_name[1])
901 case '\0': return UC_CATEGORY_MASK_M;
902 case 'n': return UC_CATEGORY_MASK_Mn;
903 case 'c': return UC_CATEGORY_MASK_Mc;
904 case 'e': return UC_CATEGORY_MASK_Me;
908 switch (category_name[1])
910 case '\0': return UC_CATEGORY_MASK_N;
911 case 'd': return UC_CATEGORY_MASK_Nd;
912 case 'l': return UC_CATEGORY_MASK_Nl;
913 case 'o': return UC_CATEGORY_MASK_No;
917 switch (category_name[1])
919 case '\0': return UC_CATEGORY_MASK_P;
920 case 'c': return UC_CATEGORY_MASK_Pc;
921 case 'd': return UC_CATEGORY_MASK_Pd;
922 case 's': return UC_CATEGORY_MASK_Ps;
923 case 'e': return UC_CATEGORY_MASK_Pe;
924 case 'i': return UC_CATEGORY_MASK_Pi;
925 case 'f': return UC_CATEGORY_MASK_Pf;
926 case 'o': return UC_CATEGORY_MASK_Po;
930 switch (category_name[1])
932 case '\0': return UC_CATEGORY_MASK_S;
933 case 'm': return UC_CATEGORY_MASK_Sm;
934 case 'c': return UC_CATEGORY_MASK_Sc;
935 case 'k': return UC_CATEGORY_MASK_Sk;
936 case 'o': return UC_CATEGORY_MASK_So;
940 switch (category_name[1])
942 case '\0': return UC_CATEGORY_MASK_Z;
943 case 's': return UC_CATEGORY_MASK_Zs;
944 case 'l': return UC_CATEGORY_MASK_Zl;
945 case 'p': return UC_CATEGORY_MASK_Zp;
949 switch (category_name[1])
951 case '\0': return UC_CATEGORY_MASK_C;
952 case 'c': return UC_CATEGORY_MASK_Cc;
953 case 'f': return UC_CATEGORY_MASK_Cf;
954 case 's': return UC_CATEGORY_MASK_Cs;
955 case 'o': return UC_CATEGORY_MASK_Co;
956 case 'n': return UC_CATEGORY_MASK_Cn;
960 /* Invalid category name. */
964 /* Construction of sparse 3-level tables. */
/* Parameters for a table with uint8_t elements, used further down as
   struct category_table with category_table_init / _add / _finalize.  The
   header that consumes these macros is not visible in this view.  DEFAULT is
   29 because UC_CATEGORY_MASK_Cn is 0x20000000, that is 1 << 29. */
965 #define TABLE category_table
966 #define ELEMENT uint8_t
967 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
968 #define xmalloc malloc
969 #define xrealloc realloc
972 /* Output the per-character category table. */
/* Writes to FILENAME a C initializer for an object called u_category with
   members level1, level2 and level3.  For every code point a single-bit
   category mask is chosen and its bit position, the log2 of the mask, is
   stored in the table.  The level3 bytes are then repacked at 5 bits per
   entry into 16-bit units before being printed.  VERSION is presumably what
   fills the "for Unicode %s" banner line (its argument line is not visible
   here). */
974 output_category (const char *filename, const char *version)
978 struct category_table t;
979 unsigned int level1_offset, level2_offset, level3_offset;
980 uint16_t *level3_packed;
982 stream = fopen (filename, "w");
985 fprintf (stderr, "cannot open '%s' for writing\n", filename);
989 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
990 fprintf (stream, "/* Categories of Unicode characters. */\n");
991 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
996 category_table_init (&t);
998 for (ch = 0; ch < 0x110000; ch++)
1001 unsigned int log2_value;
/* Surrogates are recognized by range; other code points use the category
   string of their loaded entry.  The branch taken when no entry exists is
   not visible in this view. */
1003 if (is_category_Cs (ch))
1004 value = UC_CATEGORY_MASK_Cs;
1005 else if (unicode_attributes[ch].name != NULL)
1006 value = general_category_byname (unicode_attributes[ch].category);
1010 /* Now value should contain exactly one bit. */
1011 if (value == 0 || ((value & (value - 1)) != 0))
/* Count how many right shifts bring the single set bit down to bit 0. */
1014 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1016 category_table_add (&t, ch, log2_value);
1019 category_table_finalize (&t);
1021 /* Offsets in t.result, in memory of this process. */
/* Layout assumed by the three sums below: five uint32_t header words, then
   level1_size words, then level2_size << q words, then the level3 data. */
1023 5 * sizeof (uint32_t);
1025 5 * sizeof (uint32_t)
1026 + t.level1_size * sizeof (uint32_t);
1028 5 * sizeof (uint32_t)
1029 + t.level1_size * sizeof (uint32_t)
1030 + (t.level2_size << t.q) * sizeof (uint32_t);
1032 for (i = 0; i < 5; i++)
1033 fprintf (stream, "#define category_header_%d %d\n", i,
1034 ((uint32_t *) t.result)[i]);
1035 fprintf (stream, "static const\n");
1036 fprintf (stream, "struct\n");
1037 fprintf (stream, " {\n");
1038 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1039 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Declared size of the packed array: 5 bits per entry counted in 16-bit
   units, plus one extra unit. */
1040 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1041 (1 << t.p) * 5 / 16);
1042 fprintf (stream, " }\n");
1043 fprintf (stream, "u_category =\n");
1044 fprintf (stream, "{\n");
1045 fprintf (stream, " {");
1046 if (t.level1_size > 8)
1047 fprintf (stream, "\n ");
1048 for (i = 0; i < t.level1_size; i++)
1051 if (i > 0 && (i % 8) == 0)
1052 fprintf (stream, "\n ");
1053 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or a level2 index is printed; the condition choosing between the
   two is not visible in this view. */
1055 fprintf (stream, " %5d", -1);
1057 fprintf (stream, " %5zd",
1058 (offset - level2_offset) / sizeof (uint32_t));
1059 if (i+1 < t.level1_size)
1060 fprintf (stream, ",");
1062 if (t.level1_size > 8)
1063 fprintf (stream, "\n ");
1064 fprintf (stream, " },\n");
1065 fprintf (stream, " {");
1066 if (t.level2_size << t.q > 8)
1067 fprintf (stream, "\n ");
1068 for (i = 0; i < t.level2_size << t.q; i++)
1071 if (i > 0 && (i % 8) == 0)
1072 fprintf (stream, "\n ");
1073 offset = ((uint32_t *) (t.result + level2_offset))[i];
1075 fprintf (stream, " %5d", -1);
/* level3 indices are counted in uint8_t elements. */
1077 fprintf (stream, " %5zd",
1078 (offset - level3_offset) / sizeof (uint8_t));
1079 if (i+1 < t.level2_size << t.q)
1080 fprintf (stream, ",");
1082 if (t.level2_size << t.q > 8)
1083 fprintf (stream, "\n ");
1084 fprintf (stream, " },\n");
1085 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1086 not 32-bit units, in order to make the lookup function easier. */
/* The extra unit in the allocation keeps level3_packed[j+1] in bounds for
   the last entries.  NOTE(review): no NULL check of the calloc result is
   visible in this view. */
1089 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1090 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 5*i .. 5*i+4 of the packed stream: j is the 16-bit
   unit containing the first of those bits and k its position in that unit. */
1092 unsigned int j = (i * 5) / 16;
1093 unsigned int k = (i * 5) % 16;
1094 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Merge the entry into the 32-bit window formed by units j and j+1, then
   write both halves back. */
1095 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1096 level3_packed[j] = value & 0xffff;
1097 level3_packed[j+1] = value >> 16;
1099 fprintf (stream, " {");
1100 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1101 fprintf (stream, "\n ");
1102 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1104 if (i > 0 && (i % 8) == 0)
1105 fprintf (stream, "\n ");
1106 fprintf (stream, " 0x%04x", level3_packed[i]);
1107 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1108 fprintf (stream, ",");
1110 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " }\n");
1113 free (level3_packed);
1114 fprintf (stream, "};\n");
1116 if (ferror (stream) || fclose (stream))
1118 fprintf (stderr, "error writing to '%s'\n", filename);
1123 /* ========================================================================= */
1125 /* Canonical combining class. */
1126 /* See Unicode 3.0 book, section 4.2. */
1129 /* Construction of sparse 3-level tables. */
/* Parameters for a table with uint8_t elements, used further down as
   struct combclass_table with combclass_table_init / _add / _finalize.  The
   header that consumes these macros is not visible in this view, and neither
   is a DEFAULT value for this table. */
1130 #define TABLE combclass_table
1131 #define ELEMENT uint8_t
1133 #define xmalloc malloc
1134 #define xrealloc realloc
1137 /* Output the per-character combining class table. */
/* Writes to FILENAME a C initializer for an object called u_combclass with
   members level1, level2 and level3.  For every code point that has a loaded
   entry, the combining field is converted to an integer and added to the
   table; level3 holds one unsigned char per entry.  VERSION is presumably
   what fills the "for Unicode %s" banner line (its argument line is not
   visible here). */
1139 output_combclass (const char *filename, const char *version)
1143 struct combclass_table t;
1144 unsigned int level1_offset, level2_offset, level3_offset;
1146 stream = fopen (filename, "w");
1149 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1153 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1154 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1155 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1160 combclass_table_init (&t);
1162 for (ch = 0; ch < 0x110000; ch++)
1163 if (unicode_attributes[ch].name != NULL)
/* NOTE(review): atoi reports no errors, so an empty or non-numeric field
   silently becomes 0. */
1165 int value = atoi (unicode_attributes[ch].combining);
/* The value has to fit the uint8_t table element; what happens on failure is
   not visible in this view. */
1166 if (!(value >= 0 && value <= 255))
1168 combclass_table_add (&t, ch, value);
1171 combclass_table_finalize (&t);
1173 /* Offsets in t.result, in memory of this process. */
/* Layout assumed by the three sums below: five uint32_t header words, then
   level1_size words, then level2_size << q words, then the level3 data. */
1175 5 * sizeof (uint32_t);
1177 5 * sizeof (uint32_t)
1178 + t.level1_size * sizeof (uint32_t);
1180 5 * sizeof (uint32_t)
1181 + t.level1_size * sizeof (uint32_t)
1182 + (t.level2_size << t.q) * sizeof (uint32_t);
1184 for (i = 0; i < 5; i++)
1185 fprintf (stream, "#define combclass_header_%d %d\n", i,
1186 ((uint32_t *) t.result)[i]);
1187 fprintf (stream, "static const\n");
1188 fprintf (stream, "struct\n");
1189 fprintf (stream, " {\n");
1190 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1191 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1192 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1193 fprintf (stream, " }\n");
1194 fprintf (stream, "u_combclass =\n");
1195 fprintf (stream, "{\n");
1196 fprintf (stream, " {");
1197 if (t.level1_size > 8)
1198 fprintf (stream, "\n ");
1199 for (i = 0; i < t.level1_size; i++)
1202 if (i > 0 && (i % 8) == 0)
1203 fprintf (stream, "\n ");
1204 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or a level2 index is printed; the condition choosing between the
   two is not visible in this view. */
1206 fprintf (stream, " %5d", -1);
1208 fprintf (stream, " %5zd",
1209 (offset - level2_offset) / sizeof (uint32_t));
1210 if (i+1 < t.level1_size)
1211 fprintf (stream, ",");
1213 if (t.level1_size > 8)
1214 fprintf (stream, "\n ");
1215 fprintf (stream, " },\n");
1216 fprintf (stream, " {");
1217 if (t.level2_size << t.q > 8)
1218 fprintf (stream, "\n ");
1219 for (i = 0; i < t.level2_size << t.q; i++)
1222 if (i > 0 && (i % 8) == 0)
1223 fprintf (stream, "\n ");
1224 offset = ((uint32_t *) (t.result + level2_offset))[i];
1226 fprintf (stream, " %5d", -1);
/* level3 indices are counted in uint8_t elements. */
1228 fprintf (stream, " %5zd",
1229 (offset - level3_offset) / sizeof (uint8_t));
1230 if (i+1 < t.level2_size << t.q)
1231 fprintf (stream, ",");
1233 if (t.level2_size << t.q > 8)
1234 fprintf (stream, "\n ");
1235 fprintf (stream, " },\n");
1236 fprintf (stream, " {");
1237 if (t.level3_size << t.p > 8)
1238 fprintf (stream, "\n ");
/* The level3 bytes are printed in decimal, eight per output line. */
1239 for (i = 0; i < t.level3_size << t.p; i++)
1241 if (i > 0 && (i % 8) == 0)
1242 fprintf (stream, "\n ");
1243 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1244 if (i+1 < t.level3_size << t.p)
1245 fprintf (stream, ",");
1247 if (t.level3_size << t.p > 8)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " }\n");
1250 fprintf (stream, "};\n");
1252 if (ferror (stream) || fclose (stream))
1254 fprintf (stderr, "error writing to '%s'\n", filename);
1259 /* ========================================================================= */
1261 /* Bidirectional category. */
1262 /* See Unicode 3.0 book, section 4.3,
/* Enumerators for the 19 bidirectional categories.  Each name is UC_BIDI_
   followed by the category abbreviation spelled out in the per-line comment.
   The values are stored as uint8_t table elements and UC_BIDI_L is the
   DEFAULT of the bidi category table.
   NOTE(review): the enclosing enum header and braces are not visible in this
   listing -- confirm against the complete file. */
1267 UC_BIDI_L, /* Left-to-Right */
1268 UC_BIDI_LRE, /* Left-to-Right Embedding */
1269 UC_BIDI_LRO, /* Left-to-Right Override */
1270 UC_BIDI_R, /* Right-to-Left */
1271 UC_BIDI_AL, /* Right-to-Left Arabic */
1272 UC_BIDI_RLE, /* Right-to-Left Embedding */
1273 UC_BIDI_RLO, /* Right-to-Left Override */
1274 UC_BIDI_PDF, /* Pop Directional Format */
1275 UC_BIDI_EN, /* European Number */
1276 UC_BIDI_ES, /* European Number Separator */
1277 UC_BIDI_ET, /* European Number Terminator */
1278 UC_BIDI_AN, /* Arabic Number */
1279 UC_BIDI_CS, /* Common Number Separator */
1280 UC_BIDI_NSM, /* Non-Spacing Mark */
1281 UC_BIDI_BN, /* Boundary Neutral */
1282 UC_BIDI_B, /* Paragraph Separator */
1283 UC_BIDI_S, /* Segment Separator */
1284 UC_BIDI_WS, /* Whitespace */
1285 UC_BIDI_ON /* Other Neutral */
/* Looks up a bidi category by its abbreviated name.
   CATEGORY_NAME is examined one character at a time through nested
   switches; every visible test requires the NUL terminator at the position
   right after the characters already switched on, so a name is matched as a
   whole string rather than as a prefix.
   NOTE(review): the case labels, the return statements and the action taken
   for an invalid name are not visible in this listing.  Presumably each
   accepted name yields the matching UC_BIDI_* enumerator -- confirm against
   the complete file. */
1289 bidi_category_byname (const char *category_name)
/* The first character selects the group of candidate names. */
1291 switch (category_name[0])
/* Groups whose names are two characters long test the terminator at
   index 2. */
1294 switch (category_name[1])
1297 if (category_name[2] == '\0')
1301 if (category_name[2] == '\0')
1307 switch (category_name[1])
1312 if (category_name[2] == '\0')
1318 switch (category_name[1])
1321 if (category_name[2] == '\0')
1327 switch (category_name[1])
1330 if (category_name[2] == '\0')
1334 if (category_name[2] == '\0')
1338 if (category_name[2] == '\0')
/* Groups containing three-character names need a third switch level and
   test the terminator at index 3. */
1344 switch (category_name[1])
1349 switch (category_name[2])
1352 if (category_name[3] == '\0')
1356 if (category_name[3] == '\0')
1364 switch (category_name[1])
1367 switch (category_name[2])
1370 if (category_name[3] == '\0')
1378 switch (category_name[1])
1381 if (category_name[2] == '\0')
1387 switch (category_name[1])
1390 switch (category_name[2])
1393 if (category_name[3] == '\0')
1401 switch (category_name[1])
1406 switch (category_name[2])
1409 if (category_name[3] == '\0')
1413 if (category_name[3] == '\0')
/* Here the terminator is tested directly at index 1, i.e. the name is a
   single character. */
1421 if (category_name[1] == '\0')
1425 switch (category_name[1])
1428 if (category_name[2] == '\0')
1434 /* Invalid bidi category name. */
/* Returns the bidi category of code point CH.
   If CH has an entry in unicode_attributes (non-NULL name), the category is
   obtained from that entry's bidi field through bidi_category_byname.
   Otherwise CH is treated as unassigned and the result is chosen from the
   code point ranges tested below.
   NOTE(review): the values returned for the three range groups and for the
   remaining code points are not visible in this listing.  Presumably they
   are the defaults from DerivedBidiClass.txt (R, AL, BN, then L) -- confirm
   against the complete file. */
1439 get_bidi_category (unsigned int ch)
1441 if (unicode_attributes[ch].name != NULL)
1442 return bidi_category_byname (unicode_attributes[ch].bidi);
1445 /* The bidi category of unassigned characters depends on the range.
1446 See UTR #9 and DerivedBidiClass.txt. */
/* First group: 0590..05FF, 07FB..08FF, FB37..FB45 and 10800..10FFF. */
1447 if ((ch >= 0x0590 && ch <= 0x05FF)
1448 || (ch >= 0x07FB && ch <= 0x08FF)
1449 || (ch >= 0xFB37 && ch <= 0xFB45)
1450 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group: 0600..07BF, 2064..2069, FBB2..FDCF and FDFE..FEFE. */
1452 else if ((ch >= 0x0600 && ch <= 0x07BF)
1453 || (ch >= 0x2064 && ch <= 0x2069)
1454 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1455 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group: FDD0..FDEF, FFF0..FFFF, the last two code points of every
   plane (low 16 bits equal to FFFE or FFFF) and E0000..E0FFF. */
1457 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1458 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1459 || (ch & 0xFFFF) == 0xFFFE
1460 || (ch & 0xFFFF) == 0xFFFF
1461 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1468 /* Construction of sparse 3-level tables. */
/* Parameters of the bidi category table: TABLE names the table type
   (used as struct bidi_category_table and through bidi_category_table_init,
   _add and _finalize), ELEMENT is the stored type and DEFAULT is UC_BIDI_L.
   xmalloc and xrealloc are mapped to the plain C allocators.
   NOTE(review): presumably these macros parameterize a table template that
   is included right after them; no such #include is visible in this
   listing -- confirm. */
1469 #define TABLE bidi_category_table
1470 #define ELEMENT uint8_t
1471 #define DEFAULT UC_BIDI_L
1472 #define xmalloc malloc
1473 #define xrealloc realloc
1476 /* Output the per-character bidi category table. */
/* Writes to FILENAME a generated C fragment holding the bidi category of
   every code point below 0x110000 as a 3-level table named u_bidi_category,
   preceded by five "#define bidi_category_header_N" lines.
   FILENAME is opened for writing; open and write failures are reported on
   stderr.  VERSION is presumably the Unicode version substituted for the %s
   of the "Generated automatically" line (its argument line is not visible
   in this listing -- confirm).
   NOTE(review): lines holding only braces, short declarations, assignment
   targets and the offset test are missing from this listing.
   NOTE(review): the emitted banner names gen-ctype.c, while the usage
   comment at the top of this file invokes gen-uni-tables -- confirm which
   name is intended. */
1478 output_bidi_category (const char *filename, const char *version)
1482 struct bidi_category_table t;
1483 unsigned int level1_offset, level2_offset, level3_offset;
1484 uint16_t *level3_packed;
1486 stream = fopen (filename, "w");
1489 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1493 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1494 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1495 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table: one entry per code point, then finalize it so that
   t.result holds the serialized form read below. */
1500 bidi_category_table_init (&t);
1502 for (ch = 0; ch < 0x110000; ch++)
1504 int value = get_bidi_category (ch);
1506 bidi_category_table_add (&t, ch, value);
1509 bidi_category_table_finalize (&t);
1511 /* Offsets in t.result, in memory of this process. */
/* Layout of t.result as used here: five uint32_t header words, then
   level1_size uint32_t words, then (level2_size << q) uint32_t words, then
   the level-3 bytes.  The three expressions below are the byte offsets of
   the three levels (their assignment targets are not visible; presumably
   level1_offset, level2_offset and level3_offset in this order). */
1513 5 * sizeof (uint32_t);
1515 5 * sizeof (uint32_t)
1516 + t.level1_size * sizeof (uint32_t);
1518 5 * sizeof (uint32_t)
1519 + t.level1_size * sizeof (uint32_t)
1520 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
1522 for (i = 0; i < 5; i++)
1523 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1524 ((uint32_t *) t.result)[i]);
/* Emit the struct declaration.  level3 is declared in unsigned short units
   because the entries are packed to 5 bits each further down. */
1525 fprintf (stream, "static const\n");
1526 fprintf (stream, "struct\n");
1527 fprintf (stream, " {\n");
1528 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1529 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1530 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1531 (1 << t.p) * 5 / 16);
1532 fprintf (stream, " }\n");
1533 fprintf (stream, "u_bidi_category =\n");
1534 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  Each stored offset is printed
   either as -1 or as an element index into level2.
   NOTE(review): the test choosing between the two is not visible;
   presumably a zero offset means "no level-2 block" -- confirm.
   NOTE(review): the quotient has type size_t but is printed with %zd. */
1535 fprintf (stream, " {");
1536 if (t.level1_size > 8)
1537 fprintf (stream, "\n ");
1538 for (i = 0; i < t.level1_size; i++)
1541 if (i > 0 && (i % 8) == 0)
1542 fprintf (stream, "\n ");
1543 offset = ((uint32_t *) (t.result + level1_offset))[i];
1545 fprintf (stream, " %5d", -1);
1547 fprintf (stream, " %5zd",
1548 (offset - level2_offset) / sizeof (uint32_t));
1549 if (i+1 < t.level1_size)
1550 fprintf (stream, ",");
1552 if (t.level1_size > 8)
1553 fprintf (stream, "\n ");
1554 fprintf (stream, " },\n");
/* Level 2, same scheme; the printed index counts unpacked level-3
   elements (offset difference divided by sizeof (uint8_t)). */
1555 fprintf (stream, " {");
1556 if (t.level2_size << t.q > 8)
1557 fprintf (stream, "\n ");
1558 for (i = 0; i < t.level2_size << t.q; i++)
1561 if (i > 0 && (i % 8) == 0)
1562 fprintf (stream, "\n ");
1563 offset = ((uint32_t *) (t.result + level2_offset))[i];
1565 fprintf (stream, " %5d", -1);
1567 fprintf (stream, " %5zd",
1568 (offset - level3_offset) / sizeof (uint8_t));
1569 if (i+1 < t.level2_size << t.q)
1570 fprintf (stream, ",");
1572 if (t.level2_size << t.q > 8)
1573 fprintf (stream, "\n ");
1574 fprintf (stream, " },\n");
1575 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1576 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-initialized by calloc and has one spare unit, because
   the loop below always touches units j and j+1.
   NOTE(review): no check of the calloc result is visible. */
1579 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1580 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits i*5 .. i*5+4 of the packed stream: j is the 16-bit
   unit holding its first bit and k the bit position inside that unit.  The
   two adjacent units are combined into a 32-bit word, the entry is OR-ed in
   at bit k, and both halves are stored back. */
1582 unsigned int j = (i * 5) / 16;
1583 unsigned int k = (i * 5) % 16;
1584 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1585 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1586 level3_packed[j] = value & 0xffff;
1587 level3_packed[j+1] = value >> 16;
/* Emit the packed level-3 units as hexadecimal, eight per output line. */
1589 fprintf (stream, " {");
1590 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1591 fprintf (stream, "\n ");
1592 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1594 if (i > 0 && (i % 8) == 0)
1595 fprintf (stream, "\n ");
1596 fprintf (stream, " 0x%04x", level3_packed[i]);
1597 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1598 fprintf (stream, ",");
1600 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " }\n");
1603 free (level3_packed);
1604 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1606 if (ferror (stream) || fclose (stream))
1608 fprintf (stderr, "error writing to '%s'\n", filename);
1613 /* ========================================================================= */
1615 /* Decimal digit value. */
1616 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of code point CH: when CH has an entry in
   unicode_attributes (non-NULL name) and its decdigit field is non-empty,
   the field is converted with atoi.
   NOTE(review): the return statement of the other path is not visible in
   this listing.  The callers accept results in the range -1..9, so
   presumably -1 stands for "no decimal digit value" -- confirm. */
1619 get_decdigit_value (unsigned int ch)
1621 if (unicode_attributes[ch].name != NULL
1622 && unicode_attributes[ch].decdigit[0] != '\0')
1623 return atoi (unicode_attributes[ch].decdigit);
1627 /* Construction of sparse 3-level tables. */
/* Parameters of the digit value table: TABLE names the table type (used as
   struct decdigit_table and through decdigit_table_init, _add and
   _finalize) and ELEMENT is the stored type.  xmalloc and xrealloc are
   mapped to the plain C allocators.
   NOTE(review): no DEFAULT definition and no #include of a table template
   are visible in this listing -- confirm against the complete file. */
1628 #define TABLE decdigit_table
1629 #define ELEMENT uint8_t
1631 #define xmalloc malloc
1632 #define xrealloc realloc
1635 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a list of "{ 0xXXXX, value }" entries, separated by
   ",\n", giving the result of get_decdigit_value for code points below
   0x110000.  Open and write failures are reported on stderr.  VERSION is
   presumably the argument of the %s in the banner line (not visible here).
   NOTE(review): the condition selecting which code points are listed, the
   first-entry handling and the reaction to an out-of-range value are not
   visible in this listing -- confirm against the complete file. */
1637 output_decimal_digit_test (const char *filename, const char *version)
1643 stream = fopen (filename, "w");
1646 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1650 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1651 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1652 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1656 for (ch = 0; ch < 0x110000; ch++)
1658 int value = get_decdigit_value (ch);
/* Sanity check: a decimal digit value must lie in -1..9. */
1660 if (!(value >= -1 && value < 10))
1666 fprintf (stream, ",\n");
1667 fprintf (stream, " { 0x%04X, %d }", ch, value);
1672 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
1674 if (ferror (stream) || fclose (stream))
1676 fprintf (stderr, "error writing to '%s'\n", filename);
1681 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME a generated C fragment holding, for every code point
   below 0x110000, the value 1 + get_decdigit_value as a 3-level table named
   u_decdigit, preceded by five "#define decdigit_header_N" lines.  The
   level-3 entries are emitted packed, two 4-bit entries per byte.
   Open and write failures are reported on stderr.  VERSION is presumably
   the argument of the %s in the banner line (not visible here).
   NOTE(review): lines holding only braces, short declarations, assignment
   targets and the offset test are missing from this listing. */
1683 output_decimal_digit (const char *filename, const char *version)
1687 struct decdigit_table t;
1688 unsigned int level1_offset, level2_offset, level3_offset;
1690 stream = fopen (filename, "w");
1693 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1697 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1698 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1699 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1704 decdigit_table_init (&t);
1706 for (ch = 0; ch < 0x110000; ch++)
/* The stored value is shifted up by one, so it must lie in 0..10 and fits
   in the 4 bits used when packing. */
1708 int value = 1 + get_decdigit_value (ch);
1710 if (!(value >= 0 && value <= 10))
1713 decdigit_table_add (&t, ch, value);
1716 decdigit_table_finalize (&t);
1718 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of the three levels behind the five uint32_t header words
   (assignment targets not visible; presumably level1_offset, level2_offset
   and level3_offset in this order). */
1720 5 * sizeof (uint32_t);
1722 5 * sizeof (uint32_t)
1723 + t.level1_size * sizeof (uint32_t);
1725 5 * sizeof (uint32_t)
1726 + t.level1_size * sizeof (uint32_t)
1727 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
1729 for (i = 0; i < 5; i++)
1730 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1731 ((uint32_t *) t.result)[i]);
1732 fprintf (stream, "static const\n");
1733 fprintf (stream, "struct\n");
1734 fprintf (stream, " {\n");
1735 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1736 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the line with the shift count argument of this format is
   not visible; presumably t.p - 1, matching the packed loop below. */
1737 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1739 fprintf (stream, " }\n");
1740 fprintf (stream, "u_decdigit =\n");
1741 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each stored offset is printed
   either as -1 or as an element index into level2 (the selecting test is
   not visible in this listing). */
1742 fprintf (stream, " {");
1743 if (t.level1_size > 8)
1744 fprintf (stream, "\n ");
1745 for (i = 0; i < t.level1_size; i++)
1748 if (i > 0 && (i % 8) == 0)
1749 fprintf (stream, "\n ");
1750 offset = ((uint32_t *) (t.result + level1_offset))[i];
1752 fprintf (stream, " %5d", -1);
1754 fprintf (stream, " %5zd",
1755 (offset - level2_offset) / sizeof (uint32_t));
1756 if (i+1 < t.level1_size)
1757 fprintf (stream, ",");
1759 if (t.level1_size > 8)
1760 fprintf (stream, "\n ");
1761 fprintf (stream, " },\n");
/* Level 2, same scheme; the printed index counts unpacked level-3
   elements. */
1762 fprintf (stream, " {");
1763 if (t.level2_size << t.q > 8)
1764 fprintf (stream, "\n ");
1765 for (i = 0; i < t.level2_size << t.q; i++)
1768 if (i > 0 && (i % 8) == 0)
1769 fprintf (stream, "\n ");
1770 offset = ((uint32_t *) (t.result + level2_offset))[i];
1772 fprintf (stream, " %5d", -1);
1774 fprintf (stream, " %5zd",
1775 (offset - level3_offset) / sizeof (uint8_t));
1776 if (i+1 < t.level2_size << t.q)
1777 fprintf (stream, ",");
1779 if (t.level2_size << t.q > 8)
1780 fprintf (stream, "\n ");
1781 fprintf (stream, " },\n");
1782 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines entry 2*i in its low nibble with entry 2*i+1 in
   its high nibble, so half as many bytes as entries are written. */
1783 fprintf (stream, " {");
1784 if (t.level3_size << (t.p - 1) > 8)
1785 fprintf (stream, "\n ");
1786 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1788 if (i > 0 && (i % 8) == 0)
1789 fprintf (stream, "\n ");
1790 fprintf (stream, " 0x%02x",
1791 ((uint8_t *) (t.result + level3_offset))[2*i]
1792 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1793 if (i+1 < t.level3_size << (t.p - 1))
1794 fprintf (stream, ",");
1796 if (t.level3_size << (t.p - 1) > 8)
1797 fprintf (stream, "\n ");
1798 fprintf (stream, " }\n");
1799 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1801 if (ferror (stream) || fclose (stream))
1803 fprintf (stderr, "error writing to '%s'\n", filename);
1808 /* ========================================================================= */
1811 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of code point CH: when CH has an entry in
   unicode_attributes (non-NULL name) and its digit field is non-empty, the
   field is converted with atoi.
   NOTE(review): the return statement of the other path is not visible in
   this listing.  The callers accept results in the range -1..9, so
   presumably -1 stands for "no digit value" -- confirm. */
1814 get_digit_value (unsigned int ch)
1816 if (unicode_attributes[ch].name != NULL
1817 && unicode_attributes[ch].digit[0] != '\0')
1818 return atoi (unicode_attributes[ch].digit);
1822 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a list of "{ 0xXXXX, value }" entries, separated by
   ",\n", giving the result of get_digit_value for code points below
   0x110000.  Open and write failures are reported on stderr.  VERSION is
   presumably the argument of the %s in the banner line (not visible here).
   NOTE(review): the condition selecting which code points are listed, the
   first-entry handling and the reaction to an out-of-range value are not
   visible in this listing -- confirm against the complete file. */
1824 output_digit_test (const char *filename, const char *version)
1830 stream = fopen (filename, "w");
1833 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1837 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1838 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1839 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1843 for (ch = 0; ch < 0x110000; ch++)
1845 int value = get_digit_value (ch);
/* Sanity check: a digit value must lie in -1..9. */
1847 if (!(value >= -1 && value < 10))
1853 fprintf (stream, ",\n");
1854 fprintf (stream, " { 0x%04X, %d }", ch, value);
1859 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
1861 if (ferror (stream) || fclose (stream))
1863 fprintf (stderr, "error writing to '%s'\n", filename);
1868 /* Output the per-character digit value table. */
/* Writes to FILENAME a generated C fragment holding, for every code point
   below 0x110000, the value 1 + get_digit_value as a 3-level table named
   u_digit, preceded by five "#define digit_header_N" lines.  The table is
   built with the decdigit_table type, and the level-3 entries are emitted
   packed, two 4-bit entries per byte.
   Open and write failures are reported on stderr.  VERSION is presumably
   the argument of the %s in the banner line (not visible here).
   NOTE(review): lines holding only braces, short declarations, assignment
   targets and the offset test are missing from this listing. */
1870 output_digit (const char *filename, const char *version)
1874 struct decdigit_table t;
1875 unsigned int level1_offset, level2_offset, level3_offset;
1877 stream = fopen (filename, "w");
1880 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1884 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1885 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1886 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1891 decdigit_table_init (&t);
1893 for (ch = 0; ch < 0x110000; ch++)
/* The stored value is shifted up by one, so it must lie in 0..10 and fits
   in the 4 bits used when packing. */
1895 int value = 1 + get_digit_value (ch);
1897 if (!(value >= 0 && value <= 10))
1900 decdigit_table_add (&t, ch, value);
1903 decdigit_table_finalize (&t);
1905 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of the three levels behind the five uint32_t header words
   (assignment targets not visible; presumably level1_offset, level2_offset
   and level3_offset in this order). */
1907 5 * sizeof (uint32_t);
1909 5 * sizeof (uint32_t)
1910 + t.level1_size * sizeof (uint32_t);
1912 5 * sizeof (uint32_t)
1913 + t.level1_size * sizeof (uint32_t)
1914 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
1916 for (i = 0; i < 5; i++)
1917 fprintf (stream, "#define digit_header_%d %d\n", i,
1918 ((uint32_t *) t.result)[i]);
1919 fprintf (stream, "static const\n");
1920 fprintf (stream, "struct\n");
1921 fprintf (stream, " {\n");
1922 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1923 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the line with the shift count argument of this format is
   not visible; presumably t.p - 1, matching the packed loop below. */
1924 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1926 fprintf (stream, " }\n");
1927 fprintf (stream, "u_digit =\n");
1928 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each stored offset is printed
   either as -1 or as an element index into level2 (the selecting test is
   not visible in this listing). */
1929 fprintf (stream, " {");
1930 if (t.level1_size > 8)
1931 fprintf (stream, "\n ");
1932 for (i = 0; i < t.level1_size; i++)
1935 if (i > 0 && (i % 8) == 0)
1936 fprintf (stream, "\n ");
1937 offset = ((uint32_t *) (t.result + level1_offset))[i];
1939 fprintf (stream, " %5d", -1);
1941 fprintf (stream, " %5zd",
1942 (offset - level2_offset) / sizeof (uint32_t));
1943 if (i+1 < t.level1_size)
1944 fprintf (stream, ",");
1946 if (t.level1_size > 8)
1947 fprintf (stream, "\n ");
1948 fprintf (stream, " },\n");
/* Level 2, same scheme; the printed index counts unpacked level-3
   elements. */
1949 fprintf (stream, " {");
1950 if (t.level2_size << t.q > 8)
1951 fprintf (stream, "\n ");
1952 for (i = 0; i < t.level2_size << t.q; i++)
1955 if (i > 0 && (i % 8) == 0)
1956 fprintf (stream, "\n ");
1957 offset = ((uint32_t *) (t.result + level2_offset))[i];
1959 fprintf (stream, " %5d", -1);
1961 fprintf (stream, " %5zd",
1962 (offset - level3_offset) / sizeof (uint8_t));
1963 if (i+1 < t.level2_size << t.q)
1964 fprintf (stream, ",");
1966 if (t.level2_size << t.q > 8)
1967 fprintf (stream, "\n ");
1968 fprintf (stream, " },\n");
1969 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines entry 2*i in its low nibble with entry 2*i+1 in
   its high nibble, so half as many bytes as entries are written. */
1970 fprintf (stream, " {");
1971 if (t.level3_size << (t.p - 1) > 8)
1972 fprintf (stream, "\n ");
1973 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1975 if (i > 0 && (i % 8) == 0)
1976 fprintf (stream, "\n ");
1977 fprintf (stream, " 0x%02x",
1978 ((uint8_t *) (t.result + level3_offset))[2*i]
1979 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1980 if (i+1 < t.level3_size << (t.p - 1))
1981 fprintf (stream, ",");
1983 if (t.level3_size << (t.p - 1) > 8)
1984 fprintf (stream, "\n ");
1985 fprintf (stream, " }\n");
1986 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1988 if (ferror (stream) || fclose (stream))
1990 fprintf (stderr, "error writing to '%s'\n", filename);
1995 /* ========================================================================= */
1997 /* Numeric value. */
1998 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as the fraction numerator / denominator, both
   of type int. */
2000 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of code point CH as a fraction.
   When CH has an entry in unicode_attributes (non-NULL name) with a
   non-empty numeric field, the numerator is parsed from the start of the
   field and the denominator from the text after '/', or set to 1 when the
   field contains no '/'.  On the other path both members are set to 0.
   NOTE(review): the else keywords, braces and the final return are not
   visible in this listing; presumably { 0, 0 } means "no numeric value",
   which is how the callers test it -- confirm. */
2002 static uc_fraction_t
2003 get_numeric_value (unsigned int ch)
2005 uc_fraction_t value;
2007 if (unicode_attributes[ch].name != NULL
2008 && unicode_attributes[ch].numeric[0] != '\0')
2010 const char *str = unicode_attributes[ch].numeric;
2011 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first non-digit, so it reads only the part before a
   '/'. */
2012 value.numerator = atoi (str);
2013 if (strchr (str, '/') != NULL)
2014 value.denominator = atoi (strchr (str, '/') + 1);
2016 value.denominator = 1;
2020 value.numerator = 0;
2021 value.denominator = 0;
2026 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a list of "{ 0xXXXX, numerator, denominator }"
   entries, separated by ",\n", one for every code point below 0x110000
   whose numeric value is not { 0, 0 }.  Open and write failures are
   reported on stderr.  VERSION is presumably the argument of the %s in the
   banner line (not visible here).
   NOTE(review): the first-entry handling that decides when the ",\n"
   separator is written is not visible in this listing. */
2028 output_numeric_test (const char *filename, const char *version)
2034 stream = fopen (filename, "w");
2037 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2041 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2042 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2043 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2047 for (ch = 0; ch < 0x110000; ch++)
2049 uc_fraction_t value = get_numeric_value (ch);
/* Only code points with a numeric value are listed. */
2051 if (value.numerator != 0 || value.denominator != 0)
2054 fprintf (stream, ",\n");
2055 fprintf (stream, " { 0x%04X, %d, %d }",
2056 ch, value.numerator, value.denominator);
2061 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
2063 if (ferror (stream) || fclose (stream))
2065 fprintf (stderr, "error writing to '%s'\n", filename);
2070 /* Construction of sparse 3-level tables. */
/* Parameters of the numeric value table: TABLE names the table type (used
   as struct numeric_table and through numeric_table_init, _add and
   _finalize) and ELEMENT is the stored type, here an index into the list of
   occurring fractions.  xmalloc and xrealloc are mapped to the plain C
   allocators.
   NOTE(review): no DEFAULT definition and no #include of a table template
   are visible in this listing -- confirm against the complete file. */
2071 #define TABLE numeric_table
2072 #define ELEMENT uint8_t
2074 #define xmalloc malloc
2075 #define xrealloc realloc
2078 /* Output the per-character numeric value table. */
/* Writes to FILENAME a generated C fragment made of two parts:
   - u_numeric_values, the list of distinct fractions returned by
     get_numeric_value over all code points below 0x110000;
   - u_numeric, a 3-level table mapping each code point to the index of its
     fraction in that list, preceded by five "#define numeric_header_N"
     lines and with level-3 entries packed to 7 bits each.
   Open and write failures are reported on stderr.  VERSION is presumably
   the argument of the %s in the banner line (not visible here).
   NOTE(review): lines holding only braces, short statements, assignment
   targets and the offset test are missing from this listing. */
2080 output_numeric (const char *filename, const char *version)
/* At most 128 distinct fractions are supported, matching the 7-bit
   indices. */
2083 uc_fraction_t fractions[128];
2084 unsigned int nfractions;
2085 unsigned int ch, i, j;
2086 struct numeric_table t;
2087 unsigned int level1_offset, level2_offset, level3_offset;
2088 uint16_t *level3_packed;
2090 stream = fopen (filename, "w");
2093 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2097 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2098 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2099 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2102 /* Create table of occurring fractions. */
2104 for (ch = 0; ch < 0x110000; ch++)
2106 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for VALUE among the fractions collected so far. */
2108 for (i = 0; i < nfractions; i++)
2109 if (value.numerator == fractions[i].numerator
2110 && value.denominator == fractions[i].denominator)
/* Not found: insert it, keeping the list sorted by denominator and then by
   numerator.  The capacity test guards the fixed-size array (its action is
   not visible in this listing). */
2112 if (i == nfractions)
2114 if (nfractions == 128)
/* Find the first element that sorts after VALUE ... */
2116 for (i = 0; i < nfractions; i++)
2117 if (value.denominator < fractions[i].denominator
2118 || (value.denominator == fractions[i].denominator
2119 && value.numerator < fractions[i].numerator))
/* ... shift the tail up by one slot and store VALUE at position i. */
2121 for (j = nfractions; j > i; j--)
2122 fractions[j] = fractions[j - 1];
2123 fractions[i] = value;
/* Emit the sorted list of fractions. */
2128 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2130 fprintf (stream, "{\n");
2131 for (i = 0; i < nfractions; i++)
2133 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2134 fractions[i].denominator);
2135 if (i+1 < nfractions)
2136 fprintf (stream, ",");
2137 fprintf (stream, "\n");
2139 fprintf (stream, "};\n");
/* Build the table mapping each code point to the index of its fraction. */
2143 numeric_table_init (&t);
2145 for (ch = 0; ch < 0x110000; ch++)
2147 uc_fraction_t value = get_numeric_value (ch);
2149 for (i = 0; i < nfractions; i++)
2150 if (value.numerator == fractions[i].numerator
2151 && value.denominator == fractions[i].denominator)
/* Every value was collected above, so a failed lookup would be an internal
   error (the action taken is not visible in this listing). */
2153 if (i == nfractions)
2156 numeric_table_add (&t, ch, i);
2159 numeric_table_finalize (&t);
2161 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of the three levels behind the five uint32_t header words
   (assignment targets not visible; presumably level1_offset, level2_offset
   and level3_offset in this order). */
2163 5 * sizeof (uint32_t);
2165 5 * sizeof (uint32_t)
2166 + t.level1_size * sizeof (uint32_t);
2168 5 * sizeof (uint32_t)
2169 + t.level1_size * sizeof (uint32_t)
2170 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
2172 for (i = 0; i < 5; i++)
2173 fprintf (stream, "#define numeric_header_%d %d\n", i,
2174 ((uint32_t *) t.result)[i]);
2175 fprintf (stream, "static const\n");
2176 fprintf (stream, "struct\n");
2177 fprintf (stream, " {\n");
2178 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2179 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2180 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2181 (1 << t.p) * 7 / 16);
2182 fprintf (stream, " }\n");
2183 fprintf (stream, "u_numeric =\n");
2184 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each stored offset is printed
   either as -1 or as an element index into level2 (the selecting test is
   not visible in this listing). */
2185 fprintf (stream, " {");
2186 if (t.level1_size > 8)
2187 fprintf (stream, "\n ");
2188 for (i = 0; i < t.level1_size; i++)
2191 if (i > 0 && (i % 8) == 0)
2192 fprintf (stream, "\n ");
2193 offset = ((uint32_t *) (t.result + level1_offset))[i];
2195 fprintf (stream, " %5d", -1);
2197 fprintf (stream, " %5zd",
2198 (offset - level2_offset) / sizeof (uint32_t));
2199 if (i+1 < t.level1_size)
2200 fprintf (stream, ",");
2202 if (t.level1_size > 8)
2203 fprintf (stream, "\n ");
2204 fprintf (stream, " },\n");
/* Level 2, same scheme; the printed index counts unpacked level-3
   elements. */
2205 fprintf (stream, " {");
2206 if (t.level2_size << t.q > 8)
2207 fprintf (stream, "\n ");
2208 for (i = 0; i < t.level2_size << t.q; i++)
2211 if (i > 0 && (i % 8) == 0)
2212 fprintf (stream, "\n ");
2213 offset = ((uint32_t *) (t.result + level2_offset))[i];
2215 fprintf (stream, " %5d", -1);
2217 fprintf (stream, " %5zd",
2218 (offset - level3_offset) / sizeof (uint8_t));
2219 if (i+1 < t.level2_size << t.q)
2220 fprintf (stream, ",");
2222 if (t.level2_size << t.q > 8)
2223 fprintf (stream, "\n ");
2224 fprintf (stream, " },\n");
2225 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2226 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-initialized by calloc and has one spare unit, because
   the loop below always touches units j and j+1.
   NOTE(review): no check of the calloc result is visible. */
2229 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2230 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits i*7 .. i*7+6 of the packed stream: j is the 16-bit
   unit holding its first bit and k the bit position inside that unit.  The
   two adjacent units are combined into a 32-bit word, the entry is OR-ed in
   at bit k, and both halves are stored back. */
2232 unsigned int j = (i * 7) / 16;
2233 unsigned int k = (i * 7) % 16;
2234 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2235 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2236 level3_packed[j] = value & 0xffff;
2237 level3_packed[j+1] = value >> 16;
/* Emit the packed level-3 units as hexadecimal, eight per output line. */
2239 fprintf (stream, " {");
2240 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2241 fprintf (stream, "\n ");
2242 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2244 if (i > 0 && (i % 8) == 0)
2245 fprintf (stream, "\n ");
2246 fprintf (stream, " 0x%04x", level3_packed[i]);
2247 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2248 fprintf (stream, ",");
2250 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " }\n");
2253 free (level3_packed);
2254 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
2256 if (ferror (stream) || fclose (stream))
2258 fprintf (stderr, "error writing to '%s'\n", filename);
2263 /* ========================================================================= */
2266 /* See Unicode 3.0 book, section 4.7,
2269 /* List of mirrored character pairs. This is a subset of the characters
2270 having the BidiMirrored property. */
/* Each row holds the two code points of one pair; get_mirror_value searches
   the rows in both directions, so a pair needs to be listed only once.
   NOTE(review): the initializer rows are not visible in this listing. */
2271 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information of code point CH.
   MIRRORED records whether CH has an entry in unicode_attributes with the
   mirrored flag set.  The partner of CH is looked up in mirror_pairs in
   either column; 0xfffd is the placeholder used when CH is in no pair.
   The visible return yields the signed distance from CH to its partner.
   NOTE(review): the condition guarding that return, the handling of a pair
   found for a non-mirrored character and the other return value are not
   visible in this listing -- confirm against the complete file. */
2328 get_mirror_value (unsigned int ch)
2331 unsigned int mirror_char;
2334 mirrored = (unicode_attributes[ch].name != NULL
2335 && unicode_attributes[ch].mirrored);
2336 mirror_char = 0xfffd;
/* Search both columns of the pair list for CH. */
2337 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2338 if (ch == mirror_pairs[i][0])
2340 mirror_char = mirror_pairs[i][1];
2343 else if (ch == mirror_pairs[i][1])
2345 mirror_char = mirror_pairs[i][0];
2349 return (int) mirror_char - (int) ch;
2352 if (mirror_char != 0xfffd)
2358 /* Construction of sparse 3-level tables. */
/* Parameters of the mirror table: TABLE names the table type (used as
   struct mirror_table and through mirror_table_init, _add and _finalize)
   and ELEMENT is int32_t, because the stored values are signed code point
   differences.  xmalloc and xrealloc are mapped to the plain C allocators.
   NOTE(review): no DEFAULT definition and no #include of a table template
   are visible in this listing -- confirm against the complete file. */
2359 #define TABLE mirror_table
2360 #define ELEMENT int32_t
2362 #define xmalloc malloc
2363 #define xrealloc realloc
2366 /* Output the per-character mirror table. */
/* Writes to FILENAME a generated C fragment holding the result of
   get_mirror_value for every code point below 0x110000 as a 3-level table
   named u_mirror, preceded by five "#define mirror_header_N" lines.  The
   level-3 entries are emitted unpacked, as int values.
   Open and write failures are reported on stderr.  VERSION is presumably
   the argument of the %s in the banner line (not visible here).
   NOTE(review): lines holding only braces, short declarations, assignment
   targets and the offset test are missing from this listing. */
2368 output_mirror (const char *filename, const char *version)
2372 struct mirror_table t;
2373 unsigned int level1_offset, level2_offset, level3_offset;
2375 stream = fopen (filename, "w");
2378 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2382 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2383 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2384 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2389 mirror_table_init (&t);
2391 for (ch = 0; ch < 0x110000; ch++)
2393 int value = get_mirror_value (ch);
2395 mirror_table_add (&t, ch, value);
2398 mirror_table_finalize (&t);
2400 /* Offsets in t.result, in memory of this process. */
/* Byte offsets of the three levels behind the five uint32_t header words
   (assignment targets not visible; presumably level1_offset, level2_offset
   and level3_offset in this order). */
2402 5 * sizeof (uint32_t);
2404 5 * sizeof (uint32_t)
2405 + t.level1_size * sizeof (uint32_t);
2407 5 * sizeof (uint32_t)
2408 + t.level1_size * sizeof (uint32_t)
2409 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
2411 for (i = 0; i < 5; i++)
2412 fprintf (stream, "#define mirror_header_%d %d\n", i,
2413 ((uint32_t *) t.result)[i]);
2414 fprintf (stream, "static const\n");
2415 fprintf (stream, "struct\n");
2416 fprintf (stream, " {\n");
2417 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2418 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2419 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2420 fprintf (stream, " }\n");
2421 fprintf (stream, "u_mirror =\n");
2422 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each stored offset is printed
   either as -1 or as an element index into level2 (the selecting test is
   not visible in this listing). */
2423 fprintf (stream, " {");
2424 if (t.level1_size > 8)
2425 fprintf (stream, "\n ");
2426 for (i = 0; i < t.level1_size; i++)
2429 if (i > 0 && (i % 8) == 0)
2430 fprintf (stream, "\n ");
2431 offset = ((uint32_t *) (t.result + level1_offset))[i];
2433 fprintf (stream, " %5d", -1);
2435 fprintf (stream, " %5zd",
2436 (offset - level2_offset) / sizeof (uint32_t));
2437 if (i+1 < t.level1_size)
2438 fprintf (stream, ",");
2440 if (t.level1_size > 8)
2441 fprintf (stream, "\n ");
2442 fprintf (stream, " },\n");
/* Level 2, same scheme; here the index is in int32_t elements because the
   level-3 entries are four bytes wide. */
2443 fprintf (stream, " {");
2444 if (t.level2_size << t.q > 8)
2445 fprintf (stream, "\n ");
2446 for (i = 0; i < t.level2_size << t.q; i++)
2449 if (i > 0 && (i % 8) == 0)
2450 fprintf (stream, "\n ");
2451 offset = ((uint32_t *) (t.result + level2_offset))[i];
2453 fprintf (stream, " %5d", -1);
2455 fprintf (stream, " %5zd",
2456 (offset - level3_offset) / sizeof (int32_t));
2457 if (i+1 < t.level2_size << t.q)
2458 fprintf (stream, ",");
2460 if (t.level2_size << t.q > 8)
2461 fprintf (stream, "\n ");
2462 fprintf (stream, " },\n");
/* Level 3: the signed values themselves, eight per output line. */
2463 fprintf (stream, " {");
2464 if (t.level3_size << t.p > 8)
2465 fprintf (stream, "\n ");
2466 for (i = 0; i < t.level3_size << t.p; i++)
2468 if (i > 0 && (i % 8) == 0)
2469 fprintf (stream, "\n ");
2470 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2471 if (i+1 < t.level3_size << t.p)
2472 fprintf (stream, ",");
2474 if (t.level3_size << t.p > 8)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " }\n");
2477 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
2479 if (ferror (stream) || fclose (stream))
2481 fprintf (stderr, "error writing to '%s'\n", filename);
2486 /* ========================================================================= */
2490 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Identifiers of the character properties read by fill_properties.  Each
   value is used as a bit position: a property is recorded for a code point
   by setting bit (1ULL << value) in its unicode_properties entry.
   NOTE(review): the enum header and several enumerators that
   fill_properties refers to (for example PROP_WHITE_SPACE and PROP_DASH)
   are not visible in this listing -- confirm against the complete file. */
2499 PROP_QUOTATION_MARK,
2500 PROP_TERMINAL_PUNCTUATION,
2503 PROP_ASCII_HEX_DIGIT,
2504 PROP_OTHER_ALPHABETIC,
2508 PROP_OTHER_LOWERCASE,
2509 PROP_OTHER_UPPERCASE,
2510 PROP_NONCHARACTER_CODE_POINT,
2511 PROP_OTHER_GRAPHEME_EXTEND,
2512 PROP_IDS_BINARY_OPERATOR,
2513 PROP_IDS_TRINARY_OPERATOR,
2515 PROP_UNIFIED_IDEOGRAPH,
2516 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2519 PROP_LOGICAL_ORDER_EXCEPTION,
2520 PROP_OTHER_ID_START,
2521 PROP_OTHER_ID_CONTINUE,
2523 PROP_VARIATION_SELECTOR,
2524 PROP_PATTERN_WHITE_SPACE,
2525 PROP_PATTERN_SYNTAX,
2526 /* DerivedCoreProperties.txt */
2535 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2536 PROP_GRAPHEME_EXTEND,
/* One bit set per code point (index 0 .. 0x10FFFF): bit N is set when the
   property with identifier N was read for that code point by
   fill_properties. */
2540 unsigned long long unicode_properties[0x110000];
/* Resets unicode_properties so that no code point has any property set. */
2543 clear_properties (void)
2547 for (i = 0; i < 0x110000; i++)
2548 unicode_properties[i] = 0;
2551 /* Stores in unicode_properties[] the properties from the
2552 PropList.txt or DerivedCoreProperties.txt file. */
/* Reads PROPLIST_FILENAME line by line and, for every entry of the form
   "XXXX..YYYY ; Name" or "XXXX ; Name", sets the bit of the named property
   in unicode_properties for each code point of the range.  Bits are only
   added, never cleared.  Empty lines and lines starting with '#' are
   recognized separately.  Open, parse, unknown-property and read failures
   are reported on stderr.
   NOTE(review): the loop header, the buffer declaration, the handling of
   the single code point form and the actions following the error messages
   are not visible in this listing -- confirm against the complete file. */
2554 fill_properties (const char *proplist_filename)
2559 stream = fopen (proplist_filename, "r");
2562 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2569 unsigned int i1, i2;
2570 char padding[200+1];
2571 char propname[200+1];
2572 unsigned int propvalue;
/* Read one line of at most 200 characters; a result below 1 means no line
   could be read. */
2574 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2577 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The %[ ;]
   and %[^ ] conversions carry no field width; they stay within the
   200+1 byte buffers because the line itself is limited to 200 characters
   by the fscanf above. */
2580 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2582 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2584 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line expands to one link of an if / else-if chain comparing
   propname with a property name; the chain ends in the unknown-property
   report below. */
2589 #define PROP(name,value) \
2590 if (strcmp (propname, name) == 0) propvalue = value; else
2592 PROP ("White_Space", PROP_WHITE_SPACE)
2593 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2594 PROP ("Join_Control", PROP_JOIN_CONTROL)
2595 PROP ("Dash", PROP_DASH)
2596 PROP ("Hyphen", PROP_HYPHEN)
2597 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2598 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2599 PROP ("Other_Math", PROP_OTHER_MATH)
2600 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2601 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2602 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2603 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2604 PROP ("Diacritic", PROP_DIACRITIC)
2605 PROP ("Extender", PROP_EXTENDER)
2606 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2607 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2608 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2609 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2610 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2611 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2612 PROP ("Radical", PROP_RADICAL)
2613 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2614 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2615 PROP ("Deprecated", PROP_DEPRECATED)
2616 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2617 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2618 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2619 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2620 PROP ("STerm", PROP_STERM)
2621 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2622 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2623 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2624 /* DerivedCoreProperties.txt */
2625 PROP ("Math", PROP_MATH)
2626 PROP ("Alphabetic", PROP_ALPHABETIC)
2627 PROP ("Lowercase", PROP_LOWERCASE)
2628 PROP ("Uppercase", PROP_UPPERCASE)
2629 PROP ("ID_Start", PROP_ID_START)
2630 PROP ("ID_Continue", PROP_ID_CONTINUE)
2631 PROP ("XID_Start", PROP_XID_START)
2632 PROP ("XID_Continue", PROP_XID_CONTINUE)
2633 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2634 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2635 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2636 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2639 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must end below 0x110000 before it is used
   to index unicode_properties. */
2643 if (!(i1 <= i2 && i2 < 0x110000))
2646 for (i = i1; i <= i2; i++)
2647 unicode_properties[i] |= 1ULL << propvalue;
/* A stream error or a failing fclose is reported as a read error. */
2650 if (ferror (stream) || fclose (stream))
2652 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2657 /* Stores in array the given property from the Unicode 3.0 PropList.txt
/* Reads one property from PROPLIST_FILENAME into ARRAY, which has one
   element per code point below 0x110000.  Lines are skipped until one
   contains PROPERTY_NAME; the following lines are then parsed either as a
   range "XXXX..YYYY" or as a single code point "XXXX", each written with
   four hexadecimal digits.  Open, missing-property, parse and read failures
   are reported on stderr.
   NOTE(review): the bodies of the two loops over ARRAY, the condition that
   ends the property, the handling of the single code point form and the
   actions following the error messages are not visible in this listing.
   Presumably ARRAY is cleared first and the elements of each range are then
   set -- confirm against the complete file. */
2660 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2666 for (i = 0; i < 0x110000; i++)
2669 stream = fopen (proplist_filename, "r");
2672 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2676 /* Search for the "Property dump for: ..." line. */
2679 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2681 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2685 while (strstr (buf, property_name) == NULL);
2689 unsigned int i1, i2;
2691 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line of at least 10 characters with ".." at positions 4 and 5 is a
   range; a shorter line of at least 4 characters is a single code point. */
2695 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2697 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2699 fprintf (stderr, "parse error in property in '%s'\n",
2704 else if (strlen (buf) >= 4)
2706 if (sscanf (buf, "%4X", &i1) < 1)
2708 fprintf (stderr, "parse error in property in '%s'\n",
2716 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and must end below 0x110000 before it is used
   to index ARRAY. */
2720 if (!(i1 <= i2 && i2 < 0x110000))
2722 for (i = i1; i <= i2; i++)
/* A stream error or a failing fclose is reported as a read error. */
2725 if (ferror (stream) || fclose (stream))
2727 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2732 /* Properties from Unicode 3.0 PropList.txt file. */
2734 /* The paired punctuation property from the PropList.txt file. */
2735 char unicode_pairedpunctuation[0x110000];
2737 /* The left of pair property from the PropList.txt file. */
2738 char unicode_leftofpair[0x110000];
2741 fill_properties30 (const char *proplist30_filename)
2743 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2744 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2747 /* ------------------------------------------------------------------------- */
2749 /* See PropList.txt, UCD.html. */
2751 is_property_white_space (unsigned int ch)
2753 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2756 /* See Unicode 3.0 book, section 4.10,
2757 PropList.txt, UCD.html,
2758 DerivedCoreProperties.txt, UCD.html. */
2760 is_property_alphabetic (unsigned int ch)
2764 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2765 /* For some reason, the following are listed as having property
2766 Alphabetic but not as having property Other_Alphabetic. */
2767 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2768 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2769 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2770 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2771 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2772 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2773 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2774 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2775 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2776 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2777 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2778 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2780 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2782 if (result1 != result2)
2787 /* See PropList.txt, UCD.html. */
2789 is_property_other_alphabetic (unsigned int ch)
2791 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2794 /* See PropList.txt, UCD.html. */
2796 is_property_not_a_character (unsigned int ch)
2798 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2801 /* See PropList.txt, UCD.html,
2802 DerivedCoreProperties.txt, UCD.html. */
2804 is_property_default_ignorable_code_point (unsigned int ch)
2807 (is_category_Cf (ch)
2808 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2809 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2810 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2811 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2813 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2815 if (result1 != result2)
2820 /* See PropList.txt, UCD.html. */
2822 is_property_other_default_ignorable_code_point (unsigned int ch)
2824 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2827 /* See PropList.txt, UCD.html. */
2829 is_property_deprecated (unsigned int ch)
2831 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2834 /* See PropList.txt, UCD.html. */
2836 is_property_logical_order_exception (unsigned int ch)
2838 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2841 /* See PropList.txt, UCD.html. */
2843 is_property_variation_selector (unsigned int ch)
2845 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Reports whether CH lies in one of the three hard-coded private use
   ranges.  See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* True when is_category_Cn accepts CH and is_property_not_a_character
   rejects it.  See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2865 /* See PropList.txt, UCD.html,
2866 DerivedCoreProperties.txt, UCD.html. */
2868 is_property_uppercase (unsigned int ch)
2872 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2874 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2876 if (result1 != result2)
2881 /* See PropList.txt, UCD.html. */
2883 is_property_other_uppercase (unsigned int ch)
2885 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2888 /* See PropList.txt, UCD.html,
2889 DerivedCoreProperties.txt, UCD.html. */
2891 is_property_lowercase (unsigned int ch)
2895 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2897 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2899 if (result1 != result2)
2904 /* See PropList.txt, UCD.html. */
2906 is_property_other_lowercase (unsigned int ch)
2908 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Delegates to is_category_Lt.  See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  bool in_category = is_category_Lt (ch);

  return in_category;
}
2918 /* See PropList.txt, UCD.html. */
2920 is_property_soft_dotted (unsigned int ch)
2922 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2925 /* See DerivedCoreProperties.txt, UCD.html. */
2927 is_property_id_start (unsigned int ch)
2929 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2932 /* See PropList.txt, UCD.html. */
2934 is_property_other_id_start (unsigned int ch)
2936 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2939 /* See DerivedCoreProperties.txt, UCD.html. */
2941 is_property_id_continue (unsigned int ch)
2943 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2946 /* See PropList.txt, UCD.html. */
2948 is_property_other_id_continue (unsigned int ch)
2950 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2953 /* See DerivedCoreProperties.txt, UCD.html. */
2955 is_property_xid_start (unsigned int ch)
2957 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2960 /* See DerivedCoreProperties.txt, UCD.html. */
2962 is_property_xid_continue (unsigned int ch)
2964 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2967 /* See PropList.txt, UCD.html. */
2969 is_property_pattern_white_space (unsigned int ch)
2971 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2974 /* See PropList.txt, UCD.html. */
2976 is_property_pattern_syntax (unsigned int ch)
2978 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2981 /* See PropList.txt, UCD.html. */
2983 is_property_join_control (unsigned int ch)
2985 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2988 /* See DerivedCoreProperties.txt, UCD.html. */
2990 is_property_grapheme_base (unsigned int ch)
2992 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2995 /* See DerivedCoreProperties.txt, UCD.html. */
2997 is_property_grapheme_extend (unsigned int ch)
2999 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3002 /* See PropList.txt, UCD.html. */
3004 is_property_other_grapheme_extend (unsigned int ch)
3006 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3009 /* See DerivedCoreProperties.txt, UCD.html. */
3011 is_property_grapheme_link (unsigned int ch)
3013 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3016 /* See PropList.txt, UCD.html. */
3018 is_property_bidi_control (unsigned int ch)
3020 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3023 /* See PropList-3.0.1.txt. */
3025 is_property_bidi_left_to_right (unsigned int ch)
3027 return (get_bidi_category (ch) == UC_BIDI_L);
3030 /* See PropList-3.0.1.txt. */
3032 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3034 return (get_bidi_category (ch) == UC_BIDI_R);
3037 /* See PropList-3.0.1.txt. */
3039 is_property_bidi_arabic_right_to_left (unsigned int ch)
3041 return (get_bidi_category (ch) == UC_BIDI_AL);
3044 /* See PropList-3.0.1.txt. */
3046 is_property_bidi_european_digit (unsigned int ch)
3048 return (get_bidi_category (ch) == UC_BIDI_EN);
3051 /* See PropList-3.0.1.txt. */
3053 is_property_bidi_eur_num_separator (unsigned int ch)
3055 return (get_bidi_category (ch) == UC_BIDI_ES);
3058 /* See PropList-3.0.1.txt. */
3060 is_property_bidi_eur_num_terminator (unsigned int ch)
3062 return (get_bidi_category (ch) == UC_BIDI_ET);
3065 /* See PropList-3.0.1.txt. */
3067 is_property_bidi_arabic_digit (unsigned int ch)
3069 return (get_bidi_category (ch) == UC_BIDI_AN);
3072 /* See PropList-3.0.1.txt. */
3074 is_property_bidi_common_separator (unsigned int ch)
3076 return (get_bidi_category (ch) == UC_BIDI_CS);
3079 /* See PropList-3.0.1.txt. */
3081 is_property_bidi_block_separator (unsigned int ch)
3083 return (get_bidi_category (ch) == UC_BIDI_B);
3086 /* See PropList-3.0.1.txt. */
3088 is_property_bidi_segment_separator (unsigned int ch)
3090 return (get_bidi_category (ch) == UC_BIDI_S);
3093 /* See PropList-3.0.1.txt. */
3095 is_property_bidi_whitespace (unsigned int ch)
3097 return (get_bidi_category (ch) == UC_BIDI_WS);
3100 /* See PropList-3.0.1.txt. */
3102 is_property_bidi_non_spacing_mark (unsigned int ch)
3104 return (get_bidi_category (ch) == UC_BIDI_NSM);
3107 /* See PropList-3.0.1.txt. */
3109 is_property_bidi_boundary_neutral (unsigned int ch)
3111 return (get_bidi_category (ch) == UC_BIDI_BN);
3114 /* See PropList-3.0.1.txt. */
3116 is_property_bidi_pdf (unsigned int ch)
3118 return (get_bidi_category (ch) == UC_BIDI_PDF);
3121 /* See PropList-3.0.1.txt. */
3123 is_property_bidi_embedding_or_override (unsigned int ch)
3125 int category = get_bidi_category (ch);
3126 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3127 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3130 /* See PropList-3.0.1.txt. */
3132 is_property_bidi_other_neutral (unsigned int ch)
3134 return (get_bidi_category (ch) == UC_BIDI_ON);
3137 /* See PropList.txt, UCD.html. */
3139 is_property_hex_digit (unsigned int ch)
3141 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3144 /* See PropList.txt, UCD.html. */
3146 is_property_ascii_hex_digit (unsigned int ch)
3148 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3151 /* See Unicode 3.0 book, section 4.10,
3152 PropList.txt, UCD.html. */
3154 is_property_ideographic (unsigned int ch)
3156 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3159 /* See PropList.txt, UCD.html. */
3161 is_property_unified_ideograph (unsigned int ch)
3163 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3166 /* See PropList.txt, UCD.html. */
3168 is_property_radical (unsigned int ch)
3170 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3173 /* See PropList.txt, UCD.html. */
3175 is_property_ids_binary_operator (unsigned int ch)
3177 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3180 /* See PropList.txt, UCD.html. */
3182 is_property_ids_trinary_operator (unsigned int ch)
3184 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3187 /* See PropList-3.0.1.txt. */
3189 is_property_zero_width (unsigned int ch)
3191 return is_category_Cf (ch)
3192 || (unicode_attributes[ch].name != NULL
3193 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Delegates to is_category_Zs.  See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
/* Reports whether CH is one of a fixed list of sixteen code points.
   See PropList-3.0.1.txt.
   NOTE(review): the original comment ties this list to a line breaking
   property whose name is not visible here - confirm before relying on it.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3227 /* See PropList-3.0.1.txt. */
3229 is_property_iso_control (unsigned int ch)
3232 (unicode_attributes[ch].name != NULL
3233 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3235 is_category_Cc (ch);
3237 if (result1 != result2)
3242 /* See PropList-3.0.1.txt. */
3244 is_property_format_control (unsigned int ch)
3246 return (is_category_Cf (ch)
3247 && get_bidi_category (ch) == UC_BIDI_BN
3248 && !is_property_join_control (ch)
3252 /* See PropList.txt, UCD.html. */
3254 is_property_dash (unsigned int ch)
3256 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3259 /* See PropList.txt, UCD.html. */
3261 is_property_hyphen (unsigned int ch)
3263 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Delegates to is_category_P.  See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
/* Delegates to is_category_Zl.  See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
/* Delegates to is_category_Zp.  See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3287 /* See PropList.txt, UCD.html. */
3289 is_property_quotation_mark (unsigned int ch)
3291 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3294 /* See PropList.txt, UCD.html. */
3296 is_property_sentence_terminal (unsigned int ch)
3298 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3301 /* See PropList.txt, UCD.html. */
3303 is_property_terminal_punctuation (unsigned int ch)
3305 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Delegates to is_category_Sc.  See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
3315 /* See Unicode 3.0 book, section 4.9,
3316 PropList.txt, UCD.html,
3317 DerivedCoreProperties.txt, UCD.html. */
3319 is_property_math (unsigned int ch)
3323 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3325 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3327 if (result1 != result2)
3332 /* See PropList.txt, UCD.html. */
3334 is_property_other_math (unsigned int ch)
3336 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3339 /* See PropList-3.0.1.txt. */
3341 is_property_paired_punctuation (unsigned int ch)
3343 return unicode_pairedpunctuation[ch];
3346 /* See PropList-3.0.1.txt. */
3348 is_property_left_of_pair (unsigned int ch)
3350 return unicode_leftofpair[ch];
3353 /* See PropList-3.0.1.txt. */
3355 is_property_combining (unsigned int ch)
3357 return (unicode_attributes[ch].name != NULL
3358 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3359 || is_category_Mc (ch)
3360 || is_category_Me (ch)
3361 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* True when CH has a character name and its bidi category is UC_BIDI_NSM.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3374 /* See PropList-3.0.1.txt. */
3376 is_property_composite (unsigned int ch)
3378 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3379 logical in some sense. */
3380 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3382 if (unicode_attributes[ch].name != NULL
3383 && unicode_attributes[ch].decomposition != NULL)
3385 /* Test whether the decomposition contains more than one character,
3386 and the first is not a space. */
3387 const char *decomp = unicode_attributes[ch].decomposition;
3388 if (decomp[0] == '<')
3390 decomp = strchr (decomp, '>') + 1;
3391 if (decomp[0] == ' ')
3394 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Delegates to is_category_Nd.  See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3406 /* See PropList-3.0.1.txt. */
3408 is_property_numeric (unsigned int ch)
3410 return ((get_numeric_value (ch)).denominator > 0)
3411 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3412 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3415 /* See PropList.txt, UCD.html. */
3417 is_property_diacritic (unsigned int ch)
3419 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3422 /* See PropList.txt, UCD.html. */
3424 is_property_extender (unsigned int ch)
3426 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3429 /* See PropList-3.0.1.txt. */
3431 is_property_ignorable_control (unsigned int ch)
3433 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3434 || is_category_Cf (ch))
3438 /* ------------------------------------------------------------------------- */
3440 /* Output all properties. */
3442 output_properties (const char *version)
3444 #define PROPERTY(P) \
3445 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3446 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3447 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3448 PROPERTY(white_space)
3449 PROPERTY(alphabetic)
3450 PROPERTY(other_alphabetic)
3451 PROPERTY(not_a_character)
3452 PROPERTY(default_ignorable_code_point)
3453 PROPERTY(other_default_ignorable_code_point)
3454 PROPERTY(deprecated)
3455 PROPERTY(logical_order_exception)
3456 PROPERTY(variation_selector)
3457 PROPERTY(private_use)
3458 PROPERTY(unassigned_code_value)
3460 PROPERTY(other_uppercase)
3462 PROPERTY(other_lowercase)
3464 PROPERTY(soft_dotted)
3466 PROPERTY(other_id_start)
3467 PROPERTY(id_continue)
3468 PROPERTY(other_id_continue)
3470 PROPERTY(xid_continue)
3471 PROPERTY(pattern_white_space)
3472 PROPERTY(pattern_syntax)
3473 PROPERTY(join_control)
3474 PROPERTY(grapheme_base)
3475 PROPERTY(grapheme_extend)
3476 PROPERTY(other_grapheme_extend)
3477 PROPERTY(grapheme_link)
3478 PROPERTY(bidi_control)
3479 PROPERTY(bidi_left_to_right)
3480 PROPERTY(bidi_hebrew_right_to_left)
3481 PROPERTY(bidi_arabic_right_to_left)
3482 PROPERTY(bidi_european_digit)
3483 PROPERTY(bidi_eur_num_separator)
3484 PROPERTY(bidi_eur_num_terminator)
3485 PROPERTY(bidi_arabic_digit)
3486 PROPERTY(bidi_common_separator)
3487 PROPERTY(bidi_block_separator)
3488 PROPERTY(bidi_segment_separator)
3489 PROPERTY(bidi_whitespace)
3490 PROPERTY(bidi_non_spacing_mark)
3491 PROPERTY(bidi_boundary_neutral)
3493 PROPERTY(bidi_embedding_or_override)
3494 PROPERTY(bidi_other_neutral)
3496 PROPERTY(ascii_hex_digit)
3497 PROPERTY(ideographic)
3498 PROPERTY(unified_ideograph)
3500 PROPERTY(ids_binary_operator)
3501 PROPERTY(ids_trinary_operator)
3502 PROPERTY(zero_width)
3505 PROPERTY(iso_control)
3506 PROPERTY(format_control)
3509 PROPERTY(punctuation)
3510 PROPERTY(line_separator)
3511 PROPERTY(paragraph_separator)
3512 PROPERTY(quotation_mark)
3513 PROPERTY(sentence_terminal)
3514 PROPERTY(terminal_punctuation)
3515 PROPERTY(currency_symbol)
3517 PROPERTY(other_math)
3518 PROPERTY(paired_punctuation)
3519 PROPERTY(left_of_pair)
3522 PROPERTY(decimal_digit)
3526 PROPERTY(ignorable_control)
3530 /* ========================================================================= */
3534 static const char *scripts[256];
3535 static unsigned int numscripts;
3537 static uint8_t unicode_scripts[0x110000];
3540 fill_scripts (const char *scripts_filename)
3545 stream = fopen (scripts_filename, "r");
3548 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
3554 for (i = 0; i < 0x110000; i++)
3555 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3560 unsigned int i1, i2;
3561 char padding[200+1];
3562 char scriptname[200+1];
3565 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3568 if (buf[0] == '\0' || buf[0] == '#')
3571 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3573 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3575 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
3585 for (script = numscripts - 1; script >= 0; script--)
3586 if (strcmp (scripts[script], scriptname) == 0)
3590 scripts[numscripts] = strdup (scriptname);
3591 script = numscripts;
3593 if (numscripts == 256)
3597 for (i = i1; i <= i2; i++)
3599 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3600 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3601 unicode_scripts[i] = script;
3605 if (ferror (stream) || fclose (stream))
3607 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3612 /* Construction of sparse 3-level tables. */
3613 #define TABLE script_table
3614 #define ELEMENT uint8_t
3615 #define DEFAULT (uint8_t)~(uint8_t)0
3616 #define xmalloc malloc
3617 #define xrealloc realloc
3621 output_scripts (const char *version)
3623 const char *filename = "unictype/scripts.h";
3625 unsigned int ch, s, i;
3626 struct script_table t;
3627 unsigned int level1_offset, level2_offset, level3_offset;
3631 const char *lowercase_name;
3634 scriptinfo_t scriptinfo[256];
3636 stream = fopen (filename, "w");
3639 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3643 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3644 fprintf (stream, "/* Unicode scripts. */\n");
3645 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3648 for (s = 0; s < numscripts; s++)
3650 char *lcp = strdup (scripts[s]);
3653 for (cp = lcp; *cp != '\0'; cp++)
3654 if (*cp >= 'A' && *cp <= 'Z')
3657 scriptinfo[s].lowercase_name = lcp;
3660 for (s = 0; s < numscripts; s++)
3662 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3663 scriptinfo[s].lowercase_name);
3664 fprintf (stream, "{\n");
3666 for (ch = 0; ch < 0x110000; ch++)
3667 if (unicode_scripts[ch] == s)
3673 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3678 fprintf (stream, ",\n");
3680 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3682 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3686 fprintf (stream, "\n");
3687 fprintf (stream, "};\n");
3690 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3691 fprintf (stream, "{\n");
3692 for (s = 0; s < numscripts; s++)
3694 fprintf (stream, " {\n");
3695 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3696 scriptinfo[s].lowercase_name);
3697 fprintf (stream, " script_%s_intervals,\n",
3698 scriptinfo[s].lowercase_name);
3699 fprintf (stream, " \"%s\"\n", scripts[s]);
3700 fprintf (stream, " }");
3701 if (s+1 < numscripts)
3702 fprintf (stream, ",");
3703 fprintf (stream, "\n");
3705 fprintf (stream, "};\n");
3709 script_table_init (&t);
3711 for (ch = 0; ch < 0x110000; ch++)
3713 unsigned int s = unicode_scripts[ch];
3714 if (s != (uint8_t)~(uint8_t)0)
3715 script_table_add (&t, ch, s);
3718 script_table_finalize (&t);
3720 /* Offsets in t.result, in memory of this process. */
3722 5 * sizeof (uint32_t);
3724 5 * sizeof (uint32_t)
3725 + t.level1_size * sizeof (uint32_t);
3727 5 * sizeof (uint32_t)
3728 + t.level1_size * sizeof (uint32_t)
3729 + (t.level2_size << t.q) * sizeof (uint32_t);
3731 for (i = 0; i < 5; i++)
3732 fprintf (stream, "#define script_header_%d %d\n", i,
3733 ((uint32_t *) t.result)[i]);
3734 fprintf (stream, "static const\n");
3735 fprintf (stream, "struct\n");
3736 fprintf (stream, " {\n");
3737 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3738 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3739 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3740 fprintf (stream, " }\n");
3741 fprintf (stream, "u_script =\n");
3742 fprintf (stream, "{\n");
3743 fprintf (stream, " {");
3744 if (t.level1_size > 8)
3745 fprintf (stream, "\n ");
3746 for (i = 0; i < t.level1_size; i++)
3749 if (i > 0 && (i % 8) == 0)
3750 fprintf (stream, "\n ");
3751 offset = ((uint32_t *) (t.result + level1_offset))[i];
3753 fprintf (stream, " %5d", -1);
3755 fprintf (stream, " %5zd",
3756 (offset - level2_offset) / sizeof (uint32_t));
3757 if (i+1 < t.level1_size)
3758 fprintf (stream, ",");
3760 if (t.level1_size > 8)
3761 fprintf (stream, "\n ");
3762 fprintf (stream, " },\n");
3763 fprintf (stream, " {");
3764 if (t.level2_size << t.q > 8)
3765 fprintf (stream, "\n ");
3766 for (i = 0; i < t.level2_size << t.q; i++)
3769 if (i > 0 && (i % 8) == 0)
3770 fprintf (stream, "\n ");
3771 offset = ((uint32_t *) (t.result + level2_offset))[i];
3773 fprintf (stream, " %5d", -1);
3775 fprintf (stream, " %5zd",
3776 (offset - level3_offset) / sizeof (uint8_t));
3777 if (i+1 < t.level2_size << t.q)
3778 fprintf (stream, ",");
3780 if (t.level2_size << t.q > 8)
3781 fprintf (stream, "\n ");
3782 fprintf (stream, " },\n");
3783 fprintf (stream, " {");
3784 if (t.level3_size << t.p > 8)
3785 fprintf (stream, "\n ");
3786 for (i = 0; i < t.level3_size << t.p; i++)
3788 if (i > 0 && (i % 8) == 0)
3789 fprintf (stream, "\n ");
3790 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3791 if (i+1 < t.level3_size << t.p)
3792 fprintf (stream, ",");
3794 if (t.level3_size << t.p > 8)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " }\n");
3797 fprintf (stream, "};\n");
3799 if (ferror (stream) || fclose (stream))
3801 fprintf (stderr, "error writing to '%s'\n", filename);
3807 output_scripts_byname (const char *version)
3809 const char *filename = "unictype/scripts_byname.gperf";
3813 stream = fopen (filename, "w");
3816 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3820 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3821 fprintf (stream, "/* Unicode scripts. */\n");
3822 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3824 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3825 fprintf (stream, "%%struct-type\n");
3826 fprintf (stream, "%%language=ANSI-C\n");
3827 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3828 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3829 fprintf (stream, "%%readonly-tables\n");
3830 fprintf (stream, "%%global-table\n");
3831 fprintf (stream, "%%define word-array-name script_names\n");
3832 fprintf (stream, "%%%%\n");
3833 for (s = 0; s < numscripts; s++)
3834 fprintf (stream, "%s, %u\n", scripts[s], s);
3836 if (ferror (stream) || fclose (stream))
3838 fprintf (stderr, "error writing to '%s'\n", filename);
3843 /* ========================================================================= */
3847 typedef struct { unsigned int start; unsigned int end; const char *name; }
3849 static block_t blocks[256];
3850 static unsigned int numblocks;
3853 fill_blocks (const char *blocks_filename)
3857 stream = fopen (blocks_filename, "r");
3860 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3867 unsigned int i1, i2;
3868 char padding[200+1];
3869 char blockname[200+1];
3871 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3874 if (buf[0] == '\0' || buf[0] == '#')
3877 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3879 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3882 blocks[numblocks].start = i1;
3883 blocks[numblocks].end = i2;
3884 blocks[numblocks].name = strdup (blockname);
3885 /* It must be sorted. */
3886 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3889 if (numblocks == 256)
3893 if (ferror (stream) || fclose (stream))
3895 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3900 /* Return the smallest block index among the blocks for characters >= ch. */
3902 block_first_index (unsigned int ch)
3904 /* Binary search. */
3905 unsigned int lo = 0;
3906 unsigned int hi = numblocks;
3908 All blocks[i], i < lo, have blocks[i].end < ch,
3909 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3912 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3913 if (blocks[mid].end < ch)
3921 /* Return the largest block index among the blocks for characters <= ch,
3924 block_last_index (unsigned int ch)
3926 /* Binary search. */
3927 unsigned int lo = 0;
3928 unsigned int hi = numblocks;
3930 All blocks[i], i < lo, have blocks[i].start <= ch,
3931 all blocks[i], i >= hi, have blocks[i].start > ch. */
3934 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3935 if (blocks[mid].start <= ch)
3944 output_blocks (const char *version)
3946 const char *filename = "unictype/blocks.h";
3947 const unsigned int shift = 8; /* bits to shift away for array access */
3948 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3953 stream = fopen (filename, "w");
3956 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3960 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3961 fprintf (stream, "/* Unicode blocks. */\n");
3962 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3965 fprintf (stream, "static const uc_block_t blocks[] =\n");
3966 fprintf (stream, "{\n");
3967 for (i = 0; i < numblocks; i++)
3969 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3970 blocks[i].end, blocks[i].name);
3971 if (i+1 < numblocks)
3972 fprintf (stream, ",");
3973 fprintf (stream, "\n");
3975 fprintf (stream, "};\n");
3976 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3977 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3978 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3979 threshold >> shift);
3980 fprintf (stream, "{\n");
3981 for (i1 = 0; i1 < (threshold >> shift); i1++)
3983 unsigned int first_index = block_first_index (i1 << shift);
3984 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3985 fprintf (stream, " %3d, %3d", first_index, last_index);
3986 if (i1+1 < (threshold >> shift))
3987 fprintf (stream, ",");
3988 fprintf (stream, "\n");
3990 fprintf (stream, "};\n");
3991 fprintf (stream, "#define blocks_upper_first_index %d\n",
3992 block_first_index (threshold));
3993 fprintf (stream, "#define blocks_upper_last_index %d\n",
3994 block_last_index (0x10FFFF));
3996 if (ferror (stream) || fclose (stream))
3998 fprintf (stderr, "error writing to '%s'\n", filename);
4003 /* ========================================================================= */
4005 /* C and Java syntax. */
4009 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4010 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4011 UC_IDENTIFIER_INVALID, /* not valid */
4012 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of C source text:
   space, horizontal tab, new-line (LF or CR), vertical tab, form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4026 /* ISO C 99 section 6.4.2.1 and appendix D. */
4028 c_ident_category (unsigned int ch)
4030 /* Section 6.4.2.1. */
4031 if (ch >= '0' && ch <= '9')
4032 return UC_IDENTIFIER_VALID;
4033 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4034 return UC_IDENTIFIER_START;
4040 || (ch >= 0x00C0 && ch <= 0x00D6)
4041 || (ch >= 0x00D8 && ch <= 0x00F6)
4042 || (ch >= 0x00F8 && ch <= 0x01F5)
4043 || (ch >= 0x01FA && ch <= 0x0217)
4044 || (ch >= 0x0250 && ch <= 0x02A8)
4045 || (ch >= 0x1E00 && ch <= 0x1E9B)
4046 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4050 || (ch >= 0x0388 && ch <= 0x038A)
4052 || (ch >= 0x038E && ch <= 0x03A1)
4053 || (ch >= 0x03A3 && ch <= 0x03CE)
4054 || (ch >= 0x03D0 && ch <= 0x03D6)
4059 || (ch >= 0x03E2 && ch <= 0x03F3)
4060 || (ch >= 0x1F00 && ch <= 0x1F15)
4061 || (ch >= 0x1F18 && ch <= 0x1F1D)
4062 || (ch >= 0x1F20 && ch <= 0x1F45)
4063 || (ch >= 0x1F48 && ch <= 0x1F4D)
4064 || (ch >= 0x1F50 && ch <= 0x1F57)
4068 || (ch >= 0x1F5F && ch <= 0x1F7D)
4069 || (ch >= 0x1F80 && ch <= 0x1FB4)
4070 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4071 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4072 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4073 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4074 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4075 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4076 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4077 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4079 || (ch >= 0x0401 && ch <= 0x040C)
4080 || (ch >= 0x040E && ch <= 0x044F)
4081 || (ch >= 0x0451 && ch <= 0x045C)
4082 || (ch >= 0x045E && ch <= 0x0481)
4083 || (ch >= 0x0490 && ch <= 0x04C4)
4084 || (ch >= 0x04C7 && ch <= 0x04C8)
4085 || (ch >= 0x04CB && ch <= 0x04CC)
4086 || (ch >= 0x04D0 && ch <= 0x04EB)
4087 || (ch >= 0x04EE && ch <= 0x04F5)
4088 || (ch >= 0x04F8 && ch <= 0x04F9)
4090 || (ch >= 0x0531 && ch <= 0x0556)
4091 || (ch >= 0x0561 && ch <= 0x0587)
4093 || (ch >= 0x05B0 && ch <= 0x05B9)
4094 || (ch >= 0x05BB && ch <= 0x05BD)
4096 || (ch >= 0x05C1 && ch <= 0x05C2)
4097 || (ch >= 0x05D0 && ch <= 0x05EA)
4098 || (ch >= 0x05F0 && ch <= 0x05F2)
4100 || (ch >= 0x0621 && ch <= 0x063A)
4101 || (ch >= 0x0640 && ch <= 0x0652)
4102 || (ch >= 0x0670 && ch <= 0x06B7)
4103 || (ch >= 0x06BA && ch <= 0x06BE)
4104 || (ch >= 0x06C0 && ch <= 0x06CE)
4105 || (ch >= 0x06D0 && ch <= 0x06DC)
4106 || (ch >= 0x06E5 && ch <= 0x06E8)
4107 || (ch >= 0x06EA && ch <= 0x06ED)
4109 || (ch >= 0x0901 && ch <= 0x0903)
4110 || (ch >= 0x0905 && ch <= 0x0939)
4111 || (ch >= 0x093E && ch <= 0x094D)
4112 || (ch >= 0x0950 && ch <= 0x0952)
4113 || (ch >= 0x0958 && ch <= 0x0963)
4115 || (ch >= 0x0981 && ch <= 0x0983)
4116 || (ch >= 0x0985 && ch <= 0x098C)
4117 || (ch >= 0x098F && ch <= 0x0990)
4118 || (ch >= 0x0993 && ch <= 0x09A8)
4119 || (ch >= 0x09AA && ch <= 0x09B0)
4121 || (ch >= 0x09B6 && ch <= 0x09B9)
4122 || (ch >= 0x09BE && ch <= 0x09C4)
4123 || (ch >= 0x09C7 && ch <= 0x09C8)
4124 || (ch >= 0x09CB && ch <= 0x09CD)
4125 || (ch >= 0x09DC && ch <= 0x09DD)
4126 || (ch >= 0x09DF && ch <= 0x09E3)
4127 || (ch >= 0x09F0 && ch <= 0x09F1)
4130 || (ch >= 0x0A05 && ch <= 0x0A0A)
4131 || (ch >= 0x0A0F && ch <= 0x0A10)
4132 || (ch >= 0x0A13 && ch <= 0x0A28)
4133 || (ch >= 0x0A2A && ch <= 0x0A30)
4134 || (ch >= 0x0A32 && ch <= 0x0A33)
4135 || (ch >= 0x0A35 && ch <= 0x0A36)
4136 || (ch >= 0x0A38 && ch <= 0x0A39)
4137 || (ch >= 0x0A3E && ch <= 0x0A42)
4138 || (ch >= 0x0A47 && ch <= 0x0A48)
4139 || (ch >= 0x0A4B && ch <= 0x0A4D)
4140 || (ch >= 0x0A59 && ch <= 0x0A5C)
4144 || (ch >= 0x0A81 && ch <= 0x0A83)
4145 || (ch >= 0x0A85 && ch <= 0x0A8B)
4147 || (ch >= 0x0A8F && ch <= 0x0A91)
4148 || (ch >= 0x0A93 && ch <= 0x0AA8)
4149 || (ch >= 0x0AAA && ch <= 0x0AB0)
4150 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4151 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4152 || (ch >= 0x0ABD && ch <= 0x0AC5)
4153 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4154 || (ch >= 0x0ACB && ch <= 0x0ACD)
4158 || (ch >= 0x0B01 && ch <= 0x0B03)
4159 || (ch >= 0x0B05 && ch <= 0x0B0C)
4160 || (ch >= 0x0B0F && ch <= 0x0B10)
4161 || (ch >= 0x0B13 && ch <= 0x0B28)
4162 || (ch >= 0x0B2A && ch <= 0x0B30)
4163 || (ch >= 0x0B32 && ch <= 0x0B33)
4164 || (ch >= 0x0B36 && ch <= 0x0B39)
4165 || (ch >= 0x0B3E && ch <= 0x0B43)
4166 || (ch >= 0x0B47 && ch <= 0x0B48)
4167 || (ch >= 0x0B4B && ch <= 0x0B4D)
4168 || (ch >= 0x0B5C && ch <= 0x0B5D)
4169 || (ch >= 0x0B5F && ch <= 0x0B61)
4171 || (ch >= 0x0B82 && ch <= 0x0B83)
4172 || (ch >= 0x0B85 && ch <= 0x0B8A)
4173 || (ch >= 0x0B8E && ch <= 0x0B90)
4174 || (ch >= 0x0B92 && ch <= 0x0B95)
4175 || (ch >= 0x0B99 && ch <= 0x0B9A)
4177 || (ch >= 0x0B9E && ch <= 0x0B9F)
4178 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4179 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4180 || (ch >= 0x0BAE && ch <= 0x0BB5)
4181 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4182 || (ch >= 0x0BBE && ch <= 0x0BC2)
4183 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4184 || (ch >= 0x0BCA && ch <= 0x0BCD)
4186 || (ch >= 0x0C01 && ch <= 0x0C03)
4187 || (ch >= 0x0C05 && ch <= 0x0C0C)
4188 || (ch >= 0x0C0E && ch <= 0x0C10)
4189 || (ch >= 0x0C12 && ch <= 0x0C28)
4190 || (ch >= 0x0C2A && ch <= 0x0C33)
4191 || (ch >= 0x0C35 && ch <= 0x0C39)
4192 || (ch >= 0x0C3E && ch <= 0x0C44)
4193 || (ch >= 0x0C46 && ch <= 0x0C48)
4194 || (ch >= 0x0C4A && ch <= 0x0C4D)
4195 || (ch >= 0x0C60 && ch <= 0x0C61)
4197 || (ch >= 0x0C82 && ch <= 0x0C83)
4198 || (ch >= 0x0C85 && ch <= 0x0C8C)
4199 || (ch >= 0x0C8E && ch <= 0x0C90)
4200 || (ch >= 0x0C92 && ch <= 0x0CA8)
4201 || (ch >= 0x0CAA && ch <= 0x0CB3)
4202 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4203 || (ch >= 0x0CBE && ch <= 0x0CC4)
4204 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4205 || (ch >= 0x0CCA && ch <= 0x0CCD)
4207 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4209 || (ch >= 0x0D02 && ch <= 0x0D03)
4210 || (ch >= 0x0D05 && ch <= 0x0D0C)
4211 || (ch >= 0x0D0E && ch <= 0x0D10)
4212 || (ch >= 0x0D12 && ch <= 0x0D28)
4213 || (ch >= 0x0D2A && ch <= 0x0D39)
4214 || (ch >= 0x0D3E && ch <= 0x0D43)
4215 || (ch >= 0x0D46 && ch <= 0x0D48)
4216 || (ch >= 0x0D4A && ch <= 0x0D4D)
4217 || (ch >= 0x0D60 && ch <= 0x0D61)
4219 || (ch >= 0x0E01 && ch <= 0x0E3A)
4220 || (ch >= 0x0E40 && ch <= 0x0E5B)
4222 || (ch >= 0x0E81 && ch <= 0x0E82)
4224 || (ch >= 0x0E87 && ch <= 0x0E88)
4227 || (ch >= 0x0E94 && ch <= 0x0E97)
4228 || (ch >= 0x0E99 && ch <= 0x0E9F)
4229 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4232 || (ch >= 0x0EAA && ch <= 0x0EAB)
4233 || (ch >= 0x0EAD && ch <= 0x0EAE)
4234 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4235 || (ch >= 0x0EBB && ch <= 0x0EBD)
4236 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4238 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4239 || (ch >= 0x0EDC && ch <= 0x0EDD)
4242 || (ch >= 0x0F18 && ch <= 0x0F19)
4246 || (ch >= 0x0F3E && ch <= 0x0F47)
4247 || (ch >= 0x0F49 && ch <= 0x0F69)
4248 || (ch >= 0x0F71 && ch <= 0x0F84)
4249 || (ch >= 0x0F86 && ch <= 0x0F8B)
4250 || (ch >= 0x0F90 && ch <= 0x0F95)
4252 || (ch >= 0x0F99 && ch <= 0x0FAD)
4253 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4256 || (ch >= 0x10A0 && ch <= 0x10C5)
4257 || (ch >= 0x10D0 && ch <= 0x10F6)
4259 || (ch >= 0x3041 && ch <= 0x3093)
4260 || (ch >= 0x309B && ch <= 0x309C)
4262 || (ch >= 0x30A1 && ch <= 0x30F6)
4263 || (ch >= 0x30FB && ch <= 0x30FC)
4265 || (ch >= 0x3105 && ch <= 0x312C)
4266 /* CJK Unified Ideographs */
4267 || (ch >= 0x4E00 && ch <= 0x9FA5)
4269 || (ch >= 0xAC00 && ch <= 0xD7A3)
4271 || (ch >= 0x0660 && ch <= 0x0669)
4272 || (ch >= 0x06F0 && ch <= 0x06F9)
4273 || (ch >= 0x0966 && ch <= 0x096F)
4274 || (ch >= 0x09E6 && ch <= 0x09EF)
4275 || (ch >= 0x0A66 && ch <= 0x0A6F)
4276 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4277 || (ch >= 0x0B66 && ch <= 0x0B6F)
4278 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4279 || (ch >= 0x0C66 && ch <= 0x0C6F)
4280 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4281 || (ch >= 0x0D66 && ch <= 0x0D6F)
4282 || (ch >= 0x0E50 && ch <= 0x0E59)
4283 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4284 || (ch >= 0x0F20 && ch <= 0x0F33)
4285 /* Special characters */
4288 || (ch >= 0x02B0 && ch <= 0x02B8)
4290 || (ch >= 0x02BD && ch <= 0x02C1)
4291 || (ch >= 0x02D0 && ch <= 0x02D1)
4292 || (ch >= 0x02E0 && ch <= 0x02E4)
4298 || (ch >= 0x203F && ch <= 0x2040)
4301 || (ch >= 0x210A && ch <= 0x2113)
4303 || (ch >= 0x2118 && ch <= 0x211D)
4307 || (ch >= 0x212A && ch <= 0x2131)
4308 || (ch >= 0x2133 && ch <= 0x2138)
4309 || (ch >= 0x2160 && ch <= 0x2182)
4310 || (ch >= 0x3005 && ch <= 0x3007)
4311 || (ch >= 0x3021 && ch <= 0x3029)
4313 return UC_IDENTIFIER_START;
4314 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is Java source white space: space, horizontal tab,
   form feed, LF or CR.  Unlike is_c_whitespace, vertical tab is excluded.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4326 /* The Java Language Specification, 3rd edition, §3.8.
4327 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4328 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4330 java_ident_category (unsigned int ch)
4332 /* FIXME: Check this against Sun's JDK implementation. */
4333 if (is_category_L (ch) /* = Character.isLetter(ch) */
4334 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4335 || is_category_Sc (ch) /* currency symbol */
4336 || is_category_Pc (ch) /* connector punctuation */
4338 return UC_IDENTIFIER_START;
4339 if (is_category_Nd (ch) /* digit */
4340 || is_category_Mc (ch) /* combining mark */
4341 || is_category_Mn (ch) /* non-spacing mark */
4343 return UC_IDENTIFIER_VALID;
4344 if ((ch >= 0x0000 && ch <= 0x0008)
4345 || (ch >= 0x000E && ch <= 0x001B)
4346 || (ch >= 0x007F && ch <= 0x009F)
4347 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4349 return UC_IDENTIFIER_IGNORABLE;
4350 return UC_IDENTIFIER_INVALID;
4353 /* Construction of sparse 3-level tables. */
/* Parameter macros for the identifier-syntax table.  output_ident_category
   uses `struct identsyntax_table' and identsyntax_table_init/_add/_finalize,
   so TABLE presumably supplies the struct name and function prefix, ELEMENT
   the per-character cell type and DEFAULT the value of characters that were
   never added -- verify against the table template.  xmalloc/xrealloc are
   mapped onto the plain libc allocators.
   NOTE(review): the line that pulls in the template itself is not visible in
   this listing; confirm that it follows these defines.  */
4354 #define TABLE identsyntax_table
4355 #define ELEMENT uint8_t
4356 #define DEFAULT UC_IDENTIFIER_INVALID
4357 #define xmalloc malloc
4358 #define xrealloc realloc
4361 /* Output an identifier syntax categorization in a three-level bitmap. */
/* Writes FILENAME as a generated C fragment: five identsyntax_header_N
   macros taken from the first five uint32_t words of t.result, then a
   `static const struct' named NAME with the members level1, level2 and a
   packed level3.  PREDICATE is called for every code point below 0x110000;
   every result other than UC_IDENTIFIER_INVALID is stored in the table.
   VERSION is inserted into the "Generated automatically" header comment.
   NOTE(review): this listing lacks its short lines (return type, braces,
   several declarations, the check after fopen, the left-hand sides of the
   level*_offset assignments, the branches around the -1 entries, ...).
   Confirm the exact control flow against the complete file.  */
4363 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4367 struct identsyntax_table t;
4368 unsigned int level1_offset, level2_offset, level3_offset;
4370 stream = fopen (filename, "w");
4373 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4377 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4378 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4379 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4384 identsyntax_table_init (&t);
/* Populate the table with every character whose category is not the
   default.  */
4386 for (ch = 0; ch < 0x110000; ch++)
4388 int syntaxcode = predicate (ch);
4389 if (syntaxcode != UC_IDENTIFIER_INVALID)
4390 identsyntax_table_add (&t, ch, syntaxcode);
4393 identsyntax_table_finalize (&t);
4395 /* Offsets in t.result, in memory of this process. */
/* The three sums below skip, in turn, the 5-word header, the level1 array
   and the level2 array; presumably they are assigned to level1_offset,
   level2_offset and level3_offset respectively (assignment targets are not
   visible here).  */
4397 5 * sizeof (uint32_t);
4399 5 * sizeof (uint32_t)
4400 + t.level1_size * sizeof (uint32_t);
4402 5 * sizeof (uint32_t)
4403 + t.level1_size * sizeof (uint32_t)
4404 + (t.level2_size << t.q) * sizeof (uint32_t);
4406 for (i = 0; i < 5; i++)
4407 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4408 ((uint32_t *) t.result)[i]);
4409 fprintf (stream, "static const\n");
4410 fprintf (stream, "struct\n");
4411 fprintf (stream, " {\n");
4412 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4413 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4414 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4415 (1 << t.p) * 2 / 16);
4416 fprintf (stream, " }\n");
4417 fprintf (stream, "%s =\n", name);
4418 fprintf (stream, "{\n");
/* level1: eight entries per output line.  Each entry is read as a byte
   offset into t.result and printed relative to level2_offset, in units of
   uint32_t; -1 is printed for the other case (condition not visible).  */
4419 fprintf (stream, " {");
4420 if (t.level1_size > 8)
4421 fprintf (stream, "\n ");
4422 for (i = 0; i < t.level1_size; i++)
4425 if (i > 0 && (i % 8) == 0)
4426 fprintf (stream, "\n ");
4427 offset = ((uint32_t *) (t.result + level1_offset))[i];
4429 fprintf (stream, " %5d", -1);
4431 fprintf (stream, " %5zd",
4432 (offset - level2_offset) / sizeof (uint32_t));
4433 if (i+1 < t.level1_size)
4434 fprintf (stream, ",");
4436 if (t.level1_size > 8)
4437 fprintf (stream, "\n ");
4438 fprintf (stream, " },\n");
/* level2: same layout, entries printed relative to level3_offset in units
   of uint8_t.  */
4439 fprintf (stream, " {");
4440 if (t.level2_size << t.q > 8)
4441 fprintf (stream, "\n ");
4442 for (i = 0; i < t.level2_size << t.q; i++)
4445 if (i > 0 && (i % 8) == 0)
4446 fprintf (stream, "\n ");
4447 offset = ((uint32_t *) (t.result + level2_offset))[i];
4449 fprintf (stream, " %5d", -1);
4451 fprintf (stream, " %5zd",
4452 (offset - level3_offset) / sizeof (uint8_t));
4453 if (i+1 < t.level2_size << t.q)
4454 fprintf (stream, ",");
4456 if (t.level2_size << t.q > 8)
4457 fprintf (stream, "\n ");
4458 fprintf (stream, " },\n");
4459 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Every emitted 16-bit word combines eight consecutive level3 bytes; byte
   8*i+k is shifted left by 2*k, so the lowest index ends up in the lowest
   bits.  */
4460 fprintf (stream, " {");
4461 if ((t.level3_size << t.p) * 2 / 16 > 8)
4462 fprintf (stream, "\n ");
4463 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4465 if (i > 0 && (i % 8) == 0)
4466 fprintf (stream, "\n ");
4467 fprintf (stream, " 0x%04x",
4468 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4469 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4470 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4476 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4477 fprintf (stream, ",");
4479 if ((t.level3_size << t.p) * 2 / 16 > 8)
4480 fprintf (stream, "\n ");
4481 fprintf (stream, " }\n");
4482 fprintf (stream, "};\n");
/* A write error detected by ferror, or a failing fclose, is reported on
   stderr.  */
4484 if (ferror (stream) || fclose (stream))
4486 fprintf (stderr, "error writing to '%s'\n", filename);
4492 output_ident_properties (const char *version)
4494 #define PROPERTY(P) \
4495 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4496 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4497 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4498 PROPERTY(c_whitespace)
4499 PROPERTY(java_whitespace)
4502 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4503 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4506 /* ========================================================================= */
4508 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4509 glibc/localedata/locales/i18n file, generated by
4510 glibc/localedata/gen-unicode-ctype.c. */
4512 /* Character mappings. */
4515 to_upper (unsigned int ch)
4517 if (unicode_attributes[ch].name != NULL
4518 && unicode_attributes[ch].upper != NONE)
4519 return unicode_attributes[ch].upper;
4525 to_lower (unsigned int ch)
4527 if (unicode_attributes[ch].name != NULL
4528 && unicode_attributes[ch].lower != NONE)
4529 return unicode_attributes[ch].lower;
4535 to_title (unsigned int ch)
4537 if (unicode_attributes[ch].name != NULL
4538 && unicode_attributes[ch].title != NONE)
4539 return unicode_attributes[ch].title;
4544 /* Character class properties. */
/* CH counts as uppercase exactly when to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* CH counts as lowercase when to_upper changes it, plus U+00DF, which is
   lowercase but has no simple uppercase mapping.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4561 is_alpha (unsigned int ch)
4563 return (unicode_attributes[ch].name != NULL
4564 && ((unicode_attributes[ch].category[0] == 'L'
4565 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4566 <U0E2F>, <U0E46> should belong to is_punct. */
4567 && (ch != 0x0E2F) && (ch != 0x0E46))
4568 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4569 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4571 || (ch >= 0x0E34 && ch <= 0x0E3A)
4572 || (ch >= 0x0E47 && ch <= 0x0E4E)
4573 /* Avoid warning for <U0345>. */
4575 /* Avoid warnings for <U2160>..<U217F>. */
4576 || (unicode_attributes[ch].category[0] == 'N'
4577 && unicode_attributes[ch].category[1] == 'l')
4578 /* Avoid warnings for <U24B6>..<U24E9>. */
4579 || (unicode_attributes[ch].category[0] == 'S'
4580 && unicode_attributes[ch].category[1] == 'o'
4581 && strstr (unicode_attributes[ch].name, " LETTER ")
4583 /* Consider all the non-ASCII digits as alphabetic.
4584 ISO C 99 forbids us to have them in category "digit",
4585 but we want iswalnum to return true on them. */
4586 || (unicode_attributes[ch].category[0] == 'N'
4587 && unicode_attributes[ch].category[1] == 'd'
4588 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH belongs to the "digit" class, i.e. is one of the ten
   ASCII digits.  The category-based alternative is kept, disabled, for
   reference.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH belongs to the "outdigit" class: the ten ASCII digits
   U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* "alnum" is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4626 is_blank (unsigned int ch)
4628 return (ch == 0x0009 /* '\t' */
4629 /* Category Zs without mention of "<noBreak>" */
4630 || (unicode_attributes[ch].name != NULL
4631 && unicode_attributes[ch].category[0] == 'Z'
4632 && unicode_attributes[ch].category[1] == 's'
4633 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4637 is_space (unsigned int ch)
4639 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4640 should treat it like a punctuation character, not like a space. */
4641 return (ch == 0x0020 /* ' ' */
4642 || ch == 0x000C /* '\f' */
4643 || ch == 0x000A /* '\n' */
4644 || ch == 0x000D /* '\r' */
4645 || ch == 0x0009 /* '\t' */
4646 || ch == 0x000B /* '\v' */
4647 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4648 || (unicode_attributes[ch].name != NULL
4649 && unicode_attributes[ch].category[0] == 'Z'
4650 && (unicode_attributes[ch].category[1] == 'l'
4651 || unicode_attributes[ch].category[1] == 'p'
4652 || (unicode_attributes[ch].category[1] == 's'
4653 && !strstr (unicode_attributes[ch].decomposition,
4658 is_cntrl (unsigned int ch)
4660 return (unicode_attributes[ch].name != NULL
4661 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4662 /* Categories Zl and Zp */
4663 || (unicode_attributes[ch].category[0] == 'Z'
4664 && (unicode_attributes[ch].category[1] == 'l'
4665 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH belongs to the "xdigit" class, i.e. is one of the 22
   ASCII hexadecimal digits.  The is_digit-based alternative is kept,
   disabled, for reference.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4691 is_graph (unsigned int ch)
4693 return (unicode_attributes[ch].name != NULL
4694 && strcmp (unicode_attributes[ch].name, "<control>")
4699 is_print (unsigned int ch)
4701 return (unicode_attributes[ch].name != NULL
4702 && strcmp (unicode_attributes[ch].name, "<control>")
4703 /* Categories Zl and Zp */
4704 && !(unicode_attributes[ch].name != NULL
4705 && unicode_attributes[ch].category[0] == 'Z'
4706 && (unicode_attributes[ch].category[1] == 'l'
4707 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH belongs to the "punct" class: graphic, but neither
   alphabetic nor a digit.  The category-P alternative is kept, disabled,
   for reference.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4723 /* Output all properties. */
/* VERSION is handed on to output_predicate.
   PROPERTY(P) expands to three calls for the predicate is_P: a debug
   listing "unictype/ctype_P.txt", a test source
   "../tests/unictype/test-ctype_P.c" and a header "unictype/ctype_P.h"
   whose table is named u_is_P.
   NOTE(review): the PROPERTY(...) invocations, the matching #undef and the
   function's braces are not present in this listing; confirm them against
   the complete file.  */
4725 output_old_ctype (const char *version)
4727 #define PROPERTY(P) \
4728 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4729 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4730 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4749 is_combining (unsigned int ch)
4751 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4752 file. In 3.0.1 it was identical to the union of the general categories
4753 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4754 PropList.txt file, so we take the latter definition. */
4755 return (unicode_attributes[ch].name != NULL
4756 && unicode_attributes[ch].category[0] == 'M'
4757 && (unicode_attributes[ch].category[1] == 'n'
4758 || unicode_attributes[ch].category[1] == 'c'
4759 || unicode_attributes[ch].category[1] == 'e'));
4763 is_combining_level3 (unsigned int ch)
4765 return is_combining (ch)
4766 && !(unicode_attributes[ch].combining[0] != '\0'
4767 && unicode_attributes[ch].combining[0] != '0'
4768 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   The result is "<Uxxxx>" (4 hex digits) for I below 0x10000 and
   "<Uxxxxxxxx>" (8 hex digits) otherwise.  It lives in a static buffer that
   the next call overwrites, so copy it if it must survive.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* The longest result, "<U%08X>", needs 11 characters plus the NUL;
     snprintf keeps the write bounded regardless.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", then the symbol of HIGH.  The result lives in a
   static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each, plus the ".." separator.  */
  static char buf[24+1];

  /* ucs_symbol reuses one static buffer, so each result is copied out
     before the next call.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
4793 /* Output a character class (= property) table. */
/* Prints CLASSNAME followed by the members of the class to STREAM.  FUNC is
   evaluated once for every code point below 0x110000 and the results are
   cached in table[].  Members are written as UCS symbols or symbol ranges,
   separated by ';'; when an entry would go past max_column the line is
   continued with "/" and a newline.
   NOTE(review): this listing lacks its short lines (return type, braces,
   the declarations of i, column and buf, the loop that finds low/high, the
   branch choosing between a single symbol and a range, ...).  Confirm the
   control flow against the complete file.
   NOTE(review): table[] is a 0x110000-byte automatic array.  */
4796 output_charclass (FILE *stream, const char *classname,
4797 bool (*func) (unsigned int))
4799 char table[0x110000];
4801 bool need_semicolon;
4802 const int max_column = 75;
/* Evaluate the predicate once per code point.  */
4805 for (i = 0; i < 0x110000; i++)
4806 table[i] = (int) func (i);
4808 fprintf (stream, "%s ", classname);
4809 need_semicolon = false;
/* No increment in the loop header: i is advanced inside the body.  */
4811 for (i = 0; i < 0x110000; )
4817 unsigned int low, high;
4823 while (i < 0x110000 && table[i]);
4827 strcpy (buf, ucs_symbol (low));
4829 strcpy (buf, ucs_symbol_range (low, high));
4833 fprintf (stream, ";");
4837 if (column + strlen (buf) > max_column)
4839 fprintf (stream, "/\n ");
4843 fprintf (stream, "%s", buf);
4844 column += strlen (buf);
4845 need_semicolon = true;
4848 fprintf (stream, "\n");
4851 /* Output a character mapping table. */
/* Prints MAPNAME followed by mapping entries to STREAM.  table[i] records
   whether FUNC changes code point i.  Each entry is assembled in buf from
   the UCS symbol of i and the UCS symbol of FUNC (i); entries are separated
   by ';' and the line is continued with "/" and a newline when an entry
   would go past max_column.
   NOTE(review): this listing lacks its short lines (return type, braces,
   the declarations of i, column and buf, the test on table[i], the literal
   pieces placed around the two symbols, ...).  Confirm the exact entry
   format against the complete file.
   NOTE(review): table[] is a 0x110000-byte automatic array.  */
4854 output_charmap (FILE *stream, const char *mapname,
4855 unsigned int (*func) (unsigned int))
4857 char table[0x110000];
4859 bool need_semicolon;
4860 const int max_column = 75;
/* Mark the code points that FUNC maps to something else.  */
4863 for (i = 0; i < 0x110000; i++)
4864 table[i] = (func (i) != i);
4866 fprintf (stream, "%s ", mapname);
4867 need_semicolon = false;
4869 for (i = 0; i < 0x110000; i++)
4875 strcat (buf, ucs_symbol (i));
4877 strcat (buf, ucs_symbol (func (i)));
4882 fprintf (stream, ";");
4886 if (column + strlen (buf) > max_column)
4888 fprintf (stream, "/\n ");
4892 fprintf (stream, "%s", buf);
4893 column += strlen (buf);
4894 need_semicolon = true;
4896 fprintf (stream, "\n");
/* Output the width table.
   Currently a stub: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream; /* intentionally unused */
}
4906 /* Output the tables to the given file. */
/* Writes FILENAME in three steps:
     1. a header (escape_char, comment_char, "Generated automatically" line)
        and an LC_IDENTIFICATION section carrying VERSION and today's date;
     2. consistency checks over every code point below 0x110000; each
        violation is only reported on stderr;
     3. an LC_CTYPE section with the character classes, the toupper, tolower
        and totitle maps, and the width map.
   NOTE(review): this listing lacks its short lines (return type, braces,
   declarations, the check after fopen, the `fprintf (stderr,' openers of
   several diagnostics, ...), and whitespace runs inside the string literals
   appear collapsed.  Confirm against the complete file.  */
4909 output_tables (const char *filename, const char *version)
4914 stream = fopen (filename, "w");
4917 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4921 fprintf (stream, "escape_char /\n");
4922 fprintf (stream, "comment_char %%\n");
4923 fprintf (stream, "\n");
4924 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4926 fprintf (stream, "\n");
4928 fprintf (stream, "LC_IDENTIFICATION\n");
4929 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4930 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4931 fprintf (stream, "address \"\"\n");
4932 fprintf (stream, "contact \"\"\n");
4933 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4934 fprintf (stream, "tel \"\"\n");
4935 fprintf (stream, "fax \"\"\n");
4936 fprintf (stream, "language \"\"\n");
4937 fprintf (stream, "territory \"Earth\"\n");
4938 fprintf (stream, "revision \"%s\"\n", version);
/* The date is formatted as YYYY-MM-DD from gmtime (&now).  */
4943 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4944 fprintf (stream, "date \"%s\"\n", date);
4946 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4947 fprintf (stream, "END LC_IDENTIFICATION\n");
4948 fprintf (stream, "\n");
4950 /* Verifications. */
4951 for (ch = 0; ch < 0x110000; ch++)
4953 /* toupper restriction: "Only characters specified for the keywords
4954 lower and upper shall be specified." */
4955 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4957 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4958 ucs_symbol (ch), ch, to_upper (ch));
4960 /* tolower restriction: "Only characters specified for the keywords
4961 lower and upper shall be specified." */
4962 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4964 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4965 ucs_symbol (ch), ch, to_lower (ch));
4967 /* alpha restriction: "Characters classified as either upper or lower
4968 shall automatically belong to this class." */
4969 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4970 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4972 /* alpha restriction: "No character specified for the keywords cntrl,
4973 digit, punct or space shall be specified." */
4974 if (is_alpha (ch) && is_cntrl (ch))
4975 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4976 if (is_alpha (ch) && is_digit (ch))
4977 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4978 if (is_alpha (ch) && is_punct (ch))
4979 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4980 if (is_alpha (ch) && is_space (ch))
4981 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4983 /* space restriction: "No character specified for the keywords upper,
4984 lower, alpha, digit, graph or xdigit shall be specified."
4985 upper, lower, alpha already checked above. */
4986 if (is_space (ch) && is_digit (ch))
4987 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4988 if (is_space (ch) && is_graph (ch))
4989 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4990 if (is_space (ch) && is_xdigit (ch))
4991 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4993 /* cntrl restriction: "No character specified for the keywords upper,
4994 lower, alpha, digit, punct, graph, print or xdigit shall be
4995 specified." upper, lower, alpha already checked above. */
4996 if (is_cntrl (ch) && is_digit (ch))
4997 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
4998 if (is_cntrl (ch) && is_punct (ch))
4999 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5000 if (is_cntrl (ch) && is_graph (ch))
5001 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5002 if (is_cntrl (ch) && is_print (ch))
5003 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_xdigit (ch))
5005 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5007 /* punct restriction: "No character specified for the keywords upper,
5008 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5009 be specified." upper, lower, alpha, cntrl already checked above. */
5010 if (is_punct (ch) && is_digit (ch))
5011 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5012 if (is_punct (ch) && is_xdigit (ch))
5013 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5014 if (is_punct (ch) && (ch == 0x0020))
5015 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5017 /* graph restriction: "No character specified for the keyword cntrl
5018 shall be specified." Already checked above. */
5020 /* print restriction: "No character specified for the keyword cntrl
5021 shall be specified." Already checked above. */
5023 /* graph - print relation: differ only in the <space> character.
5024 How is this possible if there are more than one space character?!
5025 I think susv2/xbd/locale.html should speak of "space characters",
5026 not "space character". */
/* Note the asymmetry: the first check accepts any is_space character, the
   second only U+0020.  */
5027 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5029 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5030 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5032 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5035 fprintf (stream, "LC_CTYPE\n");
5036 output_charclass (stream, "upper", is_upper);
5037 output_charclass (stream, "lower", is_lower);
5038 output_charclass (stream, "alpha", is_alpha);
5039 output_charclass (stream, "digit", is_digit);
5040 output_charclass (stream, "outdigit", is_outdigit);
5041 output_charclass (stream, "blank", is_blank);
5042 output_charclass (stream, "space", is_space);
5043 output_charclass (stream, "cntrl", is_cntrl);
5044 output_charclass (stream, "punct", is_punct);
5045 output_charclass (stream, "xdigit", is_xdigit);
5046 output_charclass (stream, "graph", is_graph);
5047 output_charclass (stream, "print", is_print);
5048 output_charclass (stream, "class \"combining\";", is_combining);
5049 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5050 output_charmap (stream, "toupper", to_upper);
5051 output_charmap (stream, "tolower", to_lower);
5052 output_charmap (stream, "map \"totitle\";", to_title);
5053 output_widthmap (stream);
5054 fprintf (stream, "END LC_CTYPE\n");
/* A write error detected by ferror, or a failing fclose, is reported on
   stderr.  */
5056 if (ferror (stream) || fclose (stream))
5058 fprintf (stderr, "error writing to '%s'\n", filename);
5065 /* ========================================================================= */
/* The width property from the EastAsianWidth.txt file.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by code point; filled in by fill_width.  */
const char * unicode_width[0x110000];
5071 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
/* First every code point gets the default: "N" if it has an entry in
   unicode_attributes[], NULL otherwise.  WIDTH_FILENAME is then read line
   by line.  field0 is the text up to ';' and holds a hexadecimal code point
   or a "first..last" range; field1, up to the next space, is the width
   value, stored as a strdup'ed copy; field2 is the rest of the line.
   NOTE(review): this listing lacks its short lines (return type, braces,
   the declarations of i, j, stream, n, c and lineno, the check after fopen,
   the reading loop with its end-of-file and comment-line tests, the loop
   over a range, the exit calls, ...).  Confirm the control flow against the
   complete file.
   NOTE(review): the strdup results are stored unchecked and never freed
   here.  */
5074 fill_width (const char *width_filename)
5078 char field0[FIELDLEN];
5079 char field1[FIELDLEN];
5080 char field2[FIELDLEN];
5083 for (i = 0; i < 0x110000; i++)
5084 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5086 stream = fopen (width_filename, "r");
5089 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Skip the remainder of the current line.  */
5104 do c = getc (stream); while (c != EOF && c != '\n');
/* n accumulates the getfield results of the three fields.  */
5108 n = getfield (stream, field0, ';');
5109 n += getfield (stream, field1, ' ');
5110 n += getfield (stream, field2, '\n');
5115 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5118 i = strtoul (field0, NULL, 16);
5119 if (strstr (field0, "..") != NULL)
5121 /* Deal with a range. */
5122 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5124 unicode_width[i] = strdup (field1);
5128 /* Single character line. */
5129 unicode_width[i] = strdup (field1);
5132 if (ferror (stream) || fclose (stream))
5134 fprintf (stderr, "error reading from '%s'\n", width_filename);
/* Line breaking classification.  */

/* Line breaking classes.  The values are bit positions: get_lbp combines
   them as `1 << LBP_xx'.  They are assigned explicitly and not in
   declaration order, so that the classes resolved at run time occupy
   24..31 and all others 0..23.  */
enum
{
  /* Values >= 24 are resolved at run time.  */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5182 /* Returns the line breaking classification for ch, as a bit mask. */
/* Bit LBP_xx of the result is set when ch is assigned to class xx.  The
   classes are tested in a fixed order because later tests consult bits set
   by earlier ones (PR is suppressed by PO; CM by BK, BA, GL, SA, WJ and ZW;
   ID/AI by NS and CM; AL/AI by most other classes).  Only code points that
   have a name in unicode_attributes[] are classified by the tests below.
   NOTE(review): with LBP_XX == 31, 1 << LBP_XX does not fit a 32-bit int;
   the declaration of attr is not visible in this excerpt -- confirm its
   type.  */
5184 get_lbp (unsigned int ch)
5188 if (unicode_attributes[ch].name != NULL)
5190 /* mandatory break */
5191 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5192 || ch == 0x000C /* form feed */
5193 || ch == 0x000B /* line tabulation */
5194 || ch == 0x2028 /* LINE SEPARATOR */
5195 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5196 attr |= 1 << LBP_BK;
/* word joiner */
5198 if (ch == 0x2060 /* WORD JOINER */
5199 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5200 attr |= 1 << LBP_WJ;
5202 /* zero width space */
5203 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5204 attr |= 1 << LBP_ZW;
5206 /* non-breaking (glue) */
5207 if (ch == 0x00A0 /* NO-BREAK SPACE */
5208 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5209 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5210 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5211 || ch == 0x2007 /* FIGURE SPACE */
5212 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5213 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5214 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5215 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5216 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5217 attr |= 1 << LBP_GL;
/* space */
5220 if (ch == 0x0020 /* SPACE */)
5221 attr |= 1 << LBP_SP;
5223 /* break opportunity before and after */
5224 if (ch == 0x2014 /* EM DASH */)
5225 attr |= 1 << LBP_B2;
5227 /* break opportunity after */
5228 if (ch == 0x1680 /* OGHAM SPACE MARK */
5229 || ch == 0x2000 /* EN QUAD */
5230 || ch == 0x2001 /* EM QUAD */
5231 || ch == 0x2002 /* EN SPACE */
5232 || ch == 0x2003 /* EM SPACE */
5233 || ch == 0x2004 /* THREE-PER-EM SPACE */
5234 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5235 || ch == 0x2006 /* SIX-PER-EM SPACE */
5236 || ch == 0x2008 /* PUNCTUATION SPACE */
5237 || ch == 0x2009 /* THIN SPACE */
5238 || ch == 0x200A /* HAIR SPACE */
5239 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5240 || ch == 0x0009 /* tab */
5241 || ch == 0x00AD /* SOFT HYPHEN */
5242 || ch == 0x058A /* ARMENIAN HYPHEN */
5243 || ch == 0x2010 /* HYPHEN */
5244 || ch == 0x2012 /* FIGURE DASH */
5245 || ch == 0x2013 /* EN DASH */
5246 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5247 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5248 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5249 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5250 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5251 || ch == 0x2027 /* HYPHENATION POINT */
5252 || ch == 0x007C /* VERTICAL LINE */
5253 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5254 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5255 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5256 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5257 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5258 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5259 || ch == 0x205A /* TWO DOT PUNCTUATION */
5260 || ch == 0x205B /* FOUR DOT MARK */
5261 || ch == 0x205D /* TRICOLON */
5262 || ch == 0x205E /* VERTICAL FOUR DOTS */
5263 || ch == 0x2E19 /* PALM BRANCH */
5264 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5265 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5266 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5267 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5268 || ch == 0x2E30 /* RING POINT */
5269 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5270 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5271 || ch == 0x10102 /* AEGEAN CHECK MARK */
5272 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5273 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5274 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5275 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5276 || ch == 0x0964 /* DEVANAGARI DANDA */
5277 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5278 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5279 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5280 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5281 || ch == 0x104B /* MYANMAR SIGN SECTION */
5282 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5283 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5284 || ch == 0x17D4 /* KHMER SIGN KHAN */
5285 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5286 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5287 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5288 || ch == 0xA8CE /* SAURASHTRA DANDA */
5289 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5290 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5291 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5292 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5293 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5294 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5295 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5296 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5297 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5298 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5299 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5300 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
/* NOTE(review): 0x1802, 0x1803, 0x1808, 0x1809, 0x2CF9 and 0x2CFE are listed
   here and again under exclamation/interrogation below, so as written both
   the BA and the EX bit get set for them -- confirm whether conditional
   compilation (not visible in this excerpt) is meant to select one list.  */
5302 || ch == 0x1802 /* MONGOLIAN COMMA */
5303 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5305 || ch == 0x1804 /* MONGOLIAN COLON */
5306 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5308 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5309 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5311 || ch == 0x1B5A /* BALINESE PANTI */
5312 || ch == 0x1B5B /* BALINESE PAMADA */
5313 || ch == 0x1B5C /* BALINESE WINDU */
5314 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5315 || ch == 0x1B60 /* BALINESE PAMENENG */
5316 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5317 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5318 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5319 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5320 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5321 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5322 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5324 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5326 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5327 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5328 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5330 || ch == 0x2CFE /* COPTIC FULL STOP */
5332 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5333 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5334 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5335 || ch == 0xA60D /* VAI COMMA */
5336 || ch == 0xA60F /* VAI QUESTION MARK */
5337 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5338 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5339 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5340 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5341 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5342 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5343 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5344 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5345 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5347 || ch == 0x1A1E /* BUGINESE PALLAWA */
5349 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5350 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5351 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5352 attr |= 1 << LBP_BA;
5354 /* break opportunity before */
5355 if (ch == 0x00B4 /* ACUTE ACCENT */
5357 || ch == 0x1FFD /* GREEK OXIA */
5358 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5360 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5361 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5362 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5363 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5364 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5365 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5366 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5367 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5368 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5369 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5370 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5371 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5372 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5373 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5374 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5375 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5376 attr |= 1 << LBP_BB;
/* hyphen */
5379 if (ch == 0x002D /* HYPHEN-MINUS */)
5380 attr |= 1 << LBP_HY;
5382 /* contingent break opportunity */
5383 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5384 attr |= 1 << LBP_CB;
5386 /* closing punctuation */
5387 if ((unicode_attributes[ch].category[0] == 'P'
5388 && unicode_attributes[ch].category[1] == 'e')
5389 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5390 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5391 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5392 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5393 || ch == 0xFE50 /* SMALL COMMA */
5394 || ch == 0xFE52 /* SMALL FULL STOP */
5395 || ch == 0xFF0C /* FULLWIDTH COMMA */
5396 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5397 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5398 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5399 attr |= 1 << LBP_CL;
5401 /* exclamation/interrogation */
5402 if (ch == 0x0021 /* EXCLAMATION MARK */
5403 || ch == 0x003F /* QUESTION MARK */
5404 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5406 || ch == 0x060C /* ARABIC COMMA */
5408 || ch == 0x061B /* ARABIC SEMICOLON */
5409 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5410 || ch == 0x061F /* ARABIC QUESTION MARK */
5412 || ch == 0x066A /* ARABIC PERCENT SIGN */
5414 || ch == 0x06D4 /* ARABIC FULL STOP */
5415 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5416 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5417 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5418 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5419 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5420 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5421 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5423 || ch == 0x1802 /* MONGOLIAN COMMA */
5424 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5425 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5426 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5428 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5429 || ch == 0x1945 /* LIMBU QUESTION MARK */
5430 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5431 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5433 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5434 || ch == 0x2CFE /* COPTIC FULL STOP */
5436 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5437 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5438 || ch == 0xA60E /* VAI FULL STOP */
5439 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5440 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5441 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5442 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5443 || ch == 0xFE56 /* SMALL QUESTION MARK */
5444 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5445 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5446 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5447 attr |= 1 << LBP_EX;
/* inseparable */
5450 if (ch == 0x2024 /* ONE DOT LEADER */
5451 || ch == 0x2025 /* TWO DOT LEADER */
5452 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5453 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5454 attr |= 1 << LBP_IN;
/* non starter */
5457 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5458 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5459 || ch == 0x203D /* INTERROBANG */
5460 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5461 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5462 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5463 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5464 || ch == 0x301C /* WAVE DASH */
5465 || ch == 0x303C /* MASU MARK */
5466 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5467 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5468 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5469 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5470 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5471 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5472 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5473 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5474 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5475 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5476 || ch == 0xA015 /* YI SYLLABLE WU */
5477 || ch == 0xFE54 /* SMALL SEMICOLON */
5478 || ch == 0xFE55 /* SMALL COLON */
5479 || ch == 0xFF1A /* FULLWIDTH COLON */
5480 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5481 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5482 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5483 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5484 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5485 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5486 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5487 attr |= 1 << LBP_NS;
5489 /* opening punctuation */
5490 if ((unicode_attributes[ch].category[0] == 'P'
5491 && unicode_attributes[ch].category[1] == 's')
5493 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5494 || ch == 0x00BF /* INVERTED QUESTION MARK */
5496 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5497 attr |= 1 << LBP_OP;
5499 /* ambiguous quotation */
5500 if ((unicode_attributes[ch].category[0] == 'P'
5501 && (unicode_attributes[ch].category[1] == 'f'
5502 || unicode_attributes[ch].category[1] == 'i'))
5503 || ch == 0x0022 /* QUOTATION MARK */
5504 || ch == 0x0027 /* APOSTROPHE */
5505 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5506 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5507 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5508 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5509 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5510 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5511 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5512 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5513 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5514 || ch == 0x2E0B /* RAISED SQUARE */)
5515 attr |= 1 << LBP_QU;
5517 /* infix separator (numeric) */
5518 if (ch == 0x002C /* COMMA */
5519 || ch == 0x002E /* FULL STOP */
5520 || ch == 0x003A /* COLON */
5521 || ch == 0x003B /* SEMICOLON */
5522 || ch == 0x037E /* GREEK QUESTION MARK */
5523 || ch == 0x0589 /* ARMENIAN FULL STOP */
5525 || ch == 0x060C /* ARABIC COMMA */
5527 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5528 || ch == 0x07F8 /* NKO COMMA */
5529 || ch == 0x2044 /* FRACTION SLASH */
5530 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5531 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5532 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5533 attr |= 1 << LBP_IS;
/* numeric: decimal digits other than the fullwidth ones, plus two Arabic
   separators */
5536 if ((unicode_attributes[ch].category[0] == 'N'
5537 && unicode_attributes[ch].category[1] == 'd'
5538 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5539 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5540 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5541 attr |= 1 << LBP_NU;
5543 /* postfix (numeric) */
5544 if (ch == 0x0025 /* PERCENT SIGN */
5545 || ch == 0x00A2 /* CENT SIGN */
5546 || ch == 0x00B0 /* DEGREE SIGN */
5547 || ch == 0x060B /* AFGHANI SIGN */
5549 || ch == 0x066A /* ARABIC PERCENT SIGN */
5551 || ch == 0x2030 /* PER MILLE SIGN */
5552 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5553 || ch == 0x2032 /* PRIME */
5554 || ch == 0x2033 /* DOUBLE PRIME */
5555 || ch == 0x2034 /* TRIPLE PRIME */
5556 || ch == 0x2035 /* REVERSED PRIME */
5557 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5558 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5559 || ch == 0x20A7 /* PESETA SIGN */
5560 || ch == 0x2103 /* DEGREE CELSIUS */
5561 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5562 || ch == 0xFDFC /* RIAL SIGN */
5563 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5564 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5565 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */)
5566 attr |= 1 << LBP_PO;
5568 /* prefix (numeric) */
5569 if ((unicode_attributes[ch].category[0] == 'S'
5570 && unicode_attributes[ch].category[1] == 'c')
5571 || ch == 0x002B /* PLUS SIGN */
5572 || ch == 0x005C /* REVERSE SOLIDUS */
5573 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5574 || ch == 0x2116 /* NUMERO SIGN */
5575 || ch == 0x2212 /* MINUS SIGN */
5576 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* Characters already classified as postfix are not also made prefix.  */
5577 if (!(attr & (1 << LBP_PO)))
5578 attr |= 1 << LBP_PR;
5580 /* symbols allowing breaks */
5581 if (ch == 0x002F /* SOLIDUS */)
5582 attr |= 1 << LBP_SY;
/* Hangul LV syllable: offset from 0xAC00 is a multiple of 28 */
5584 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5585 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: every other code point of the same range */
5587 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5588 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5590 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5591 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5593 if (ch >= 0x1160 && ch <= 0x11A2)
5594 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5596 if (ch >= 0x11A8 && ch <= 0x11F9)
5597 attr |= 1 << LBP_JT;
5599 /* complex context (South East Asian) */
5600 if (((unicode_attributes[ch].category[0] == 'C'
5601 && unicode_attributes[ch].category[1] == 'f')
5602 || (unicode_attributes[ch].category[0] == 'L'
5603 && (unicode_attributes[ch].category[1] == 'm'
5604 || unicode_attributes[ch].category[1] == 'o'))
5605 || (unicode_attributes[ch].category[0] == 'M'
5606 && (unicode_attributes[ch].category[1] == 'c'
5607 || unicode_attributes[ch].category[1] == 'n'))
5608 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5609 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5610 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5611 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5612 || (ch >= 0x1000 && ch <= 0x109F)
5613 || (ch >= 0x1780 && ch <= 0x17FF)
5614 || (ch >= 0x1950 && ch <= 0x19DF)))
5615 attr |= 1 << LBP_SA;
5617 /* attached characters and combining marks */
5618 if ((unicode_attributes[ch].category[0] == 'M'
5619 && (unicode_attributes[ch].category[1] == 'c'
5620 || unicode_attributes[ch].category[1] == 'e'
5621 || unicode_attributes[ch].category[1] == 'n'))
5622 || (unicode_attributes[ch].category[0] == 'C'
5623 && (unicode_attributes[ch].category[1] == 'c'
5624 || unicode_attributes[ch].category[1] == 'f')))
/* Only when none of BK, BA, GL, SA, WJ, ZW has been assigned above.  */
5625 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5626 attr |= 1 << LBP_CM;
/* ideographic */
5629 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5630 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5631 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5632 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5633 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5634 || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */
5635 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5636 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5637 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5638 || ch == 0xFE62 /* SMALL PLUS SIGN */
5639 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5640 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5641 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5642 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5643 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5644 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5645 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5646 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5647 || (ch >= 0x3000 && ch <= 0x33FF
5648 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5649 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5650 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5651 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5652 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5653 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5654 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5655 || ch == 0xFE45 /* SESAME DOT */
5656 || ch == 0xFE46 /* WHITE SESAME DOT */
5657 || ch == 0xFE49 /* DASHED OVERLINE */
5658 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5659 || ch == 0xFE4B /* WAVY OVERLINE */
5660 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5661 || ch == 0xFE4D /* DASHED LOW LINE */
5662 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5663 || ch == 0xFE4F /* WAVY LOW LINE */
5664 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5665 || ch == 0xFE58 /* SMALL EM DASH */
5666 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5667 || ch == 0xFE60 /* SMALL AMPERSAND */
5668 || ch == 0xFE61 /* SMALL ASTERISK */
5669 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5670 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5671 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5672 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5673 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5674 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5675 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5676 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5677 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5678 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5679 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5680 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5681 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5682 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5683 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5684 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5685 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5686 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5687 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5688 || ch == 0xFF5E /* FULLWIDTH TILDE */
5689 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5690 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5691 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Non-starters and combining marks keep their earlier classification.  */
5692 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5694 /* ambiguous (ideograph) ? */
5695 if ((unicode_width[ch] != NULL
5696 && unicode_width[ch][0] == 'A'
5698 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5699 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5700 attr |= 1 << LBP_AI;
5702 attr |= 1 << LBP_ID;
5705 /* ordinary alphabetic and symbol characters */
5706 if ((unicode_attributes[ch].category[0] == 'L'
5707 && (unicode_attributes[ch].category[1] == 'u'
5708 || unicode_attributes[ch].category[1] == 'l'
5709 || unicode_attributes[ch].category[1] == 't'
5710 || unicode_attributes[ch].category[1] == 'm'
5711 || unicode_attributes[ch].category[1] == 'o'))
5712 || (unicode_attributes[ch].category[0] == 'S'
5713 && (unicode_attributes[ch].category[1] == 'm'
5714 || unicode_attributes[ch].category[1] == 'k'
5715 || unicode_attributes[ch].category[1] == 'o'))
5716 || (unicode_attributes[ch].category[0] == 'N'
5717 && (unicode_attributes[ch].category[1] == 'l'
5718 || unicode_attributes[ch].category[1] == 'o'))
5719 || (unicode_attributes[ch].category[0] == 'P'
5720 && (unicode_attributes[ch].category[1] == 'c'
5721 || unicode_attributes[ch].category[1] == 'd'
5722 || unicode_attributes[ch].category[1] == 'o'))
5723 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5724 || ch == 0x0601 /* ARABIC SIGN SANAH */
5725 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5726 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5727 || ch == 0x06DD /* ARABIC END OF AYAH */
5728 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5729 || ch == 0x2061 /* FUNCTION APPLICATION */
5730 || ch == 0x2062 /* INVISIBLE TIMES */
5731 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5732 || ch == 0x2064 /* INVISIBLE PLUS */)
/* Acts as a fallback: skipped when any of the listed classes was already
   assigned above.  */
5733 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5735 /* ambiguous (alphabetic) ? */
5736 if ((unicode_width[ch] != NULL
5737 && unicode_width[ch][0] == 'A'
5739 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5740 && ch != 0x2022 /* BULLET */
5741 && ch != 0x203E /* OVERLINE */
5742 && ch != 0x2126 /* OHM SIGN */
5743 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5744 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5745 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5746 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5747 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5748 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5749 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5750 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5752 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5753 || ch == 0x00A7 /* SECTION SIGN */
5754 || ch == 0x00A8 /* DIAERESIS */
5755 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5756 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5757 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5758 || ch == 0x00B6 /* PILCROW SIGN */
5759 || ch == 0x00B7 /* MIDDLE DOT */
5760 || ch == 0x00B8 /* CEDILLA */
5761 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5762 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5763 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5764 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5765 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5766 || ch == 0x00BF /* INVERTED QUESTION MARK */
5767 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5768 || ch == 0x00F7 /* DIVISION SIGN */
5769 || ch == 0x02C7 /* CARON */
5770 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5771 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5772 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5773 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5774 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5775 || ch == 0x02D8 /* BREVE */
5776 || ch == 0x02D9 /* DOT ABOVE */
5777 || ch == 0x02DA /* RING ABOVE */
5778 || ch == 0x02DB /* OGONEK */
5779 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5781 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5782 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5783 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5784 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5785 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5786 || ch == 0x2616 /* WHITE SHOGI PIECE */
5787 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5788 attr |= 1 << LBP_AI;
5790 attr |= 1 << LBP_AL;
/* Drop the combining-mark bit again.  */
5791 attr &= ~(1 << LBP_CM);
/* unknown.  NOTE(review): the guarding condition is not visible in this
   excerpt; presumably this applies only when no other bit was set.  */
5797 attr |= 1 << LBP_XX;
5802 /* Output the line breaking properties in a human readable format. */
/* For every code point whose get_lbp() mask is not exactly the LBP_XX bit,
   writes the code point in hexadecimal followed by the name of each class
   bit that is set in the mask, then a newline.  */
5804 debug_output_lbp (FILE *stream)
5808 for (i = 0; i < 0x110000; i++)
5810 int attr = get_lbp (i);
5811 if (attr != 1 << LBP_XX)
5813 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints " LBP_xx" when bit LBP_xx is set in attr.
   NOTE(review): the same macro name is defined again further down in this
   file; a matching #undef is not visible in this excerpt -- confirm.  */
5814 #define PRINT_BIT(attr,bit) \
5815 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5816 PRINT_BIT(attr,LBP_BK);
5817 PRINT_BIT(attr,LBP_CM);
5818 PRINT_BIT(attr,LBP_WJ);
5819 PRINT_BIT(attr,LBP_ZW);
5820 PRINT_BIT(attr,LBP_GL);
5821 PRINT_BIT(attr,LBP_SP);
5822 PRINT_BIT(attr,LBP_B2);
5823 PRINT_BIT(attr,LBP_BA);
5824 PRINT_BIT(attr,LBP_BB);
5825 PRINT_BIT(attr,LBP_HY);
5826 PRINT_BIT(attr,LBP_CB);
5827 PRINT_BIT(attr,LBP_CL);
5828 PRINT_BIT(attr,LBP_EX);
5829 PRINT_BIT(attr,LBP_IN);
5830 PRINT_BIT(attr,LBP_NS);
5831 PRINT_BIT(attr,LBP_OP);
5832 PRINT_BIT(attr,LBP_QU);
5833 PRINT_BIT(attr,LBP_IS);
5834 PRINT_BIT(attr,LBP_NU);
5835 PRINT_BIT(attr,LBP_PO);
5836 PRINT_BIT(attr,LBP_PR);
5837 PRINT_BIT(attr,LBP_SY);
5838 PRINT_BIT(attr,LBP_AI);
5839 PRINT_BIT(attr,LBP_AL);
5840 PRINT_BIT(attr,LBP_H2);
5841 PRINT_BIT(attr,LBP_H3);
5842 PRINT_BIT(attr,LBP_ID);
5843 PRINT_BIT(attr,LBP_JL);
5844 PRINT_BIT(attr,LBP_JV);
5845 PRINT_BIT(attr,LBP_JT);
5846 PRINT_BIT(attr,LBP_SA);
5847 PRINT_BIT(attr,LBP_XX);
5849 fprintf (stream, "\n");
/* Writes the computed line breaking properties (see debug_output_lbp) to the
   file named filename.  Failure to open the file, a write error, or a
   failing fclose is reported on stderr.  */
5855 debug_output_lbrk_tables (const char *filename)
5859 stream = fopen (filename, "w");
5862 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5866 debug_output_lbp (stream);
/* fclose is part of the check so that errors detected while flushing are
   reported too.  */
5868 if (ferror (stream) || fclose (stream))
5870 fprintf (stderr, "error writing to '%s'\n", filename);
5875 /* The line breaking property from the LineBreak.txt file. */
/* Indexed by code point; each entry is a single LBP_xx value (not a bit
   mask), LBP_XX until fill_org_lbp() assigns something else.  */
5876 int unicode_org_lbp[0x110000];
5878 /* Stores in unicode_org_lbp[] the line breaking property from the
5879 LineBreak.txt file. */
/* Every entry is first reset to LBP_XX.  Each data line is split into
   field0 (a code point or a "first..last" range, in hexadecimal), field1
   (the property name) and field2 (the remainder of the line).  Open and
   read failures, short lines and unknown property names are reported on
   stderr.  */
5881 fill_org_lbp (const char *linebreak_filename)
5885 char field0[FIELDLEN];
5886 char field1[FIELDLEN];
5887 char field2[FIELDLEN];
5890 for (i = 0; i < 0x110000; i++)
5891 unicode_org_lbp[i] = LBP_XX;
5893 stream = fopen (linebreak_filename, "r");
5896 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Skips the rest of the current line.  NOTE(review): the condition that
   selects this path is not visible in this excerpt -- confirm.  */
5912 do c = getc (stream); while (c != EOF && c != '\n');
5916 n = getfield (stream, field0, ';');
5917 n += getfield (stream, field1, ' ');
5918 n += getfield (stream, field2, '\n');
5923 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit + 4 skips the "LBP_" prefix of the stringified constant, so that
   e.g. TRY(LBP_BK) compares field1 with "BK".  */
5927 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property names without a constant of their own: LF, CR and NL are folded
   into LBP_BK, SG into LBP_XX.  */
5962 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5963 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5964 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5965 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5968 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5969 field1, linebreak_filename, lineno);
/* strtoul stops at the first non-hex character, so for a range this yields
   the first code point.  */
5972 i = strtoul (field0, NULL, 16);
5973 if (strstr (field0, "..") != NULL)
5975 /* Deal with a range. */
5976 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
/* NOTE(review): presumably executed for every code point from i to j; the
   loop header is not visible in this excerpt.  */
5978 unicode_org_lbp[i] = value;
5982 /* Single character line. */
5983 unicode_org_lbp[i] = value;
5986 if (ferror (stream) || fclose (stream))
5988 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5993 /* Output the line breaking properties in a human readable format. */
/* Works on unicode_org_lbp[], whose entries are single LBP_xx values rather
   than bit masks; PRINT_BIT therefore tests for equality and at most one
   class name is printed after the hexadecimal code point.  */
5995 debug_output_org_lbp (FILE *stream)
5999 for (i = 0; i < 0x110000; i++)
6001 int attr = unicode_org_lbp[i];
6004 fprintf (stream, "0x%04X", i);
6005 #define PRINT_BIT(attr,bit) \
6006 if (attr == bit) fprintf (stream, " " #bit);
6007 PRINT_BIT(attr,LBP_BK);
6008 PRINT_BIT(attr,LBP_CM);
6009 PRINT_BIT(attr,LBP_WJ);
6010 PRINT_BIT(attr,LBP_ZW);
6011 PRINT_BIT(attr,LBP_GL);
6012 PRINT_BIT(attr,LBP_SP);
6013 PRINT_BIT(attr,LBP_B2);
6014 PRINT_BIT(attr,LBP_BA);
6015 PRINT_BIT(attr,LBP_BB);
6016 PRINT_BIT(attr,LBP_HY);
6017 PRINT_BIT(attr,LBP_CB);
6018 PRINT_BIT(attr,LBP_CL);
6019 PRINT_BIT(attr,LBP_EX);
6020 PRINT_BIT(attr,LBP_IN);
6021 PRINT_BIT(attr,LBP_NS);
6022 PRINT_BIT(attr,LBP_OP);
6023 PRINT_BIT(attr,LBP_QU);
6024 PRINT_BIT(attr,LBP_IS);
6025 PRINT_BIT(attr,LBP_NU);
6026 PRINT_BIT(attr,LBP_PO);
6027 PRINT_BIT(attr,LBP_PR);
6028 PRINT_BIT(attr,LBP_SY);
6029 PRINT_BIT(attr,LBP_AI);
6030 PRINT_BIT(attr,LBP_AL);
6031 PRINT_BIT(attr,LBP_H2);
6032 PRINT_BIT(attr,LBP_H3);
6033 PRINT_BIT(attr,LBP_ID);
6034 PRINT_BIT(attr,LBP_JL);
6035 PRINT_BIT(attr,LBP_JV);
6036 PRINT_BIT(attr,LBP_JT);
6037 PRINT_BIT(attr,LBP_SA);
6038 PRINT_BIT(attr,LBP_XX);
6040 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_org_lbp to the file FILENAME.  */
6046 debug_output_org_lbrk_tables (const char *filename)
/* Open the output file for writing.  */
6050 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen.  NOTE(review): the test guarding this
   message and whatever follows it are not visible in this listing.  */
6053 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6057 debug_output_org_lbp (stream);
/* Report a pending write error on the stream or a failing fclose.  */
6059 if (ferror (stream) || fclose (stream))
6061 fprintf (stderr, "error writing to '%s'\n", filename);
6066 /* Construction of sparse 3-level tables. */
/* TABLE, ELEMENT and DEFAULT are set to lbp_table, unsigned char and LBP_XX.
   NOTE(review): presumably they parameterize a table template that is
   included right after these lines (the include is not visible in this
   listing) and that provides the lbp_table_init, lbp_table_add and
   lbp_table_finalize functions called by output_lbp - confirm.  */
6067 #define TABLE lbp_table
6068 #define ELEMENT unsigned char
6069 #define DEFAULT LBP_XX
/* Map the x-prefixed allocator names onto the C library functions.  */
6070 #define xmalloc malloc
6071 #define xrealloc realloc
/* Builds the 3-level table of line breaking properties and writes it as C
   source: the declarations to STREAM1 and the initialized table to STREAM2.
   NOTE(review): several lines of this function (declarations, braces, the
   targets of some assignments, the body of a switch) are missing from this
   listing; the comments below cover the visible statements only.  */
6075 output_lbp (FILE *stream1, FILE *stream2)
6079 unsigned int level1_offset, level2_offset, level3_offset;
6083 lbp_table_init (&t);
/* Enter the property of every code point into the table.  */
6085 for (i = 0; i < 0x110000; i++)
6087 int attr = get_lbp (i);
6089 /* Now attr should contain exactly one bit. */
/* The condition holds when attr is zero or has more than one bit set.  */
6090 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is the LBP_XX bit are not added to the table.  */
6093 if (attr != 1 << LBP_XX)
6095 unsigned int log2_attr;
/* Count the right shifts needed to reduce attr to 1, i.e. the index of its
   set bit; that index is the value stored in the table.  */
6096 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6098 lbp_table_add (&t, i, log2_attr);
6102 lbp_table_finalize (&t);
/* Byte offsets made of a block of five uint32_t words, then level1_size
   words, then (level2_size << q) words.  NOTE(review): presumably these are
   the values of level1_offset, level2_offset and level3_offset; the
   assignment targets are not visible in this listing.  */
6105 5 * sizeof (uint32_t);
6107 5 * sizeof (uint32_t)
6108 + t.level1_size * sizeof (uint32_t);
6110 5 * sizeof (uint32_t)
6111 + t.level1_size * sizeof (uint32_t)
6112 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become the preprocessor
   constants lbrkprop_header_0 .. lbrkprop_header_4.  */
6114 for (i = 0; i < 5; i++)
6115 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6116 ((uint32_t *) t.result)[i]);
6117 fprintf (stream1, "\n");
/* Declaration of the table type, with one array per level, and of the
   table object.  */
6118 fprintf (stream1, "typedef struct\n");
6119 fprintf (stream1, " {\n");
6120 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6121 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6122 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6123 fprintf (stream1, " }\n");
6124 fprintf (stream1, "lbrkprop_t;\n");
6125 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
/* Definition of the table object, one brace-enclosed initializer per level.  */
6127 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6128 fprintf (stream2, "{\n");
/* Level 1.  A line break is emitted before every eighth entry.  A zero
   offset is printed as -1, any other offset as an index in uint32_t units
   relative to level2_offset.  */
6129 fprintf (stream2, " {");
6130 if (t.level1_size > 8)
6131 fprintf (stream2, "\n ");
6132 for (i = 0; i < t.level1_size; i++)
6135 if (i > 0 && (i % 8) == 0)
6136 fprintf (stream2, "\n ");
6137 offset = ((uint32_t *) (t.result + level1_offset))[i];
6138 fprintf (stream2, " %5zd%s",
6139 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6140 (i+1 < t.level1_size ? "," : ""));
6142 if (t.level1_size > 8)
6143 fprintf (stream2, "\n ");
6144 fprintf (stream2, " },\n");
/* Level 2, laid out like level 1.  Nonzero offsets are printed relative to
   level3_offset, in uint8_t units.  */
6145 fprintf (stream2, " {");
6146 if (t.level2_size << t.q > 8)
6147 fprintf (stream2, "\n ");
6148 for (i = 0; i < t.level2_size << t.q; i++)
6151 if (i > 0 && (i % 8) == 0)
6152 fprintf (stream2, "\n ");
6153 offset = ((uint32_t *) (t.result + level2_offset))[i];
6154 fprintf (stream2, " %5zd%s",
6155 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6156 (i+1 < t.level2_size << t.q ? "," : ""));
6158 if (t.level2_size << t.q > 8)
6159 fprintf (stream2, "\n ");
6160 fprintf (stream2, " },\n");
/* Level 3.  Every byte is printed through value_string rather than as a
   number.  NOTE(review): value_string is presumably set by a switch on
   value that is built from CASE lines; those lines are not visible here.  */
6161 fprintf (stream2, " {");
6162 if (t.level3_size << t.p > 8)
6163 fprintf (stream2, "\n ");
6164 for (i = 0; i < t.level3_size << t.p; i++)
6166 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6167 const char *value_string;
/* CASE(x) expands to a case label that stores the spelling of x in
   value_string.  */
6170 #define CASE(x) case x: value_string = #x; break;
6207 if (i > 0 && (i % 8) == 0)
6208 fprintf (stream2, "\n ");
6209 fprintf (stream2, " %s%s", value_string,
6210 (i+1 < t.level3_size << t.p ? "," : ""));
6212 if (t.level3_size << t.p > 8)
6213 fprintf (stream2, "\n ");
6214 fprintf (stream2, " }\n");
6215 fprintf (stream2, "};\n");
/* Writes the line breaking property tables to the files FILENAME1 and
   FILENAME2.  Both files start with the same generated-file banner and
   license text; the table itself is produced by output_lbp.
   NOTE(review): VERSION is presumably the argument of the banner line that
   mentions Unicode; the continuation of that call is not visible in this
   listing.  */
6219 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6221 const char *filenames[2];
6225 filenames[0] = filename1;
6226 filenames[1] = filename2;
/* Open both output files for writing.  NOTE(review): what follows the
   diagnostic on failure is not visible in this listing.  */
6228 for (i = 0; i < 2; i++)
6230 streams[i] = fopen (filenames[i], "w");
6231 if (streams[i] == NULL)
6233 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the identical banner and license header to each file.
   NOTE(review): the banner names the generator gen-lbrk, while the usage
   comment at the head of this file calls the program gen-uni-tables;
   confirm whether the banner text is stale.  */
6238 for (i = 0; i < 2; i++)
6240 FILE *stream = streams[i];
6242 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6243 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6244 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6246 fprintf (stream, "\n");
6248 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6249 still carries the GPL header), and it's gnulib-tool which replaces the
6250 GPL header with an LGPL header. */
6251 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6252 fprintf (stream, "\n");
6253 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6254 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6255 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6256 fprintf (stream, " (at your option) any later version.\n");
6257 fprintf (stream, "\n");
6258 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6259 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6260 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6261 fprintf (stream, " GNU General Public License for more details.\n");
6262 fprintf (stream, "\n");
6263 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6264 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6265 fprintf (stream, "\n");
/* The first file receives the declarations, the second one the data.  */
6268 output_lbp (streams[0], streams[1]);
/* Close both files, reporting a pending write error or a failing fclose.  */
6270 for (i = 0; i < 2; i++)
6272 if (ferror (streams[i]) || fclose (streams[i]))
6274 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6280 /* ========================================================================= */
/* Program entry point.  Takes the names of the Unicode data files from the
   command line, passes them to the fill_* routines and then calls the
   output routines.
   NOTE(review): the argument-count test and the assignment of `version`
   are not visible in this listing; the usage text names the version as the
   last argument.  */
6283 main (int argc, char * argv[])
6285 const char *unicodedata_filename;
6286 const char *proplist_filename;
6287 const char *derivedproplist_filename;
6288 const char *scripts_filename;
6289 const char *blocks_filename;
6290 const char *proplist30_filename;
6291 const char *eastasianwidth_filename;
6292 const char *linebreak_filename;
6293 const char *version;
/* Usage text: eight input files followed by the version.  */
6297 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt version\n",
/* The input file names come from argv[1] .. argv[8], in the order given by
   the usage text.  */
6302 unicodedata_filename = argv[1];
6303 proplist_filename = argv[2];
6304 derivedproplist_filename = argv[3];
6305 scripts_filename = argv[4];
6306 blocks_filename = argv[5];
6307 proplist30_filename = argv[6];
6308 eastasianwidth_filename = argv[7];
6309 linebreak_filename = argv[8];
/* Hand each input file name to its fill_* routine.  clear_properties runs
   before the three property files are processed.  */
6312 fill_attributes (unicodedata_filename);
6313 clear_properties ();
6314 fill_properties (proplist_filename);
6315 fill_properties (derivedproplist_filename);
6316 fill_properties30 (proplist30_filename);
6317 fill_scripts (scripts_filename);
6318 fill_blocks (blocks_filename);
6319 fill_width (eastasianwidth_filename);
6320 fill_org_lbp (linebreak_filename);
/* Call the output routines, passing the version and, where one is given, an
   output file name.  All file names passed here are relative paths.  */
6322 output_categories (version);
6323 output_category ("unictype/categ_of.h", version);
6324 output_combclass ("unictype/combining.h", version);
6325 output_bidi_category ("unictype/bidi_of.h", version);
6326 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
6327 output_decimal_digit ("unictype/decdigit.h", version);
6328 output_digit_test ("../tests/unictype/test-digit.h", version);
6329 output_digit ("unictype/digit.h", version);
6330 output_numeric_test ("../tests/unictype/test-numeric.h", version);
6331 output_numeric ("unictype/numeric.h", version);
6332 output_mirror ("unictype/mirror.h", version);
6333 output_properties (version);
6334 output_scripts (version);
6335 output_scripts_byname (version);
6336 output_blocks (version);
6337 output_ident_properties (version);
6338 output_old_ctype (version);
/* Line breaking outputs: two debugging listings, then the pair of generated
   headers.  */
6340 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
6341 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
6342 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
6348 * For Emacs M-x compile
6350 * compile-command: "
6351 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
6353 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
6354 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
6355 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
6356 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
6357 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
6358 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
6359 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
6360 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \