1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/Scripts.txt \
25 /usr/local/share/Unidata/Blocks.txt \
26 /usr/local/share/Unidata/PropList-3.0.1.txt \
27 /usr/local/share/Unidata/EastAsianWidth.txt \
28 /usr/local/share/Unidata/LineBreak.txt \
29 /usr/local/share/Unidata/WordBreakProperty.txt \
40 /* ========================================================================= */
42 /* Reading UnicodeData.txt. */
/* Holds the fields of one UnicodeData.txt line.  The string members keep
   the text of the corresponding column; the three case mappings are kept
   as numbers.  */
struct unicode_attribute
{
  /* Character name.  */
  const char *name;
  /* General category.  */
  const char *category;
  /* Canonical combining class.  */
  const char *combining;
  /* Bidirectional category.  */
  const char *bidi;
  /* Character decomposition mapping.  */
  const char *decomposition;
  /* Decimal digit value.  */
  const char *decdigit;
  /* Digit value.  */
  const char *digit;
  /* Numeric value.  */
  const char *numeric;
  /* Mirrored flag.  */
  bool mirrored;
  /* Old Unicode 1.0 name.  */
  const char *oldname;
  /* Comment.  */
  const char *comment;
  /* Uppercase mapping.  */
  unsigned int upper;
  /* Lowercase mapping.  */
  unsigned int lower;
  /* Titlecase mapping.  */
  unsigned int title;
};
64 /* Missing fields are represented with "" for strings, and NONE for
   characters. */
66 #define NONE (~(unsigned int)0)
68 /* The entire contents of the UnicodeData.txt file. */
/* The array has one slot per code point 0 .. 0x10FFFF.  NONE, defined above
   as the all-ones unsigned int, is what fill_attribute stores in the upper,
   lower and title members when the corresponding input field is empty.  */
69 struct unicode_attribute unicode_attributes [0x110000];
71 /* Stores in unicode_attributes[i] the values from the given fields. */
73 fill_attribute (unsigned int i,
74 const char *field1, const char *field2,
75 const char *field3, const char *field4,
76 const char *field5, const char *field6,
77 const char *field7, const char *field8,
78 const char *field9, const char *field10,
79 const char *field11, const char *field12,
80 const char *field13, const char *field14)
82 struct unicode_attribute * uni;
86 fprintf (stderr, "index too large\n");
89 if (strcmp (field2, "Cs") == 0)
90 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
92 uni = &unicode_attributes[i];
93 /* Copy the strings. */
94 uni->name = strdup (field1);
95 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
96 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
97 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
98 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
99 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
100 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
101 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
102 uni->mirrored = (field9[0] == 'Y');
103 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
104 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
105 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
106 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
107 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
110 /* Maximum length of a field in the UnicodeData.txt file. */
113 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
114 Reads up to (but excluding) DELIM.
115 Returns 1 when a field was successfully read, otherwise 0. */
/* DELIM is compared against the int returned by getc, which is why it is an
   int parameter.  */
117 getfield (FILE *stream, char *buffer, int delim)
/* The loop stops as soon as getc yields EOF or DELIM; neither of the two is
   processed by the loop body.  */
122 for (; (c = getc (stream)), (c != EOF && c != delim); )
124 /* The original unicode.org UnicodeData.txt file happens to have
125 CR/LF line terminators. Silently convert to LF. */
129 /* Put c into the buffer. */
/* COUNT is incremented before it is compared, so the diagnostic below is
   issued once FIELDLEN - 1 characters have been counted (presumably to keep
   room for the terminating NUL -- confirm).  */
130 if (++count >= FIELDLEN - 1)
132 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
/* Reads the file UNICODEDATA_FILENAME and hands its lines to fill_attribute.
   All entries are first reset to name == NULL.  A line consists of fifteen
   fields, the first fourteen ended by ';' and the last one by a newline;
   field0 is parsed as the hexadecimal code point.  Failures are reported on
   stderr.  */
145 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
148 fill_attributes (const char *unicodedata_filename)
152 char field0[FIELDLEN];
153 char field1[FIELDLEN];
154 char field2[FIELDLEN];
155 char field3[FIELDLEN];
156 char field4[FIELDLEN];
157 char field5[FIELDLEN];
158 char field6[FIELDLEN];
159 char field7[FIELDLEN];
160 char field8[FIELDLEN];
161 char field9[FIELDLEN];
162 char field10[FIELDLEN];
163 char field11[FIELDLEN];
164 char field12[FIELDLEN];
165 char field13[FIELDLEN];
166 char field14[FIELDLEN];
169 for (i = 0; i < 0x110000; i++)
170 unicode_attributes[i].name = NULL;
172 stream = fopen (unicodedata_filename, "r");
175 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
184 n = getfield (stream, field0, ';');
185 n += getfield (stream, field1, ';');
186 n += getfield (stream, field2, ';');
187 n += getfield (stream, field3, ';');
188 n += getfield (stream, field4, ';');
189 n += getfield (stream, field5, ';');
190 n += getfield (stream, field6, ';');
191 n += getfield (stream, field7, ';');
192 n += getfield (stream, field8, ';');
193 n += getfield (stream, field9, ';');
194 n += getfield (stream, field10, ';');
195 n += getfield (stream, field11, ';');
196 n += getfield (stream, field12, ';');
197 n += getfield (stream, field13, ';');
198 n += getfield (stream, field14, '\n');
203 fprintf (stderr, "short line in '%s':%d\n",
204 unicodedata_filename, lineno);
207 i = strtoul (field0, NULL, 16);
209 && strlen (field1) >= 9
210 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
212 /* Deal with a range. */
/* The line that follows a "..., First>" line is read into the same field
   buffers; it has to carry a name ending in ", Last>".  */
214 n = getfield (stream, field0, ';');
215 n += getfield (stream, field1, ';');
216 n += getfield (stream, field2, ';');
217 n += getfield (stream, field3, ';');
218 n += getfield (stream, field4, ';');
219 n += getfield (stream, field5, ';');
220 n += getfield (stream, field6, ';');
221 n += getfield (stream, field7, ';');
222 n += getfield (stream, field8, ';');
223 n += getfield (stream, field9, ';');
224 n += getfield (stream, field10, ';');
225 n += getfield (stream, field11, ';');
226 n += getfield (stream, field12, ';');
227 n += getfield (stream, field13, ';');
228 n += getfield (stream, field14, '\n');
231 fprintf (stderr, "missing end range in '%s':%d\n",
232 unicodedata_filename, lineno);
235 if (!(field1[0] == '<'
236 && strlen (field1) >= 8
237 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
239 fprintf (stderr, "missing end range in '%s':%d\n",
240 unicodedata_filename, lineno);
/* Strip the trailing ", Last>" from the name; the leading '<' is skipped by
   passing field1+1 further down.  */
243 field1[strlen (field1) - 7] = '\0';
/* J receives the code point of the closing line of the range.  */
244 j = strtoul (field0, NULL, 16);
246 fill_attribute (i, field1+1, field2, field3, field4, field5,
247 field6, field7, field8, field9, field10,
248 field11, field12, field13, field14);
252 /* Single character line */
253 fill_attribute (i, field1, field2, field3, field4, field5,
254 field6, field7, field8, field9, field10,
255 field11, field12, field13, field14);
/* A read error and a failing fclose are reported with the same message.  */
258 if (ferror (stream) || fclose (stream))
260 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
265 /* ========================================================================= */
267 /* General category. */
268 /* See Unicode 3.0 book, section 4.5,
272 is_category_L (unsigned int ch)
274 return (unicode_attributes[ch].name != NULL
275 && unicode_attributes[ch].category[0] == 'L');
279 is_category_Lu (unsigned int ch)
281 return (unicode_attributes[ch].name != NULL
282 && unicode_attributes[ch].category[0] == 'L'
283 && unicode_attributes[ch].category[1] == 'u');
287 is_category_Ll (unsigned int ch)
289 return (unicode_attributes[ch].name != NULL
290 && unicode_attributes[ch].category[0] == 'L'
291 && unicode_attributes[ch].category[1] == 'l');
295 is_category_Lt (unsigned int ch)
297 return (unicode_attributes[ch].name != NULL
298 && unicode_attributes[ch].category[0] == 'L'
299 && unicode_attributes[ch].category[1] == 't');
303 is_category_Lm (unsigned int ch)
305 return (unicode_attributes[ch].name != NULL
306 && unicode_attributes[ch].category[0] == 'L'
307 && unicode_attributes[ch].category[1] == 'm');
311 is_category_Lo (unsigned int ch)
313 return (unicode_attributes[ch].name != NULL
314 && unicode_attributes[ch].category[0] == 'L'
315 && unicode_attributes[ch].category[1] == 'o');
319 is_category_M (unsigned int ch)
321 return (unicode_attributes[ch].name != NULL
322 && unicode_attributes[ch].category[0] == 'M');
326 is_category_Mn (unsigned int ch)
328 return (unicode_attributes[ch].name != NULL
329 && unicode_attributes[ch].category[0] == 'M'
330 && unicode_attributes[ch].category[1] == 'n');
334 is_category_Mc (unsigned int ch)
336 return (unicode_attributes[ch].name != NULL
337 && unicode_attributes[ch].category[0] == 'M'
338 && unicode_attributes[ch].category[1] == 'c');
342 is_category_Me (unsigned int ch)
344 return (unicode_attributes[ch].name != NULL
345 && unicode_attributes[ch].category[0] == 'M'
346 && unicode_attributes[ch].category[1] == 'e');
350 is_category_N (unsigned int ch)
352 return (unicode_attributes[ch].name != NULL
353 && unicode_attributes[ch].category[0] == 'N');
357 is_category_Nd (unsigned int ch)
359 return (unicode_attributes[ch].name != NULL
360 && unicode_attributes[ch].category[0] == 'N'
361 && unicode_attributes[ch].category[1] == 'd');
365 is_category_Nl (unsigned int ch)
367 return (unicode_attributes[ch].name != NULL
368 && unicode_attributes[ch].category[0] == 'N'
369 && unicode_attributes[ch].category[1] == 'l');
373 is_category_No (unsigned int ch)
375 return (unicode_attributes[ch].name != NULL
376 && unicode_attributes[ch].category[0] == 'N'
377 && unicode_attributes[ch].category[1] == 'o');
381 is_category_P (unsigned int ch)
383 return (unicode_attributes[ch].name != NULL
384 && unicode_attributes[ch].category[0] == 'P');
388 is_category_Pc (unsigned int ch)
390 return (unicode_attributes[ch].name != NULL
391 && unicode_attributes[ch].category[0] == 'P'
392 && unicode_attributes[ch].category[1] == 'c');
396 is_category_Pd (unsigned int ch)
398 return (unicode_attributes[ch].name != NULL
399 && unicode_attributes[ch].category[0] == 'P'
400 && unicode_attributes[ch].category[1] == 'd');
404 is_category_Ps (unsigned int ch)
406 return (unicode_attributes[ch].name != NULL
407 && unicode_attributes[ch].category[0] == 'P'
408 && unicode_attributes[ch].category[1] == 's');
412 is_category_Pe (unsigned int ch)
414 return (unicode_attributes[ch].name != NULL
415 && unicode_attributes[ch].category[0] == 'P'
416 && unicode_attributes[ch].category[1] == 'e');
420 is_category_Pi (unsigned int ch)
422 return (unicode_attributes[ch].name != NULL
423 && unicode_attributes[ch].category[0] == 'P'
424 && unicode_attributes[ch].category[1] == 'i');
428 is_category_Pf (unsigned int ch)
430 return (unicode_attributes[ch].name != NULL
431 && unicode_attributes[ch].category[0] == 'P'
432 && unicode_attributes[ch].category[1] == 'f');
436 is_category_Po (unsigned int ch)
438 return (unicode_attributes[ch].name != NULL
439 && unicode_attributes[ch].category[0] == 'P'
440 && unicode_attributes[ch].category[1] == 'o');
444 is_category_S (unsigned int ch)
446 return (unicode_attributes[ch].name != NULL
447 && unicode_attributes[ch].category[0] == 'S');
451 is_category_Sm (unsigned int ch)
453 return (unicode_attributes[ch].name != NULL
454 && unicode_attributes[ch].category[0] == 'S'
455 && unicode_attributes[ch].category[1] == 'm');
459 is_category_Sc (unsigned int ch)
461 return (unicode_attributes[ch].name != NULL
462 && unicode_attributes[ch].category[0] == 'S'
463 && unicode_attributes[ch].category[1] == 'c');
467 is_category_Sk (unsigned int ch)
469 return (unicode_attributes[ch].name != NULL
470 && unicode_attributes[ch].category[0] == 'S'
471 && unicode_attributes[ch].category[1] == 'k');
475 is_category_So (unsigned int ch)
477 return (unicode_attributes[ch].name != NULL
478 && unicode_attributes[ch].category[0] == 'S'
479 && unicode_attributes[ch].category[1] == 'o');
483 is_category_Z (unsigned int ch)
485 return (unicode_attributes[ch].name != NULL
486 && unicode_attributes[ch].category[0] == 'Z');
490 is_category_Zs (unsigned int ch)
492 return (unicode_attributes[ch].name != NULL
493 && unicode_attributes[ch].category[0] == 'Z'
494 && unicode_attributes[ch].category[1] == 's');
498 is_category_Zl (unsigned int ch)
500 return (unicode_attributes[ch].name != NULL
501 && unicode_attributes[ch].category[0] == 'Z'
502 && unicode_attributes[ch].category[1] == 'l');
506 is_category_Zp (unsigned int ch)
508 return (unicode_attributes[ch].name != NULL
509 && unicode_attributes[ch].category[0] == 'Z'
510 && unicode_attributes[ch].category[1] == 'p');
514 is_category_C (unsigned int ch)
516 return (unicode_attributes[ch].name == NULL
517 || unicode_attributes[ch].category[0] == 'C');
521 is_category_Cc (unsigned int ch)
523 return (unicode_attributes[ch].name != NULL
524 && unicode_attributes[ch].category[0] == 'C'
525 && unicode_attributes[ch].category[1] == 'c');
529 is_category_Cf (unsigned int ch)
531 return (unicode_attributes[ch].name != NULL
532 && unicode_attributes[ch].category[0] == 'C'
533 && unicode_attributes[ch].category[1] == 'f');
/* Returns true iff CH lies in the range 0xD800 .. 0xDFFF.  The answer is
   computed from the code point alone, without looking at the table.  */
static bool
is_category_Cs (unsigned int ch)
{
  /* Unsigned subtraction wraps around for CH < 0xd800, so one comparison
     covers both bounds.  */
  return ch - 0xd800u < 0x800u;
}
543 is_category_Co (unsigned int ch)
545 return (unicode_attributes[ch].name != NULL
546 && unicode_attributes[ch].category[0] == 'C'
547 && unicode_attributes[ch].category[1] == 'o');
551 is_category_Cn (unsigned int ch)
553 return (unicode_attributes[ch].name == NULL
554 && !(ch >= 0xd800 && ch < 0xe000));
557 /* Output a boolean property in a human readable format. */
/* Writes to FILENAME, as hexadecimal text, the code points of the range
   0 .. 0x10FFFF in connection with PREDICATE: one line per single code point
   ("0x%04X") or per FIRST..LAST range ("0x%04X..0x%04X").  Failure to open
   or to write the file is reported on stderr.  */
559 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
564 stream = fopen (filename, "w");
567 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* The per-code-point variant below is compiled out.  */
571 #if 0 /* This yields huge text output. */
572 for (ch = 0; ch < 0x110000; ch++)
575 fprintf (stream, "0x%04X\n", ch);
578 for (ch = 0; ch < 0x110000; ch++)
581 unsigned int first = ch;
/* Continue while the following code point also satisfies PREDICATE.  */
584 while (ch + 1 < 0x110000 && predicate (ch + 1))
588 fprintf (stream, "0x%04X..0x%04X\n", first, last);
590 fprintf (stream, "0x%04X\n", ch);
/* A write error and a failing fclose are reported with the same message.  */
594 if (ferror (stream) || fclose (stream))
596 fprintf (stderr, "error writing to '%s'\n", filename);
601 /* Output the unit test for a boolean property. */
/* Writes to FILENAME a generated C source consisting of a fixed license
   header, an include of "test-predicate-part1.h", "{ first, last }"
   initializers computed with PREDICATE over 0 .. 0x10FFFF, a PREDICATE(c)
   macro that expands to EXPRESSION, and an include of
   "test-predicate-part2.h".  */
603 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
609 stream = fopen (filename, "w");
612 fprintf (stderr, "cannot open '%s' for writing\n", filename);
616 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
617 fprintf (stream, "/* Test the Unicode character type functions.\n");
618 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
619 fprintf (stream, "\n");
620 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
621 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
622 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
623 fprintf (stream, " (at your option) any later version.\n");
624 fprintf (stream, "\n");
625 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
626 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
627 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
628 fprintf (stream, " GNU General Public License for more details.\n");
629 fprintf (stream, "\n");
630 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
631 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
632 fprintf (stream, "\n");
633 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
634 fprintf (stream, "\n");
637 for (ch = 0; ch < 0x110000; ch++)
640 unsigned int first = ch;
/* Continue while the following code point also satisfies PREDICATE.  */
643 while (ch + 1 < 0x110000 && predicate (ch + 1))
647 fprintf (stream, ",\n");
648 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
652 fprintf (stream, "\n");
654 fprintf (stream, "\n");
/* EXPRESSION is copied verbatim into the generated macro definition.  */
655 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
656 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
658 if (ferror (stream) || fclose (stream))
660 fprintf (stderr, "error writing to '%s'\n", filename);
665 /* Construction of sparse 3-level tables. */
/* The macros below are set before "3levelbit.h" is included; presumably
   TABLE selects the prefix of the generated type and functions
   (struct predicate_table, predicate_table_init, ...) -- confirm against the
   header.  xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): malloc/realloc return NULL on failure instead of aborting;
   verify that the header copes with that.  */
666 #define TABLE predicate_table
667 #define xmalloc malloc
668 #define xrealloc realloc
669 #include "3levelbit.h"
671 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C fragment that defines a static const structure
   called NAME with the members header, level1, level2 and level3, filled
   from a struct predicate_table built with PREDICATE over 0 .. 0x10FFFF.
   COMMENT and VERSION only appear in the comments at the top of the
   generated file.  Failure to open or to write the file is reported on
   stderr.  */
673 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
677 struct predicate_table t;
678 unsigned int level1_offset, level2_offset, level3_offset;
680 stream = fopen (filename, "w");
683 fprintf (stderr, "cannot open '%s' for writing\n", filename);
687 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
688 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
/* NOTE(review): the banner names gen-ctype.c, whereas the usage comment at
   the top of this file calls the program gen-uni-tables -- confirm which
   name is intended.  */
689 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
694 predicate_table_init (&t);
696 for (ch = 0; ch < 0x110000; ch++)
698 predicate_table_add (&t, ch);
700 predicate_table_finalize (&t);
702 /* Offsets in t.result, in memory of this process. */
/* The offsets computed here place five uint32_t header words first, then
   level1_size level1 entries, then (level2_size << q) level2 entries; the
   level3 data starts after those.  */
704 5 * sizeof (uint32_t);
706 5 * sizeof (uint32_t)
707 + t.level1_size * sizeof (uint32_t);
709 5 * sizeof (uint32_t)
710 + t.level1_size * sizeof (uint32_t)
711 + (t.level2_size << t.q) * sizeof (uint32_t);
713 for (i = 0; i < 5; i++)
/* NOTE(review): %d is used here for a loop counter compared against
   unsigned quantities elsewhere and for a uint32_t value; check the
   specifiers against the argument types.  */
715 fprintf (stream, "#define header_%d %d\n", i,
716 ((uint32_t *) t.result)[i]);
718 fprintf (stream, "static const\n");
719 fprintf (stream, "struct\n");
720 fprintf (stream, " {\n");
721 fprintf (stream, " int header[1];\n");
722 fprintf (stream, " int level1[%zu];\n", t.level1_size);
723 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
724 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
725 fprintf (stream, " }\n");
726 fprintf (stream, "%s =\n", name);
727 fprintf (stream, "{\n");
/* Only header word 1 is stored in the generated structure.  */
728 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
729 fprintf (stream, " {");
730 if (t.level1_size > 1)
731 fprintf (stream, "\n ");
732 for (i = 0; i < t.level1_size; i++)
/* (i % 1) is always 0, so every level1 entry after the first starts a new
   output line.  */
735 if (i > 0 && (i % 1) == 0)
736 fprintf (stream, "\n ");
737 offset = ((uint32_t *) (t.result + level1_offset))[i];
739 fprintf (stream, " %5d", -1);
/* NOTE(review): %zd is given values that appear to be size_t
   (t.level1_size is printed with %zu above, and the quotient involves
   sizeof); %zu would match.  */
741 fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
742 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
743 if (i+1 < t.level1_size)
744 fprintf (stream, ",");
746 if (t.level1_size > 1)
747 fprintf (stream, "\n ");
748 fprintf (stream, " },\n");
749 fprintf (stream, " {");
750 if (t.level2_size << t.q > 1)
751 fprintf (stream, "\n ");
752 for (i = 0; i < t.level2_size << t.q; i++)
755 if (i > 0 && (i % 1) == 0)
756 fprintf (stream, "\n ");
757 offset = ((uint32_t *) (t.result + level2_offset))[i];
759 fprintf (stream, " %5d", -1);
761 fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
762 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
763 if (i+1 < t.level2_size << t.q)
764 fprintf (stream, ",");
766 if (t.level2_size << t.q > 1)
767 fprintf (stream, "\n ");
768 fprintf (stream, " },\n");
769 fprintf (stream, " {");
770 if (t.level3_size << t.p > 4)
771 fprintf (stream, "\n ");
/* The level3 words are printed four per line as 32-bit hexadecimal
   numbers.  */
772 for (i = 0; i < t.level3_size << t.p; i++)
774 if (i > 0 && (i % 4) == 0)
775 fprintf (stream, "\n ");
776 fprintf (stream, " 0x%08X",
777 ((uint32_t *) (t.result + level3_offset))[i]);
778 if (i+1 < t.level3_size << t.p)
779 fprintf (stream, ",");
781 if (t.level3_size << t.p > 4)
782 fprintf (stream, "\n ");
783 fprintf (stream, " }\n");
784 fprintf (stream, "};\n");
786 if (ferror (stream) || fclose (stream))
788 fprintf (stderr, "error writing to '%s'\n", filename);
793 /* Output all categories. */
/* For a category C, the CATEGORY macro below produces three files through
   is_category_C: the readable dump "unictype/categ_C.txt", the unit test
   "../tests/unictype/test-categ_C.c" and the bitmap header
   "unictype/categ_C.h", whose table is named u_categ_C.  VERSION is passed
   on to output_predicate.  */
795 output_categories (const char *version)
797 #define CATEGORY(C) \
798 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
799 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
800 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Every two-letter category owns a
   single bit; the mask of a one-letter category is the bitwise OR of the
   masks of its two-letter subcategories (for example
   UC_CATEGORY_MASK_L == Lu | Ll | Lt | Lm | Lo == 0x1f).  */
843 UC_CATEGORY_MASK_L = 0x0000001f,
844 UC_CATEGORY_MASK_Lu = 0x00000001,
845 UC_CATEGORY_MASK_Ll = 0x00000002,
846 UC_CATEGORY_MASK_Lt = 0x00000004,
847 UC_CATEGORY_MASK_Lm = 0x00000008,
848 UC_CATEGORY_MASK_Lo = 0x00000010,
849 UC_CATEGORY_MASK_M = 0x000000e0,
850 UC_CATEGORY_MASK_Mn = 0x00000020,
851 UC_CATEGORY_MASK_Mc = 0x00000040,
852 UC_CATEGORY_MASK_Me = 0x00000080,
853 UC_CATEGORY_MASK_N = 0x00000700,
854 UC_CATEGORY_MASK_Nd = 0x00000100,
855 UC_CATEGORY_MASK_Nl = 0x00000200,
856 UC_CATEGORY_MASK_No = 0x00000400,
857 UC_CATEGORY_MASK_P = 0x0003f800,
858 UC_CATEGORY_MASK_Pc = 0x00000800,
859 UC_CATEGORY_MASK_Pd = 0x00001000,
860 UC_CATEGORY_MASK_Ps = 0x00002000,
861 UC_CATEGORY_MASK_Pe = 0x00004000,
862 UC_CATEGORY_MASK_Pi = 0x00008000,
863 UC_CATEGORY_MASK_Pf = 0x00010000,
864 UC_CATEGORY_MASK_Po = 0x00020000,
865 UC_CATEGORY_MASK_S = 0x003c0000,
866 UC_CATEGORY_MASK_Sm = 0x00040000,
867 UC_CATEGORY_MASK_Sc = 0x00080000,
868 UC_CATEGORY_MASK_Sk = 0x00100000,
869 UC_CATEGORY_MASK_So = 0x00200000,
870 UC_CATEGORY_MASK_Z = 0x01c00000,
871 UC_CATEGORY_MASK_Zs = 0x00400000,
872 UC_CATEGORY_MASK_Zl = 0x00800000,
873 UC_CATEGORY_MASK_Zp = 0x01000000,
874 UC_CATEGORY_MASK_C = 0x3e000000,
875 UC_CATEGORY_MASK_Cc = 0x02000000,
876 UC_CATEGORY_MASK_Cf = 0x04000000,
877 UC_CATEGORY_MASK_Cs = 0x08000000,
878 UC_CATEGORY_MASK_Co = 0x10000000,
879 UC_CATEGORY_MASK_Cn = 0x20000000
/* Maps the general category name CATEGORY_NAME to its UC_CATEGORY_MASK_*
   value.  A name made of one letter yields the mask of the whole class, a
   name made of two letters the mask of that subcategory.  */
883 general_category_byname (const char *category_name)
/* Only names of length 1 or 2 are dispatched; category_name[2] is examined
   only when category_name[1] is not the terminator.  */
885 if (category_name[0] != '\0'
886 && (category_name[1] == '\0' || category_name[2] == '\0'))
887 switch (category_name[0])
890 switch (category_name[1])
892 case '\0': return UC_CATEGORY_MASK_L;
893 case 'u': return UC_CATEGORY_MASK_Lu;
894 case 'l': return UC_CATEGORY_MASK_Ll;
895 case 't': return UC_CATEGORY_MASK_Lt;
896 case 'm': return UC_CATEGORY_MASK_Lm;
897 case 'o': return UC_CATEGORY_MASK_Lo;
901 switch (category_name[1])
903 case '\0': return UC_CATEGORY_MASK_M;
904 case 'n': return UC_CATEGORY_MASK_Mn;
905 case 'c': return UC_CATEGORY_MASK_Mc;
906 case 'e': return UC_CATEGORY_MASK_Me;
910 switch (category_name[1])
912 case '\0': return UC_CATEGORY_MASK_N;
913 case 'd': return UC_CATEGORY_MASK_Nd;
914 case 'l': return UC_CATEGORY_MASK_Nl;
915 case 'o': return UC_CATEGORY_MASK_No;
919 switch (category_name[1])
921 case '\0': return UC_CATEGORY_MASK_P;
922 case 'c': return UC_CATEGORY_MASK_Pc;
923 case 'd': return UC_CATEGORY_MASK_Pd;
924 case 's': return UC_CATEGORY_MASK_Ps;
925 case 'e': return UC_CATEGORY_MASK_Pe;
926 case 'i': return UC_CATEGORY_MASK_Pi;
927 case 'f': return UC_CATEGORY_MASK_Pf;
928 case 'o': return UC_CATEGORY_MASK_Po;
932 switch (category_name[1])
934 case '\0': return UC_CATEGORY_MASK_S;
935 case 'm': return UC_CATEGORY_MASK_Sm;
936 case 'c': return UC_CATEGORY_MASK_Sc;
937 case 'k': return UC_CATEGORY_MASK_Sk;
938 case 'o': return UC_CATEGORY_MASK_So;
942 switch (category_name[1])
944 case '\0': return UC_CATEGORY_MASK_Z;
945 case 's': return UC_CATEGORY_MASK_Zs;
946 case 'l': return UC_CATEGORY_MASK_Zl;
947 case 'p': return UC_CATEGORY_MASK_Zp;
951 switch (category_name[1])
953 case '\0': return UC_CATEGORY_MASK_C;
954 case 'c': return UC_CATEGORY_MASK_Cc;
955 case 'f': return UC_CATEGORY_MASK_Cf;
956 case 's': return UC_CATEGORY_MASK_Cs;
957 case 'o': return UC_CATEGORY_MASK_Co;
958 case 'n': return UC_CATEGORY_MASK_Cn;
/* Reached for every name that none of the cases above returned for.  */
962 /* Invalid category name. */
966 /* Construction of sparse 3-level tables. */
/* Parameters for the table named category_table.  Presumably ELEMENT is the
   type of the value stored per character and DEFAULT the value of
   characters that were never added -- confirm against the table template.
   29 is the bit index of UC_CATEGORY_MASK_Cn (0x20000000 == 1 << 29).
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
967 #define TABLE category_table
968 #define ELEMENT uint8_t
969 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
970 #define xmalloc malloc
971 #define xrealloc realloc
974 /* Output the per-character category table. */
/* Writes to FILENAME a C fragment that defines the static const structure
   u_category with the members level1, level2 and level3.  For every code
   point the bit index (log2) of its single-bit category mask is added to a
   struct category_table; the level3 values are then packed, 5 bits each,
   into 16-bit units.  VERSION only appears in a comment of the generated
   file.  Failure to open or to write the file is reported on stderr.  */
976 output_category (const char *filename, const char *version)
980 struct category_table t;
981 unsigned int level1_offset, level2_offset, level3_offset;
982 uint16_t *level3_packed;
984 stream = fopen (filename, "w");
987 fprintf (stderr, "cannot open '%s' for writing\n", filename);
991 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
992 fprintf (stream, "/* Categories of Unicode characters. */\n");
993 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
998 category_table_init (&t);
1000 for (ch = 0; ch < 0x110000; ch++)
1003 unsigned int log2_value;
/* The surrogate range is classified from the code point alone; other code
   points with an entry are classified by their category string.  */
1005 if (is_category_Cs (ch))
1006 value = UC_CATEGORY_MASK_Cs;
1007 else if (unicode_attributes[ch].name != NULL)
1008 value = general_category_byname (unicode_attributes[ch].category);
1012 /* Now value should contain exactly one bit. */
/* (value & (value - 1)) is nonzero exactly when more than one bit is set.  */
1013 if (value == 0 || ((value & (value - 1)) != 0))
/* Count the right shifts needed to bring the single set bit down to
   bit 0.  */
1016 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1018 category_table_add (&t, ch, log2_value);
1021 category_table_finalize (&t);
1023 /* Offsets in t.result, in memory of this process. */
1025 5 * sizeof (uint32_t);
1027 5 * sizeof (uint32_t)
1028 + t.level1_size * sizeof (uint32_t);
1030 5 * sizeof (uint32_t)
1031 + t.level1_size * sizeof (uint32_t)
1032 + (t.level2_size << t.q) * sizeof (uint32_t);
1034 for (i = 0; i < 5; i++)
1035 fprintf (stream, "#define category_header_%d %d\n", i,
1036 ((uint32_t *) t.result)[i]);
1037 fprintf (stream, "static const\n");
1038 fprintf (stream, "struct\n");
1039 fprintf (stream, " {\n");
1040 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1041 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length matches the packed array built below:
   5 bits per entry in 16-bit units, plus one extra unit.  */
1042 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1043 (1 << t.p) * 5 / 16);
1044 fprintf (stream, " }\n");
1045 fprintf (stream, "u_category =\n");
1046 fprintf (stream, "{\n");
1047 fprintf (stream, " {");
1048 if (t.level1_size > 8)
1049 fprintf (stream, "\n ");
1050 for (i = 0; i < t.level1_size; i++)
1053 if (i > 0 && (i % 8) == 0)
1054 fprintf (stream, "\n ");
1055 offset = ((uint32_t *) (t.result + level1_offset))[i];
1057 fprintf (stream, " %5d", -1);
/* NOTE(review): %zd receives a quotient by sizeof, i.e. a size_t; %zu would
   match.  */
1059 fprintf (stream, " %5zd",
1060 (offset - level2_offset) / sizeof (uint32_t));
1061 if (i+1 < t.level1_size)
1062 fprintf (stream, ",");
1064 if (t.level1_size > 8)
1065 fprintf (stream, "\n ");
1066 fprintf (stream, " },\n");
1067 fprintf (stream, " {");
1068 if (t.level2_size << t.q > 8)
1069 fprintf (stream, "\n ");
1070 for (i = 0; i < t.level2_size << t.q; i++)
1073 if (i > 0 && (i % 8) == 0)
1074 fprintf (stream, "\n ");
1075 offset = ((uint32_t *) (t.result + level2_offset))[i];
1077 fprintf (stream, " %5d", -1);
1079 fprintf (stream, " %5zd",
1080 (offset - level3_offset) / sizeof (uint8_t));
1081 if (i+1 < t.level2_size << t.q)
1082 fprintf (stream, ",");
1084 if (t.level2_size << t.q > 8)
1085 fprintf (stream, "\n ");
1086 fprintf (stream, " },\n");
1087 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1088 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc result is used by the loop right below without
   a NULL check.  */
1091 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1092 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry I occupies bits 5*I .. 5*I+4 of the packed bit stream: J is the
   16-bit unit holding its first bit, K the bit position inside that unit.
   Units J and J+1 are combined into one 32-bit value so that an entry
   crossing the unit boundary is ORed in with a single shift.  */
1094 unsigned int j = (i * 5) / 16;
1095 unsigned int k = (i * 5) % 16;
1096 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1097 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1098 level3_packed[j] = value & 0xffff;
1099 level3_packed[j+1] = value >> 16;
1101 fprintf (stream, " {");
1102 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1103 fprintf (stream, "\n ");
1104 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1106 if (i > 0 && (i % 8) == 0)
1107 fprintf (stream, "\n ");
1108 fprintf (stream, " 0x%04x", level3_packed[i]);
1109 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1110 fprintf (stream, ",");
1112 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1113 fprintf (stream, "\n ");
1114 fprintf (stream, " }\n");
1115 free (level3_packed);
1116 fprintf (stream, "};\n");
1118 if (ferror (stream) || fclose (stream))
1120 fprintf (stderr, "error writing to '%s'\n", filename);
1125 /* ========================================================================= */
1127 /* Canonical combining class. */
1128 /* See Unicode 3.0 book, section 4.2,
1131 /* Construction of sparse 3-level tables. */
/* Parameters for the table named combclass_table; presumably ELEMENT is the
   type of the value stored per character -- confirm against the table
   template.  xmalloc and xrealloc are mapped to plain malloc and
   realloc.  */
1132 #define TABLE combclass_table
1133 #define ELEMENT uint8_t
1135 #define xmalloc malloc
1136 #define xrealloc realloc
1139 /* Output the per-character combining class table. */
/* Writes to FILENAME a C fragment that defines the static const structure
   u_combclass with the members level1, level2 and level3.  For every code
   point that has a UnicodeData entry, the numeric value of its combining
   field is added to a struct combclass_table.  VERSION only appears in a
   comment of the generated file.  Failure to open or to write the file is
   reported on stderr.  */
1141 output_combclass (const char *filename, const char *version)
1145 struct combclass_table t;
1146 unsigned int level1_offset, level2_offset, level3_offset;
1148 stream = fopen (filename, "w");
1151 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1155 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1156 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1157 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1162 combclass_table_init (&t);
1164 for (ch = 0; ch < 0x110000; ch++)
1165 if (unicode_attributes[ch].name != NULL)
/* NOTE(review): atoi cannot report a conversion failure, so a field without
   digits is indistinguishable from the value 0 here.  */
1167 int value = atoi (unicode_attributes[ch].combining);
/* Values outside 0 .. 255 are singled out by the check below.  */
1168 if (!(value >= 0 && value <= 255))
1170 combclass_table_add (&t, ch, value);
1173 combclass_table_finalize (&t);
1175 /* Offsets in t.result, in memory of this process. */
1177 5 * sizeof (uint32_t);
1179 5 * sizeof (uint32_t)
1180 + t.level1_size * sizeof (uint32_t);
1182 5 * sizeof (uint32_t)
1183 + t.level1_size * sizeof (uint32_t)
1184 + (t.level2_size << t.q) * sizeof (uint32_t);
1186 for (i = 0; i < 5; i++)
1187 fprintf (stream, "#define combclass_header_%d %d\n", i,
1188 ((uint32_t *) t.result)[i]);
1189 fprintf (stream, "static const\n");
1190 fprintf (stream, "struct\n");
1191 fprintf (stream, " {\n");
1192 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1193 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1194 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1195 fprintf (stream, " }\n");
1196 fprintf (stream, "u_combclass =\n");
1197 fprintf (stream, "{\n");
1198 fprintf (stream, " {");
1199 if (t.level1_size > 8)
1200 fprintf (stream, "\n ");
1201 for (i = 0; i < t.level1_size; i++)
1204 if (i > 0 && (i % 8) == 0)
1205 fprintf (stream, "\n ");
1206 offset = ((uint32_t *) (t.result + level1_offset))[i];
1208 fprintf (stream, " %5d", -1);
/* NOTE(review): %zd receives a quotient by sizeof, i.e. a size_t; %zu would
   match.  */
1210 fprintf (stream, " %5zd",
1211 (offset - level2_offset) / sizeof (uint32_t));
1212 if (i+1 < t.level1_size)
1213 fprintf (stream, ",");
1215 if (t.level1_size > 8)
1216 fprintf (stream, "\n ");
1217 fprintf (stream, " },\n");
1218 fprintf (stream, " {");
1219 if (t.level2_size << t.q > 8)
1220 fprintf (stream, "\n ");
1221 for (i = 0; i < t.level2_size << t.q; i++)
1224 if (i > 0 && (i % 8) == 0)
1225 fprintf (stream, "\n ");
1226 offset = ((uint32_t *) (t.result + level2_offset))[i];
1228 fprintf (stream, " %5d", -1);
1230 fprintf (stream, " %5zd",
1231 (offset - level3_offset) / sizeof (uint8_t));
1232 if (i+1 < t.level2_size << t.q)
1233 fprintf (stream, ",");
1235 if (t.level2_size << t.q > 8)
1236 fprintf (stream, "\n ");
1237 fprintf (stream, " },\n");
1238 fprintf (stream, " {");
1239 if (t.level3_size << t.p > 8)
1240 fprintf (stream, "\n ");
/* The level3 bytes are printed eight per line as decimal numbers.  */
1241 for (i = 0; i < t.level3_size << t.p; i++)
1243 if (i > 0 && (i % 8) == 0)
1244 fprintf (stream, "\n ");
1245 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1246 if (i+1 < t.level3_size << t.p)
1247 fprintf (stream, ",");
1249 if (t.level3_size << t.p > 8)
1250 fprintf (stream, "\n ");
1251 fprintf (stream, " }\n");
1252 fprintf (stream, "};\n");
1254 if (ferror (stream) || fclose (stream))
1256 fprintf (stderr, "error writing to '%s'\n", filename);
1261 /* ========================================================================= */
1263 /* Bidirectional category. */
1264 /* See Unicode 3.0 book, section 4.3. */
/* Enumerators for the bidirectional categories, each annotated with its
   descriptive name. Nineteen values are listed, UC_BIDI_L first and
   UC_BIDI_ON last. The enclosing declaration header is not visible in
   this excerpt. */
1269 UC_BIDI_L, /* Left-to-Right */
1270 UC_BIDI_LRE, /* Left-to-Right Embedding */
1271 UC_BIDI_LRO, /* Left-to-Right Override */
1272 UC_BIDI_R, /* Right-to-Left */
1273 UC_BIDI_AL, /* Right-to-Left Arabic */
1274 UC_BIDI_RLE, /* Right-to-Left Embedding */
1275 UC_BIDI_RLO, /* Right-to-Left Override */
1276 UC_BIDI_PDF, /* Pop Directional Format */
1277 UC_BIDI_EN, /* European Number */
1278 UC_BIDI_ES, /* European Number Separator */
1279 UC_BIDI_ET, /* European Number Terminator */
1280 UC_BIDI_AN, /* Arabic Number */
1281 UC_BIDI_CS, /* Common Number Separator */
1282 UC_BIDI_NSM, /* Non-Spacing Mark */
1283 UC_BIDI_BN, /* Boundary Neutral */
1284 UC_BIDI_B, /* Paragraph Separator */
1285 UC_BIDI_S, /* Segment Separator */
1286 UC_BIDI_WS, /* Whitespace */
1287 UC_BIDI_ON /* Other Neutral */
/* Map the name of a bidi category to its category value.
   The visible code dispatches on category_name[0], then on
   category_name[1] and, for three-letter names, on category_name[2]; each
   candidate is guarded by a test that the character following the matched
   prefix is the terminating NUL.
   NOTE(review): the case labels and the return statements are not visible
   in this excerpt; presumably the results are the UC_BIDI_* enumerators --
   confirm against the full file. */
1291 bidi_category_byname (const char *category_name)
1293 switch (category_name[0])
1296 switch (category_name[1])
1299 if (category_name[2] == '\0')
1303 if (category_name[2] == '\0')
1309 switch (category_name[1])
1314 if (category_name[2] == '\0')
1320 switch (category_name[1])
1323 if (category_name[2] == '\0')
1329 switch (category_name[1])
1332 if (category_name[2] == '\0')
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1346 switch (category_name[1])
1351 switch (category_name[2])
1354 if (category_name[3] == '\0')
1358 if (category_name[3] == '\0')
1366 switch (category_name[1])
1369 switch (category_name[2])
1372 if (category_name[3] == '\0')
1380 switch (category_name[1])
1383 if (category_name[2] == '\0')
1389 switch (category_name[1])
1392 switch (category_name[2])
1395 if (category_name[3] == '\0')
1403 switch (category_name[1])
1408 switch (category_name[2])
1411 if (category_name[3] == '\0')
1415 if (category_name[3] == '\0')
/* Here the name is accepted when it ends right after its first character. */
1423 if (category_name[1] == '\0')
1427 switch (category_name[1])
1430 if (category_name[2] == '\0')
1436 /* Invalid bidi category name. */
/* Return the bidi category of the code point ch.
   When unicode_attributes[ch] has a name, its bidi field is converted with
   bidi_category_byname. Otherwise three groups of code point ranges are
   tested in order.
   NOTE(review): the value returned for each group, and for code points that
   fall in none of them, is not visible in this excerpt. */
1441 get_bidi_category (unsigned int ch)
1443 if (unicode_attributes[ch].name != NULL)
1444 return bidi_category_byname (unicode_attributes[ch].bidi);
1447 /* The bidi category of unassigned characters depends on the range.
1448 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges. */
1449 if ((ch >= 0x0590 && ch <= 0x05FF)
1450 || (ch >= 0x07FB && ch <= 0x08FF)
1451 || (ch >= 0xFB37 && ch <= 0xFB45)
1452 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges. */
1454 else if ((ch >= 0x0600 && ch <= 0x07BF)
1455 || (ch >= 0x2064 && ch <= 0x2069)
1456 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1457 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group: besides the fixed ranges, every code point whose low
   16 bits are 0xFFFE or 0xFFFF. */
1459 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1460 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1461 || (ch & 0xFFFF) == 0xFFFE
1462 || (ch & 0xFFFF) == 0xFFFF
1463 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1470 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation: TABLE is the name used below as
   struct bidi_category_table and as the prefix of bidi_category_table_init,
   _add and _finalize; ELEMENT is the element type; DEFAULT is the default
   element value. xmalloc and xrealloc are mapped to plain malloc and
   realloc.
   NOTE(review): the directive that consumes these macros is not visible in
   this excerpt. */
1471 #define TABLE bidi_category_table
1472 #define ELEMENT uint8_t
1473 #define DEFAULT UC_BIDI_L
1474 #define xmalloc malloc
1475 #define xrealloc realloc
1478 /* Output the per-character bidi category table. */
/* Write the bidi category lookup table to the file named filename.
   The output consists of header comments, five bidi_category_header_N
   macros taken from the first five 32-bit words of t.result, and a
   static const struct u_bidi_category with members level1, level2 and a
   bit-packed level3. Failures to open or to write the file are reported
   on stderr.
   NOTE(review): version is presumably the Unicode version string that fills
   the %s of the "Generated automatically" line; the argument line is not
   visible in this excerpt. */
1480 output_bidi_category (const char *filename, const char *version)
1484 struct bidi_category_table t;
1485 unsigned int level1_offset, level2_offset, level3_offset;
1486 uint16_t *level3_packed;
1488 stream = fopen (filename, "w");
1491 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1495 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1496 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1497 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1502 bidi_category_table_init (&t);
/* Record the category of every code point 0..0x10FFFF in the table. */
1504 for (ch = 0; ch < 0x110000; ch++)
1506 int value = get_bidi_category (ch);
1508 bidi_category_table_add (&t, ch, value);
1511 bidi_category_table_finalize (&t);
1513 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past the five 32-bit header
   words, then past level1_size 32-bit entries, then past
   (level2_size << q) 32-bit entries.
   NOTE(review): the assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset, in that
   order. */
1515 5 * sizeof (uint32_t);
1517 5 * sizeof (uint32_t)
1518 + t.level1_size * sizeof (uint32_t);
1520 5 * sizeof (uint32_t)
1521 + t.level1_size * sizeof (uint32_t)
1522 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header word is printed with %d. */
1524 for (i = 0; i < 5; i++)
1525 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1526 ((uint32_t *) t.result)[i]);
1527 fprintf (stream, "static const\n");
1528 fprintf (stream, "struct\n");
1529 fprintf (stream, " {\n");
1530 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1531 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units holding 5 bits per entry:
   (1 << p) * 5 / 16 units per level3 block, plus one spare unit. */
1532 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1533 (1 << t.p) * 5 / 16);
1534 fprintf (stream, " }\n");
1535 fprintf (stream, "u_bidi_category =\n");
1536 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output row. Each entry is read as a
   byte offset into t.result and printed either as -1 or as an index into
   level2 (offset relative to level2_offset, in units of uint32_t).
   NOTE(review): the condition selecting between the two fprintf calls is
   not visible here; %zd is used with an unsigned (size_t) quotient. */
1537 fprintf (stream, " {");
1538 if (t.level1_size > 8)
1539 fprintf (stream, "\n ");
1540 for (i = 0; i < t.level1_size; i++)
1543 if (i > 0 && (i % 8) == 0)
1544 fprintf (stream, "\n ");
1545 offset = ((uint32_t *) (t.result + level1_offset))[i];
1547 fprintf (stream, " %5d", -1);
1549 fprintf (stream, " %5zd",
1550 (offset - level2_offset) / sizeof (uint32_t));
1551 if (i+1 < t.level1_size)
1552 fprintf (stream, ",");
1554 if (t.level1_size > 8)
1555 fprintf (stream, "\n ");
1556 fprintf (stream, " },\n");
/* level2 initializer, same layout; entries are printed as -1 or as an index
   into the unpacked level3 (offset relative to level3_offset, in units of
   uint8_t). */
1557 fprintf (stream, " {");
1558 if (t.level2_size << t.q > 8)
1559 fprintf (stream, "\n ");
1560 for (i = 0; i < t.level2_size << t.q; i++)
1563 if (i > 0 && (i % 8) == 0)
1564 fprintf (stream, "\n ");
1565 offset = ((uint32_t *) (t.result + level2_offset))[i];
1567 fprintf (stream, " %5d", -1);
1569 fprintf (stream, " %5zd",
1570 (offset - level3_offset) / sizeof (uint8_t));
1571 if (i+1 < t.level2_size << t.q)
1572 fprintf (stream, ",");
1574 if (t.level2_size << t.q > 8)
1575 fprintf (stream, "\n ");
1576 fprintf (stream, " },\n");
1577 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1578 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): no check of the calloc result is visible before
   level3_packed is written to. */
1581 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* For entry i, j is the 16-bit unit in which its bits start and k the bit
   position inside that unit. The entry is OR-ed into a 32-bit window made
   of units j and j+1, because an entry can straddle two units; unit j+1 is
   always touched, hence the spare unit in the allocation. */
1582 for (i = 0; i < t.level3_size << t.p; i++)
1584 unsigned int j = (i * 5) / 16;
1585 unsigned int k = (i * 5) % 16;
1586 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1587 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1588 level3_packed[j] = value & 0xffff;
1589 level3_packed[j+1] = value >> 16;
/* Packed level3 initializer, 8 hexadecimal units per output row. */
1591 fprintf (stream, " {");
1592 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1593 fprintf (stream, "\n ");
1594 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1596 if (i > 0 && (i % 8) == 0)
1597 fprintf (stream, "\n ");
1598 fprintf (stream, " 0x%04x", level3_packed[i]);
1599 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1600 fprintf (stream, ",");
1602 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1603 fprintf (stream, "\n ");
1604 fprintf (stream, " }\n");
1605 free (level3_packed);
1606 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1608 if (ferror (stream) || fclose (stream))
1610 fprintf (stderr, "error writing to '%s'\n", filename);
1615 /* ========================================================================= */
1617 /* Decimal digit value. */
1618 /* See Unicode 3.0 book, section 4.6. */
/* Return the decimal digit value of the code point ch: when
   unicode_attributes[ch] has a name and a non-empty decdigit field, that
   field is converted with atoi.
   NOTE(review): the result for all other code points is not visible in
   this excerpt; output_decimal_digit_test accepts values from -1 to 9, so
   presumably -1 -- confirm. */
1621 get_decdigit_value (unsigned int ch)
1623 if (unicode_attributes[ch].name != NULL
1624 && unicode_attributes[ch].decdigit[0] != '\0')
1625 return atoi (unicode_attributes[ch].decdigit);
1629 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation: TABLE is the name used below as
   struct decdigit_table and as the prefix of decdigit_table_init, _add and
   _finalize; ELEMENT is the element type. xmalloc and xrealloc are mapped
   to plain malloc and realloc.
   NOTE(review): no DEFAULT definition and no consuming directive are
   visible in this excerpt. */
1630 #define TABLE decdigit_table
1631 #define ELEMENT uint8_t
1633 #define xmalloc malloc
1634 #define xrealloc realloc
1637 /* Output the unit test for the per-character decimal digit value table. */
/* Write test data for the decimal digit table to the file named filename:
   header comments followed by entries of the form "{ 0xXXXX, value }"
   separated by ",\n". Failures to open or to write the file are reported
   on stderr.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed are not visible in this excerpt. */
1639 output_decimal_digit_test (const char *filename, const char *version)
1645 stream = fopen (filename, "w");
1648 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1652 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1653 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1654 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1658 for (ch = 0; ch < 0x110000; ch++)
1660 int value = get_decdigit_value (ch);
/* Sanity check: the value must lie in -1..9. The action taken on failure
   is not visible here. */
1662 if (!(value >= -1 && value < 10))
1668 fprintf (stream, ",\n");
1669 fprintf (stream, " { 0x%04X, %d }", ch, value);
1674 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
1676 if (ferror (stream) || fclose (stream))
1678 fprintf (stderr, "error writing to '%s'\n", filename);
1683 /* Output the per-character decimal digit value table. */
/* Write the decimal digit value lookup table to the file named filename.
   The output consists of header comments, five decdigit_header_N macros
   taken from the first five 32-bit words of t.result, and a static const
   struct u_decdigit with members level1, level2 and a nibble-packed
   level3. Failures to open or to write the file are reported on stderr. */
1685 output_decimal_digit (const char *filename, const char *version)
1689 struct decdigit_table t;
1690 unsigned int level1_offset, level2_offset, level3_offset;
1692 stream = fopen (filename, "w");
1695 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1699 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1700 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1701 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1706 decdigit_table_init (&t);
/* The stored element is the digit value plus one, checked to lie in
   0..10, so it fits in the 4 bits used per entry when level3 is packed.
   The action taken when the check fails is not visible here. */
1708 for (ch = 0; ch < 0x110000; ch++)
1710 int value = 1 + get_decdigit_value (ch);
1712 if (!(value >= 0 && value <= 10))
1715 decdigit_table_add (&t, ch, value);
1718 decdigit_table_finalize (&t);
1720 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the five 32-bit header words, past level1, and past
   level2.
   NOTE(review): the assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset, in that
   order. */
1722 5 * sizeof (uint32_t);
1724 5 * sizeof (uint32_t)
1725 + t.level1_size * sizeof (uint32_t);
1727 5 * sizeof (uint32_t)
1728 + t.level1_size * sizeof (uint32_t)
1729 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header word is printed with %d. */
1731 for (i = 0; i < 5; i++)
1732 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1733 ((uint32_t *) t.result)[i]);
1734 fprintf (stream, "static const\n");
1735 fprintf (stream, "struct\n");
1736 fprintf (stream, " {\n");
1737 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1738 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the second argument of this call is on a line that is not
   visible in this excerpt. */
1739 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1741 fprintf (stream, " }\n");
1742 fprintf (stream, "u_decdigit =\n");
1743 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output row: -1 or an index into level2
   (offset relative to level2_offset, in units of uint32_t). The condition
   selecting between the two fprintf calls is not visible here. */
1744 fprintf (stream, " {");
1745 if (t.level1_size > 8)
1746 fprintf (stream, "\n ");
1747 for (i = 0; i < t.level1_size; i++)
1750 if (i > 0 && (i % 8) == 0)
1751 fprintf (stream, "\n ");
1752 offset = ((uint32_t *) (t.result + level1_offset))[i];
1754 fprintf (stream, " %5d", -1);
1756 fprintf (stream, " %5zd",
1757 (offset - level2_offset) / sizeof (uint32_t));
1758 if (i+1 < t.level1_size)
1759 fprintf (stream, ",");
1761 if (t.level1_size > 8)
1762 fprintf (stream, "\n ");
1763 fprintf (stream, " },\n");
/* level2 initializer: -1 or an index into the unpacked level3 (offset
   relative to level3_offset, in units of uint8_t). */
1764 fprintf (stream, " {");
1765 if (t.level2_size << t.q > 8)
1766 fprintf (stream, "\n ");
1767 for (i = 0; i < t.level2_size << t.q; i++)
1770 if (i > 0 && (i % 8) == 0)
1771 fprintf (stream, "\n ");
1772 offset = ((uint32_t *) (t.result + level2_offset))[i];
1774 fprintf (stream, " %5d", -1);
1776 fprintf (stream, " %5zd",
1777 (offset - level3_offset) / sizeof (uint8_t));
1778 if (i+1 < t.level2_size << t.q)
1779 fprintf (stream, ",");
1781 if (t.level2_size << t.q > 8)
1782 fprintf (stream, "\n ");
1783 fprintf (stream, " },\n");
1784 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two entries share one output byte: entry 2*i in the low nibble and entry
   2*i+1 in the high nibble, so level3_size << (p - 1) bytes are printed. */
1785 fprintf (stream, " {");
1786 if (t.level3_size << (t.p - 1) > 8)
1787 fprintf (stream, "\n ");
1788 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1790 if (i > 0 && (i % 8) == 0)
1791 fprintf (stream, "\n ");
1792 fprintf (stream, " 0x%02x",
1793 ((uint8_t *) (t.result + level3_offset))[2*i]
1794 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1795 if (i+1 < t.level3_size << (t.p - 1))
1796 fprintf (stream, ",");
1798 if (t.level3_size << (t.p - 1) > 8)
1799 fprintf (stream, "\n ");
1800 fprintf (stream, " }\n");
1801 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1803 if (ferror (stream) || fclose (stream))
1805 fprintf (stderr, "error writing to '%s'\n", filename);
1810 /* ========================================================================= */
1813 /* See Unicode 3.0 book, section 4.6. */
/* Return the digit value of the code point ch: when unicode_attributes[ch]
   has a name and a non-empty digit field, that field is converted with
   atoi.
   NOTE(review): the result for all other code points is not visible in
   this excerpt; output_digit_test accepts values from -1 to 9, so
   presumably -1 -- confirm. */
1816 get_digit_value (unsigned int ch)
1818 if (unicode_attributes[ch].name != NULL
1819 && unicode_attributes[ch].digit[0] != '\0')
1820 return atoi (unicode_attributes[ch].digit);
1824 /* Output the unit test for the per-character digit value table. */
/* Write test data for the digit value table to the file named filename:
   header comments followed by entries of the form "{ 0xXXXX, value }"
   separated by ",\n". Failures to open or to write the file are reported
   on stderr.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed are not visible in this excerpt. */
1826 output_digit_test (const char *filename, const char *version)
1832 stream = fopen (filename, "w");
1835 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1839 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1840 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1841 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1845 for (ch = 0; ch < 0x110000; ch++)
1847 int value = get_digit_value (ch);
/* Sanity check: the value must lie in -1..9. The action taken on failure
   is not visible here. */
1849 if (!(value >= -1 && value < 10))
1855 fprintf (stream, ",\n");
1856 fprintf (stream, " { 0x%04X, %d }", ch, value);
1861 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
1863 if (ferror (stream) || fclose (stream))
1865 fprintf (stderr, "error writing to '%s'\n", filename);
1870 /* Output the per-character digit value table. */
/* Write the digit value lookup table to the file named filename.
   The output consists of header comments, five digit_header_N macros taken
   from the first five 32-bit words of t.result, and a static const struct
   u_digit with members level1, level2 and a nibble-packed level3. The
   table type and functions of decdigit_table are reused here. Failures to
   open or to write the file are reported on stderr. */
1872 output_digit (const char *filename, const char *version)
1876 struct decdigit_table t;
1877 unsigned int level1_offset, level2_offset, level3_offset;
1879 stream = fopen (filename, "w");
1882 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1886 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1887 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1888 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1893 decdigit_table_init (&t);
/* The stored element is the digit value plus one, checked to lie in
   0..10, so it fits in the 4 bits used per entry when level3 is packed.
   The action taken when the check fails is not visible here. */
1895 for (ch = 0; ch < 0x110000; ch++)
1897 int value = 1 + get_digit_value (ch);
1899 if (!(value >= 0 && value <= 10))
1902 decdigit_table_add (&t, ch, value);
1905 decdigit_table_finalize (&t);
1907 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the five 32-bit header words, past level1, and past
   level2.
   NOTE(review): the assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset, in that
   order. */
1909 5 * sizeof (uint32_t);
1911 5 * sizeof (uint32_t)
1912 + t.level1_size * sizeof (uint32_t);
1914 5 * sizeof (uint32_t)
1915 + t.level1_size * sizeof (uint32_t)
1916 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header word is printed with %d. */
1918 for (i = 0; i < 5; i++)
1919 fprintf (stream, "#define digit_header_%d %d\n", i,
1920 ((uint32_t *) t.result)[i]);
1921 fprintf (stream, "static const\n");
1922 fprintf (stream, "struct\n");
1923 fprintf (stream, " {\n");
1924 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1925 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the second argument of this call is on a line that is not
   visible in this excerpt. */
1926 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1928 fprintf (stream, " }\n");
1929 fprintf (stream, "u_digit =\n");
1930 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output row: -1 or an index into level2
   (offset relative to level2_offset, in units of uint32_t). The condition
   selecting between the two fprintf calls is not visible here. */
1931 fprintf (stream, " {");
1932 if (t.level1_size > 8)
1933 fprintf (stream, "\n ");
1934 for (i = 0; i < t.level1_size; i++)
1937 if (i > 0 && (i % 8) == 0)
1938 fprintf (stream, "\n ");
1939 offset = ((uint32_t *) (t.result + level1_offset))[i];
1941 fprintf (stream, " %5d", -1);
1943 fprintf (stream, " %5zd",
1944 (offset - level2_offset) / sizeof (uint32_t));
1945 if (i+1 < t.level1_size)
1946 fprintf (stream, ",");
1948 if (t.level1_size > 8)
1949 fprintf (stream, "\n ");
1950 fprintf (stream, " },\n");
/* level2 initializer: -1 or an index into the unpacked level3 (offset
   relative to level3_offset, in units of uint8_t). */
1951 fprintf (stream, " {");
1952 if (t.level2_size << t.q > 8)
1953 fprintf (stream, "\n ");
1954 for (i = 0; i < t.level2_size << t.q; i++)
1957 if (i > 0 && (i % 8) == 0)
1958 fprintf (stream, "\n ");
1959 offset = ((uint32_t *) (t.result + level2_offset))[i];
1961 fprintf (stream, " %5d", -1);
1963 fprintf (stream, " %5zd",
1964 (offset - level3_offset) / sizeof (uint8_t));
1965 if (i+1 < t.level2_size << t.q)
1966 fprintf (stream, ",");
1968 if (t.level2_size << t.q > 8)
1969 fprintf (stream, "\n ");
1970 fprintf (stream, " },\n");
1971 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two entries share one output byte: entry 2*i in the low nibble and entry
   2*i+1 in the high nibble, so level3_size << (p - 1) bytes are printed. */
1972 fprintf (stream, " {");
1973 if (t.level3_size << (t.p - 1) > 8)
1974 fprintf (stream, "\n ");
1975 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1977 if (i > 0 && (i % 8) == 0)
1978 fprintf (stream, "\n ");
1979 fprintf (stream, " 0x%02x",
1980 ((uint8_t *) (t.result + level3_offset))[2*i]
1981 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1982 if (i+1 < t.level3_size << (t.p - 1))
1983 fprintf (stream, ",");
1985 if (t.level3_size << (t.p - 1) > 8)
1986 fprintf (stream, "\n ");
1987 fprintf (stream, " }\n");
1988 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
1990 if (ferror (stream) || fclose (stream))
1992 fprintf (stderr, "error writing to '%s'\n", filename);
1997 /* ========================================================================= */
1999 /* Numeric value. */
2000 /* See Unicode 3.0 book, section 4.6. */
/* A fraction given as an integer numerator and denominator. The pair
   { 0, 0 } is the one value that output_numeric_test does not accept in
   its test on the result of get_numeric_value. */
2002 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Return the numeric value of the code point ch as a fraction.
   When unicode_attributes[ch] has a name and a non-empty numeric field,
   the numerator is the leading integer of that field and the denominator
   is the integer after a '/' if one is present.
   NOTE(review): the else lines and the return statement are not visible in
   this excerpt; presumably the denominator is 1 when there is no '/', and
   { 0, 0 } is produced when there is no numeric field -- confirm. */
2004 static uc_fraction_t
2005 get_numeric_value (unsigned int ch)
2007 uc_fraction_t value;
2009 if (unicode_attributes[ch].name != NULL
2010 && unicode_attributes[ch].numeric[0] != '\0')
2012 const char *str = unicode_attributes[ch].numeric;
2013 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first non-digit, so it reads only the part before
   any '/'. */
2014 value.numerator = atoi (str);
2015 if (strchr (str, '/') != NULL)
2016 value.denominator = atoi (strchr (str, '/') + 1);
2018 value.denominator = 1;
2022 value.numerator = 0;
2023 value.denominator = 0;
2028 /* Output the unit test for the per-character numeric value table. */
/* Write test data for the numeric value table to the file named filename:
   header comments followed by entries of the form
   "{ 0xXXXX, numerator, denominator }" separated by ",\n". Failures to
   open or to write the file are reported on stderr.
   NOTE(review): the condition controlling the separator is not visible in
   this excerpt. */
2030 output_numeric_test (const char *filename, const char *version)
2036 stream = fopen (filename, "w");
2039 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2043 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2044 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2045 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2049 for (ch = 0; ch < 0x110000; ch++)
2051 uc_fraction_t value = get_numeric_value (ch);
/* True for every value other than the pair { 0, 0 }. */
2053 if (value.numerator != 0 || value.denominator != 0)
2056 fprintf (stream, ",\n");
2057 fprintf (stream, " { 0x%04X, %d, %d }",
2058 ch, value.numerator, value.denominator);
2063 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error. */
2065 if (ferror (stream) || fclose (stream))
2067 fprintf (stderr, "error writing to '%s'\n", filename);
2072 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation: TABLE is the name used below as
   struct numeric_table and as the prefix of numeric_table_init, _add and
   _finalize; ELEMENT is the element type. xmalloc and xrealloc are mapped
   to plain malloc and realloc.
   NOTE(review): no DEFAULT definition and no consuming directive are
   visible in this excerpt. */
2073 #define TABLE numeric_table
2074 #define ELEMENT uint8_t
2076 #define xmalloc malloc
2077 #define xrealloc realloc
2080 /* Output the per-character numeric value table. */
/* Write the numeric value lookup table to the file named filename.
   The output consists of header comments, the array u_numeric_values of
   the distinct fractions that occur, five numeric_header_N macros taken
   from the first five 32-bit words of t.result, and a static const struct
   u_numeric with members level1, level2 and a bit-packed level3 whose
   entries are indices into the fractions array. Failures to open or to
   write the file are reported on stderr. */
2082 output_numeric (const char *filename, const char *version)
2085 uc_fraction_t fractions[128];
2086 unsigned int nfractions;
2087 unsigned int ch, i, j;
2088 struct numeric_table t;
2089 unsigned int level1_offset, level2_offset, level3_offset;
2090 uint16_t *level3_packed;
2092 stream = fopen (filename, "w");
2095 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2099 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2100 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2101 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2104 /* Create table of occurring fractions. */
/* For each code point the fraction is first searched in fractions[]. A new
   one is inserted at the first position whose element compares greater,
   ordering by denominator and then by numerator; later elements are
   shifted up by one. The capacity test against 128 guards the fixed-size
   array.
   NOTE(review): the statements that end the search loops, react to a full
   array and increment nfractions are not visible in this excerpt. */
2106 for (ch = 0; ch < 0x110000; ch++)
2108 uc_fraction_t value = get_numeric_value (ch);
2110 for (i = 0; i < nfractions; i++)
2111 if (value.numerator == fractions[i].numerator
2112 && value.denominator == fractions[i].denominator)
2114 if (i == nfractions)
2116 if (nfractions == 128)
2118 for (i = 0; i < nfractions; i++)
2119 if (value.denominator < fractions[i].denominator
2120 || (value.denominator == fractions[i].denominator
2121 && value.numerator < fractions[i].numerator))
2123 for (j = nfractions; j > i; j--)
2124 fractions[j] = fractions[j - 1];
2125 fractions[i] = value;
/* Emit the fractions array, one "{ numerator, denominator }" per line. */
2130 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2132 fprintf (stream, "{\n");
2133 for (i = 0; i < nfractions; i++)
2135 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2136 fractions[i].denominator);
2137 if (i+1 < nfractions)
2138 fprintf (stream, ",");
2139 fprintf (stream, "\n");
2141 fprintf (stream, "};\n");
2145 numeric_table_init (&t);
/* Second pass: look up the index i of each code point's fraction and store
   that index in the table. The reaction to a fraction that is not found
   (i == nfractions) is not visible here. */
2147 for (ch = 0; ch < 0x110000; ch++)
2149 uc_fraction_t value = get_numeric_value (ch);
2151 for (i = 0; i < nfractions; i++)
2152 if (value.numerator == fractions[i].numerator
2153 && value.denominator == fractions[i].denominator)
2155 if (i == nfractions)
2158 numeric_table_add (&t, ch, i);
2161 numeric_table_finalize (&t);
2163 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the five 32-bit header words, past level1, and past
   level2.
   NOTE(review): the assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset, in that
   order. */
2165 5 * sizeof (uint32_t);
2167 5 * sizeof (uint32_t)
2168 + t.level1_size * sizeof (uint32_t);
2170 5 * sizeof (uint32_t)
2171 + t.level1_size * sizeof (uint32_t)
2172 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header word is printed with %d. */
2174 for (i = 0; i < 5; i++)
2175 fprintf (stream, "#define numeric_header_%d %d\n", i,
2176 ((uint32_t *) t.result)[i]);
2177 fprintf (stream, "static const\n");
2178 fprintf (stream, "struct\n");
2179 fprintf (stream, " {\n");
2180 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2181 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units holding 7 bits per entry:
   (1 << p) * 7 / 16 units per level3 block, plus one spare unit. */
2182 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2183 (1 << t.p) * 7 / 16);
2184 fprintf (stream, " }\n");
2185 fprintf (stream, "u_numeric =\n");
2186 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output row: -1 or an index into level2
   (offset relative to level2_offset, in units of uint32_t). The condition
   selecting between the two fprintf calls is not visible here. */
2187 fprintf (stream, " {");
2188 if (t.level1_size > 8)
2189 fprintf (stream, "\n ");
2190 for (i = 0; i < t.level1_size; i++)
2193 if (i > 0 && (i % 8) == 0)
2194 fprintf (stream, "\n ");
2195 offset = ((uint32_t *) (t.result + level1_offset))[i];
2197 fprintf (stream, " %5d", -1);
2199 fprintf (stream, " %5zd",
2200 (offset - level2_offset) / sizeof (uint32_t));
2201 if (i+1 < t.level1_size)
2202 fprintf (stream, ",");
2204 if (t.level1_size > 8)
2205 fprintf (stream, "\n ");
2206 fprintf (stream, " },\n");
/* level2 initializer: -1 or an index into the unpacked level3 (offset
   relative to level3_offset, in units of uint8_t). */
2207 fprintf (stream, " {");
2208 if (t.level2_size << t.q > 8)
2209 fprintf (stream, "\n ");
2210 for (i = 0; i < t.level2_size << t.q; i++)
2213 if (i > 0 && (i % 8) == 0)
2214 fprintf (stream, "\n ");
2215 offset = ((uint32_t *) (t.result + level2_offset))[i];
2217 fprintf (stream, " %5d", -1);
2219 fprintf (stream, " %5zd",
2220 (offset - level3_offset) / sizeof (uint8_t));
2221 if (i+1 < t.level2_size << t.q)
2222 fprintf (stream, ",");
2224 if (t.level2_size << t.q > 8)
2225 fprintf (stream, "\n ");
2226 fprintf (stream, " },\n");
2227 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2228 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): no check of the calloc result is visible before
   level3_packed is written to. */
2231 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* For entry i, j is the 16-bit unit in which its bits start and k the bit
   position inside that unit. The entry is OR-ed into a 32-bit window made
   of units j and j+1, because an entry can straddle two units; unit j+1 is
   always touched, hence the spare unit in the allocation. This j is a new
   variable local to the loop body. */
2232 for (i = 0; i < t.level3_size << t.p; i++)
2234 unsigned int j = (i * 7) / 16;
2235 unsigned int k = (i * 7) % 16;
2236 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2237 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2238 level3_packed[j] = value & 0xffff;
2239 level3_packed[j+1] = value >> 16;
/* Packed level3 initializer, 8 hexadecimal units per output row. */
2241 fprintf (stream, " {");
2242 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2243 fprintf (stream, "\n ");
2244 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2246 if (i > 0 && (i % 8) == 0)
2247 fprintf (stream, "\n ");
2248 fprintf (stream, " 0x%04x", level3_packed[i]);
2249 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2250 fprintf (stream, ",");
2252 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2253 fprintf (stream, "\n ");
2254 fprintf (stream, " }\n");
2255 free (level3_packed);
2256 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
2258 if (ferror (stream) || fclose (stream))
2260 fprintf (stderr, "error writing to '%s'\n", filename);
2265 /* ========================================================================= */
2268 /* See Unicode 3.0 book, section 4.7. */
2271 /* List of mirrored character pairs. This is a subset of the characters
2272 having the BidiMirrored property. */
/* Each row holds two code points; get_mirror_value searches both columns
   and yields the other member of the row. The initializer rows are not
   visible in this excerpt. */
2273 static unsigned int mirror_pairs[][2] =
/* Compute the mirror information for the code point ch.
   mirrored is true when unicode_attributes[ch] has a name and its mirrored
   flag is set. mirror_char starts as 0xfffd and is replaced by the partner
   of ch when ch occurs in either column of mirror_pairs. The visible
   return statement yields the signed difference mirror_char - ch.
   NOTE(review): the conditions around that return and around the final
   test of mirror_char against 0xfffd, and the other return statements, are
   not visible in this excerpt. */
2330 get_mirror_value (unsigned int ch)
2333 unsigned int mirror_char;
2336 mirrored = (unicode_attributes[ch].name != NULL
2337 && unicode_attributes[ch].mirrored);
/* 0xfffd serves as the "no partner found" marker. */
2338 mirror_char = 0xfffd;
2339 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2340 if (ch == mirror_pairs[i][0])
2342 mirror_char = mirror_pairs[i][1];
2345 else if (ch == mirror_pairs[i][1])
2347 mirror_char = mirror_pairs[i][0];
2351 return (int) mirror_char - (int) ch;
2354 if (mirror_char != 0xfffd)
2360 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation: TABLE is the name used below as
   struct mirror_table and as the prefix of mirror_table_init, _add and
   _finalize; ELEMENT is a signed 32-bit type, matching the signed
   differences produced by get_mirror_value. xmalloc and xrealloc are
   mapped to plain malloc and realloc.
   NOTE(review): no DEFAULT definition and no consuming directive are
   visible in this excerpt. */
2361 #define TABLE mirror_table
2362 #define ELEMENT int32_t
2364 #define xmalloc malloc
2365 #define xrealloc realloc
2368 /* Output the per-character mirror table. */
/* Write the mirror lookup table to the file named filename.
   The output consists of header comments, five mirror_header_N macros
   taken from the first five 32-bit words of t.result, and a static const
   struct u_mirror with members level1, level2 and an unpacked int level3.
   Failures to open or to write the file are reported on stderr. */
2370 output_mirror (const char *filename, const char *version)
2374 struct mirror_table t;
2375 unsigned int level1_offset, level2_offset, level3_offset;
2377 stream = fopen (filename, "w");
2380 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2384 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2385 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2386 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2391 mirror_table_init (&t);
/* Record the result of get_mirror_value for every code point
   0..0x10FFFF. */
2393 for (ch = 0; ch < 0x110000; ch++)
2395 int value = get_mirror_value (ch);
2397 mirror_table_add (&t, ch, value);
2400 mirror_table_finalize (&t);
2402 /* Offsets in t.result, in memory of this process. */
/* Byte offsets past the five 32-bit header words, past level1, and past
   level2.
   NOTE(review): the assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset, in that
   order. */
2404 5 * sizeof (uint32_t);
2406 5 * sizeof (uint32_t)
2407 + t.level1_size * sizeof (uint32_t);
2409 5 * sizeof (uint32_t)
2410 + t.level1_size * sizeof (uint32_t)
2411 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the uint32_t header word is printed with %d. */
2413 for (i = 0; i < 5; i++)
2414 fprintf (stream, "#define mirror_header_%d %d\n", i,
2415 ((uint32_t *) t.result)[i]);
2416 fprintf (stream, "static const\n");
2417 fprintf (stream, "struct\n");
2418 fprintf (stream, " {\n");
2419 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2420 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2421 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2422 fprintf (stream, " }\n");
2423 fprintf (stream, "u_mirror =\n");
2424 fprintf (stream, "{\n");
/* level1 initializer, 8 entries per output row: -1 or an index into level2
   (offset relative to level2_offset, in units of uint32_t). The condition
   selecting between the two fprintf calls is not visible here. */
2425 fprintf (stream, " {");
2426 if (t.level1_size > 8)
2427 fprintf (stream, "\n ");
2428 for (i = 0; i < t.level1_size; i++)
2431 if (i > 0 && (i % 8) == 0)
2432 fprintf (stream, "\n ");
2433 offset = ((uint32_t *) (t.result + level1_offset))[i];
2435 fprintf (stream, " %5d", -1);
2437 fprintf (stream, " %5zd",
2438 (offset - level2_offset) / sizeof (uint32_t));
2439 if (i+1 < t.level1_size)
2440 fprintf (stream, ",");
2442 if (t.level1_size > 8)
2443 fprintf (stream, "\n ");
2444 fprintf (stream, " },\n");
/* level2 initializer: -1 or an index into level3; the divisor is
   sizeof (int32_t) because level3 elements are 32 bits wide here. */
2445 fprintf (stream, " {");
2446 if (t.level2_size << t.q > 8)
2447 fprintf (stream, "\n ");
2448 for (i = 0; i < t.level2_size << t.q; i++)
2451 if (i > 0 && (i % 8) == 0)
2452 fprintf (stream, "\n ");
2453 offset = ((uint32_t *) (t.result + level2_offset))[i];
2455 fprintf (stream, " %5d", -1);
2457 fprintf (stream, " %5zd",
2458 (offset - level3_offset) / sizeof (int32_t));
2459 if (i+1 < t.level2_size << t.q)
2460 fprintf (stream, ",");
2462 if (t.level2_size << t.q > 8)
2463 fprintf (stream, "\n ");
2464 fprintf (stream, " },\n");
/* level3 initializer: the signed values are printed as they are, 8 per
   output row, without packing. */
2465 fprintf (stream, " {");
2466 if (t.level3_size << t.p > 8)
2467 fprintf (stream, "\n ");
2468 for (i = 0; i < t.level3_size << t.p; i++)
2470 if (i > 0 && (i % 8) == 0)
2471 fprintf (stream, "\n ");
2472 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2473 if (i+1 < t.level3_size << t.p)
2474 fprintf (stream, ",");
2476 if (t.level3_size << t.p > 8)
2477 fprintf (stream, "\n ");
2478 fprintf (stream, " }\n");
2479 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error. */
2481 if (ferror (stream) || fclose (stream))
2483 fprintf (stderr, "error writing to '%s'\n", filename);
2488 /* ========================================================================= */
2492 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers. fill_properties uses them as bit numbers: a code
   point with property x has bit (1ULL << PROP_x) set in its
   unicode_properties element. Only part of the enumerator list, and not
   the enclosing declaration header, is visible in this excerpt. */
2501 PROP_QUOTATION_MARK,
2502 PROP_TERMINAL_PUNCTUATION,
2505 PROP_ASCII_HEX_DIGIT,
2506 PROP_OTHER_ALPHABETIC,
2510 PROP_OTHER_LOWERCASE,
2511 PROP_OTHER_UPPERCASE,
2512 PROP_NONCHARACTER_CODE_POINT,
2513 PROP_OTHER_GRAPHEME_EXTEND,
2514 PROP_IDS_BINARY_OPERATOR,
2515 PROP_IDS_TRINARY_OPERATOR,
2517 PROP_UNIFIED_IDEOGRAPH,
2518 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2521 PROP_LOGICAL_ORDER_EXCEPTION,
2522 PROP_OTHER_ID_START,
2523 PROP_OTHER_ID_CONTINUE,
2525 PROP_VARIATION_SELECTOR,
2526 PROP_PATTERN_WHITE_SPACE,
2527 PROP_PATTERN_SYNTAX,
2528 /* DerivedCoreProperties.txt */
2537 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2538 PROP_GRAPHEME_EXTEND,
/* One bit set per code point 0..0x10FFFF; bit number PROP_x is set by
   fill_properties when the code point has property x. */
2542 unsigned long long unicode_properties[0x110000];
/* Reset unicode_properties so that no code point has any property bit
   set. */
2545 clear_properties (void)
2549 for (i = 0; i < 0x110000; i++)
2550 unicode_properties[i] = 0;
2553 /* Stores in unicode_properties[] the properties from the
2554 PropList.txt or DerivedCoreProperties.txt file. */
/* Read the property file named proplist_filename and, for every code point
   range and property name listed in it, set the corresponding PROP_* bit
   in unicode_properties. Problems opening, parsing or reading the file and
   unknown property names are reported on stderr.
   NOTE(review): the loop header, the declaration of buf and the statements
   following each diagnostic are not visible in this excerpt. */
2556 fill_properties (const char *proplist_filename)
2561 stream = fopen (proplist_filename, "r");
2564 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2571 unsigned int i1, i2;
2572 char padding[200+1];
2573 char propname[200+1];
2574 unsigned int propvalue;
/* Read one line of at most 200 characters and consume the whitespace that
   follows it. */
2576 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Test for empty lines and comment lines. */
2579 if (buf[0] == '\0' || buf[0] == '#')
/* Two accepted forms: a range "XXXX..YYYY" or a single code point "XXXX",
   each followed by a run of spaces and semicolons and then the property
   name up to the next space.
   NOTE(review): the %[ conversions carry no field width; padding and
   propname hold 201 bytes each, so this relies on buf never exceeding 200
   characters -- confirm against the declaration of buf. */
2582 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2584 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2586 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP invocation expands to an if statement ending in else, so the
   list below forms one chain that assigns propvalue for the first matching
   name. */
2591 #define PROP(name,value) \
2592 if (strcmp (propname, name) == 0) propvalue = value; else
2594 PROP ("White_Space", PROP_WHITE_SPACE)
2595 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2596 PROP ("Join_Control", PROP_JOIN_CONTROL)
2597 PROP ("Dash", PROP_DASH)
2598 PROP ("Hyphen", PROP_HYPHEN)
2599 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2600 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2601 PROP ("Other_Math", PROP_OTHER_MATH)
2602 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2603 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2604 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2605 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2606 PROP ("Diacritic", PROP_DIACRITIC)
2607 PROP ("Extender", PROP_EXTENDER)
2608 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2609 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2610 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2611 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2612 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2613 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2614 PROP ("Radical", PROP_RADICAL)
2615 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2616 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2617 PROP ("Deprecated", PROP_DEPRECATED)
2618 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2619 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2620 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2621 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2622 PROP ("STerm", PROP_STERM)
2623 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2624 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2625 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2626 /* DerivedCoreProperties.txt */
2627 PROP ("Math", PROP_MATH)
2628 PROP ("Alphabetic", PROP_ALPHABETIC)
2629 PROP ("Lowercase", PROP_LOWERCASE)
2630 PROP ("Uppercase", PROP_UPPERCASE)
2631 PROP ("ID_Start", PROP_ID_START)
2632 PROP ("ID_Continue", PROP_ID_CONTINUE)
2633 PROP ("XID_Start", PROP_XID_START)
2634 PROP ("XID_Continue", PROP_XID_CONTINUE)
2635 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2636 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2637 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2638 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2641 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and lie within 0..0x10FFFF; the action taken
   otherwise is not visible here. */
2645 if (!(i1 <= i2 && i2 < 0x110000))
/* 1ULL keeps the shift in a 64-bit type. */
2648 for (i = i1; i <= i2; i++)
2649 unicode_properties[i] |= 1ULL << propvalue;
/* A stream error or a failing fclose is reported as a read error. */
2652 if (ferror (stream) || fclose (stream))
2654 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2659 /* Stores in array the given property from the Unicode 3.0 PropList.txt file. */
/* Arguments: ARRAY is the per-code-point output (0x110000 entries),
   PROPLIST_FILENAME is the file opened for reading, and PROPERTY_NAME is the
   text that identifies the wanted section through strstr. Problems are
   reported on stderr; what follows each message is on lines not visible in
   this excerpt. */
2662 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
/* Walks over all 0x110000 code points before the file is read (loop body not
   visible here; presumably it clears ARRAY -- TODO confirm). */
2668 for (i = 0; i < 0x110000; i++)
2671 stream = fopen (proplist_filename, "r");
2674 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2678 /* Search for the "Property dump for: ..." line. */
/* Lines are read one at a time, at most 100 characters each, until one
   contains PROPERTY_NAME. */
2681 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2683 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2687 while (strstr (buf, property_name) == NULL);
2691 unsigned int i1, i2;
2693 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line with ".." at offsets 4 and 5 is parsed as a range made of two
   hexadecimal values of up to 4 digits. */
2697 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2699 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2701 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least 4 characters is parsed as a single
   hexadecimal value (how i2 is set in that case is not visible here). */
2706 else if (strlen (buf) >= 4)
2708 if (sscanf (buf, "%4X", &i1) < 1)
2710 fprintf (stderr, "parse error in property in '%s'\n",
2718 fprintf (stderr, "parse error in property in '%s'\n",
/* Sanity check of the parsed range against the code point space, followed by
   a loop over the range (loop body not visible here). */
2722 if (!(i1 <= i2 && i2 < 0x110000))
2724 for (i = i1; i <= i2; i++)
/* Both a read error and a failing fclose are reported. */
2727 if (ferror (stream) || fclose (stream))
2729 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2734 /* Properties from Unicode 3.0 PropList.txt file. */
/* Each array below has one char per code point (0x110000 entries) and is
   handed to fill_property30 by fill_properties30. */
2736 /* The paired punctuation property from the PropList.txt file. */
2737 char unicode_pairedpunctuation[0x110000];
2739 /* The left of pair property from the PropList.txt file. */
2740 char unicode_leftofpair[0x110000];
/* Fills unicode_pairedpunctuation and unicode_leftofpair from
   PROPLIST30_FILENAME, using "(Paired Punctuation)" and "(Left of Pair)" as
   the texts that fill_property30 searches for. */
2743 fill_properties30 (const char *proplist30_filename)
2745 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2746 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2749 /* ------------------------------------------------------------------------- */
2751 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_WHITE_SPACE bit is set in
   unicode_properties[ch]. */
2753 is_property_white_space (unsigned int ch)
2755 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2758 /* See Unicode 3.0 book, section 4.10,
2759 PropList.txt, UCD.html,
2760 DerivedCoreProperties.txt, UCD.html. */
/* Cross-checks two sources for Alphabetic. result1 is an || chain built from
   the PROP_OTHER_ALPHABETIC bit and the hard-coded ranges below (its first
   operand is not visible in this excerpt). result2 is the PROP_ALPHABETIC
   bit, which the property name table lists under DerivedCoreProperties.txt.
   The two are compared; the reaction to a mismatch and the return statement
   are on lines not shown here. */
2762 is_property_alphabetic (unsigned int ch)
2766 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2767 /* For some reason, the following are listed as having property
2768 Alphabetic but not as having property Other_Alphabetic. */
2769 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2770 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2771 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2772 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2773 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2774 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2775 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2776 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2777 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2778 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2779 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2780 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2782 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2784 if (result1 != result2)
2789 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_ALPHABETIC bit is set in
   unicode_properties[ch]. */
2791 is_property_other_alphabetic (unsigned int ch)
2793 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2796 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_NONCHARACTER_CODE_POINT bit is set in
   unicode_properties[ch]. */
2798 is_property_not_a_character (unsigned int ch)
2800 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2803 /* See PropList.txt, UCD.html,
2804 DerivedCoreProperties.txt, UCD.html. */
/* Cross-checks two sources for Default_Ignorable_Code_Point. The first
   expression accepts category Cf characters other than U+FFF9..U+FFFB,
   U+0600..U+0603, U+06DD and U+070F, plus anything carrying the
   PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT or PROP_VARIATION_SELECTOR bit.
   The second is the PROP_DEFAULT_IGNORABLE_CODE_POINT bit. The reaction to
   a mismatch and the return statement are on lines not shown here. */
2806 is_property_default_ignorable_code_point (unsigned int ch)
2809 (is_category_Cf (ch)
2810 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2811 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2812 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2813 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2815 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2817 if (result1 != result2)
2822 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit is set
   in unicode_properties[ch]. */
2824 is_property_other_default_ignorable_code_point (unsigned int ch)
2826 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2829 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_DEPRECATED bit is set in
   unicode_properties[ch]. */
2831 is_property_deprecated (unsigned int ch)
2833 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2836 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_LOGICAL_ORDER_EXCEPTION bit is set in
   unicode_properties[ch]. */
2838 is_property_logical_order_exception (unsigned int ch)
2840 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2843 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_VARIATION_SELECTOR bit is set in
   unicode_properties[ch]. */
2845 is_property_variation_selector (unsigned int ch)
2847 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2850 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff ch lies in one of three fixed ranges: U+E000..U+F8FF,
   U+F0000..U+FFFFD or U+100000..U+10FFFD. No table is consulted. */
2852 is_property_private_use (unsigned int ch)
2854 /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */
2855 return (ch >= 0xE000 && ch <= 0xF8FF)
2856 || (ch >= 0xF0000 && ch <= 0xFFFFD)
2857 || (ch >= 0x100000 && ch <= 0x10FFFD);
2860 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff is_category_Cn (ch) holds and
   is_property_not_a_character (ch) does not. */
2862 is_property_unassigned_code_value (unsigned int ch)
2864 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
2867 /* See PropList.txt, UCD.html,
2868 DerivedCoreProperties.txt, UCD.html. */
/* Cross-checks two sources for Uppercase: an || expression ending in the
   PROP_OTHER_UPPERCASE bit (its first operand is not visible in this
   excerpt) against the PROP_UPPERCASE bit. The reaction to a mismatch and
   the return statement are on lines not shown here. */
2870 is_property_uppercase (unsigned int ch)
2874 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2876 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2878 if (result1 != result2)
2883 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_UPPERCASE bit is set in
   unicode_properties[ch]. */
2885 is_property_other_uppercase (unsigned int ch)
2887 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2890 /* See PropList.txt, UCD.html,
2891 DerivedCoreProperties.txt, UCD.html. */
/* Cross-checks two sources for Lowercase: an || expression ending in the
   PROP_OTHER_LOWERCASE bit (its first operand is not visible in this
   excerpt) against the PROP_LOWERCASE bit. The reaction to a mismatch and
   the return statement are on lines not shown here. */
2893 is_property_lowercase (unsigned int ch)
2897 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2899 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2901 if (result1 != result2)
2906 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_LOWERCASE bit is set in
   unicode_properties[ch]. */
2908 is_property_other_lowercase (unsigned int ch)
2910 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2913 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Lt reports for ch. */
2915 is_property_titlecase (unsigned int ch)
2917 return is_category_Lt (ch);
2920 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_SOFT_DOTTED bit is set in
   unicode_properties[ch]. */
2922 is_property_soft_dotted (unsigned int ch)
2924 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2927 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_ID_START bit is set in
   unicode_properties[ch]. */
2929 is_property_id_start (unsigned int ch)
2931 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2934 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_ID_START bit is set in
   unicode_properties[ch]. */
2936 is_property_other_id_start (unsigned int ch)
2938 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2941 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_ID_CONTINUE bit is set in
   unicode_properties[ch]. */
2943 is_property_id_continue (unsigned int ch)
2945 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2948 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_ID_CONTINUE bit is set in
   unicode_properties[ch]. */
2950 is_property_other_id_continue (unsigned int ch)
2952 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2955 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_XID_START bit is set in
   unicode_properties[ch]. */
2957 is_property_xid_start (unsigned int ch)
2959 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2962 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_XID_CONTINUE bit is set in
   unicode_properties[ch]. */
2964 is_property_xid_continue (unsigned int ch)
2966 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2969 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_PATTERN_WHITE_SPACE bit is set in
   unicode_properties[ch]. */
2971 is_property_pattern_white_space (unsigned int ch)
2973 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2976 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_PATTERN_SYNTAX bit is set in
   unicode_properties[ch]. */
2978 is_property_pattern_syntax (unsigned int ch)
2980 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2983 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_JOIN_CONTROL bit is set in
   unicode_properties[ch]. */
2985 is_property_join_control (unsigned int ch)
2987 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2990 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_GRAPHEME_BASE bit is set in
   unicode_properties[ch]. */
2992 is_property_grapheme_base (unsigned int ch)
2994 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2997 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_GRAPHEME_EXTEND bit is set in
   unicode_properties[ch]. */
2999 is_property_grapheme_extend (unsigned int ch)
3001 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3004 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_GRAPHEME_EXTEND bit is set in
   unicode_properties[ch]. */
3006 is_property_other_grapheme_extend (unsigned int ch)
3008 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3011 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns nonzero iff the PROP_GRAPHEME_LINK bit is set in
   unicode_properties[ch]. */
3013 is_property_grapheme_link (unsigned int ch)
3015 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3018 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_BIDI_CONTROL bit is set in
   unicode_properties[ch]. */
3020 is_property_bidi_control (unsigned int ch)
3022 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3025 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_L. */
3027 is_property_bidi_left_to_right (unsigned int ch)
3029 return (get_bidi_category (ch) == UC_BIDI_L);
3032 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_R. */
3034 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3036 return (get_bidi_category (ch) == UC_BIDI_R);
3039 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_AL. */
3041 is_property_bidi_arabic_right_to_left (unsigned int ch)
3043 return (get_bidi_category (ch) == UC_BIDI_AL);
3046 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_EN. */
3048 is_property_bidi_european_digit (unsigned int ch)
3050 return (get_bidi_category (ch) == UC_BIDI_EN);
3053 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_ES. */
3055 is_property_bidi_eur_num_separator (unsigned int ch)
3057 return (get_bidi_category (ch) == UC_BIDI_ES);
3060 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_ET. */
3062 is_property_bidi_eur_num_terminator (unsigned int ch)
3064 return (get_bidi_category (ch) == UC_BIDI_ET);
3067 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_AN. */
3069 is_property_bidi_arabic_digit (unsigned int ch)
3071 return (get_bidi_category (ch) == UC_BIDI_AN);
3074 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_CS. */
3076 is_property_bidi_common_separator (unsigned int ch)
3078 return (get_bidi_category (ch) == UC_BIDI_CS);
3081 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_B. */
3083 is_property_bidi_block_separator (unsigned int ch)
3085 return (get_bidi_category (ch) == UC_BIDI_B);
3088 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_S. */
3090 is_property_bidi_segment_separator (unsigned int ch)
3092 return (get_bidi_category (ch) == UC_BIDI_S);
3095 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_WS. */
3097 is_property_bidi_whitespace (unsigned int ch)
3099 return (get_bidi_category (ch) == UC_BIDI_WS);
3102 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_NSM. */
3104 is_property_bidi_non_spacing_mark (unsigned int ch)
3106 return (get_bidi_category (ch) == UC_BIDI_NSM);
3109 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_BN. */
3111 is_property_bidi_boundary_neutral (unsigned int ch)
3113 return (get_bidi_category (ch) == UC_BIDI_BN);
3116 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_PDF. */
3118 is_property_bidi_pdf (unsigned int ch)
3120 return (get_bidi_category (ch) == UC_BIDI_PDF);
3123 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is one of UC_BIDI_LRE,
   UC_BIDI_LRO, UC_BIDI_RLE or UC_BIDI_RLO. The category is looked up once
   and compared four times. */
3125 is_property_bidi_embedding_or_override (unsigned int ch)
3127 int category = get_bidi_category (ch);
3128 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3129 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3132 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff get_bidi_category (ch) is UC_BIDI_ON. */
3134 is_property_bidi_other_neutral (unsigned int ch)
3136 return (get_bidi_category (ch) == UC_BIDI_ON);
3139 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_HEX_DIGIT bit is set in
   unicode_properties[ch]. */
3141 is_property_hex_digit (unsigned int ch)
3143 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3146 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_ASCII_HEX_DIGIT bit is set in
   unicode_properties[ch]. */
3148 is_property_ascii_hex_digit (unsigned int ch)
3150 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3153 /* See Unicode 3.0 book, section 4.10,
3154 PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_IDEOGRAPHIC bit is set in
   unicode_properties[ch]. */
3156 is_property_ideographic (unsigned int ch)
3158 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3161 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_UNIFIED_IDEOGRAPH bit is set in
   unicode_properties[ch]. */
3163 is_property_unified_ideograph (unsigned int ch)
3165 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3168 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_RADICAL bit is set in
   unicode_properties[ch]. */
3170 is_property_radical (unsigned int ch)
3172 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3175 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_IDS_BINARY_OPERATOR bit is set in
   unicode_properties[ch]. */
3177 is_property_ids_binary_operator (unsigned int ch)
3179 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3182 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_IDS_TRINARY_OPERATOR bit is set in
   unicode_properties[ch]. */
3184 is_property_ids_trinary_operator (unsigned int ch)
3186 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3189 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff is_category_Cf (ch) holds, or ch has a name in
   unicode_attributes and that name contains the text "ZERO WIDTH". */
3191 is_property_zero_width (unsigned int ch)
3193 return is_category_Cf (ch)
3194 || (unicode_attributes[ch].name != NULL
3195 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3198 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Zs reports for ch. */
3200 is_property_space (unsigned int ch)
3202 return is_category_Zs (ch);
3205 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff ch equals one of the sixteen code points enumerated
   below; no table is consulted.
   NOTE(review): the comment that opens two lines further down has lost its
   closing line in this copy, so it runs into the return statement; restore
   the closing line before compiling. */
3207 is_property_non_break (unsigned int ch)
3209 /* This is exactly the set of characters having line breaking
3211 return (ch == 0x00A0 /* NO-BREAK SPACE */
3212 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
3213 || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
3214 || ch == 0x035D /* COMBINING DOUBLE BREVE */
3215 || ch == 0x035E /* COMBINING DOUBLE MACRON */
3216 || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
3217 || ch == 0x0360 /* COMBINING DOUBLE TILDE */
3218 || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
3219 || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
3220 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
3221 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
3222 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
3223 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
3224 || ch == 0x2007 /* FIGURE SPACE */
3225 || ch == 0x2011 /* NON-BREAKING HYPHEN */
3226 || ch == 0x202F /* NARROW NO-BREAK SPACE */);
3229 /* See PropList-3.0.1.txt. */
/* Cross-checks two definitions: one tests whether the name recorded in
   unicode_attributes[ch] is exactly "<control>", the other is
   is_category_Cc (ch). The reaction to a mismatch and the return statement
   are on lines not visible in this excerpt. */
3231 is_property_iso_control (unsigned int ch)
3234 (unicode_attributes[ch].name != NULL
3235 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3237 is_category_Cc (ch);
3239 if (result1 != result2)
3244 /* See PropList-3.0.1.txt. */
/* Requires is_category_Cf (ch), bidi category UC_BIDI_BN and a false
   is_property_join_control (ch). The expression continues on lines not
   visible in this excerpt, so further conditions may apply. */
3246 is_property_format_control (unsigned int ch)
3248 return (is_category_Cf (ch)
3249 && get_bidi_category (ch) == UC_BIDI_BN
3250 && !is_property_join_control (ch)
3254 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_DASH bit is set in unicode_properties[ch]. */
3256 is_property_dash (unsigned int ch)
3258 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3261 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_HYPHEN bit is set in unicode_properties[ch]. */
3263 is_property_hyphen (unsigned int ch)
3265 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3268 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_P reports for ch. */
3270 is_property_punctuation (unsigned int ch)
3272 return is_category_P (ch);
3275 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Zl reports for ch. */
3277 is_property_line_separator (unsigned int ch)
3279 return is_category_Zl (ch);
3282 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Zp reports for ch. */
3284 is_property_paragraph_separator (unsigned int ch)
3286 return is_category_Zp (ch);
3289 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_QUOTATION_MARK bit is set in
   unicode_properties[ch]. */
3291 is_property_quotation_mark (unsigned int ch)
3293 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3296 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_STERM bit is set in unicode_properties[ch];
   the property name table maps "STerm" to that bit. */
3298 is_property_sentence_terminal (unsigned int ch)
3300 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3303 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_TERMINAL_PUNCTUATION bit is set in
   unicode_properties[ch]. */
3305 is_property_terminal_punctuation (unsigned int ch)
3307 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3310 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Sc reports for ch. */
3312 is_property_currency_symbol (unsigned int ch)
3314 return is_category_Sc (ch);
3317 /* See Unicode 3.0 book, section 4.9,
3318 PropList.txt, UCD.html,
3319 DerivedCoreProperties.txt, UCD.html. */
/* Cross-checks two sources for Math: an || expression ending in the
   PROP_OTHER_MATH bit (its first operand is not visible in this excerpt)
   against the PROP_MATH bit. The reaction to a mismatch and the return
   statement are on lines not shown here. */
3321 is_property_math (unsigned int ch)
3325 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3327 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3329 if (result1 != result2)
3334 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_OTHER_MATH bit is set in
   unicode_properties[ch]. */
3336 is_property_other_math (unsigned int ch)
3338 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3341 /* See PropList-3.0.1.txt. */
/* Returns the unicode_pairedpunctuation entry for ch, the array handed to
   fill_property30 together with "(Paired Punctuation)". */
3343 is_property_paired_punctuation (unsigned int ch)
3345 return unicode_pairedpunctuation[ch];
3348 /* See PropList-3.0.1.txt. */
/* Returns the unicode_leftofpair entry for ch, the array handed to
   fill_property30 together with "(Left of Pair)". */
3350 is_property_left_of_pair (unsigned int ch)
3352 return unicode_leftofpair[ch];
3355 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff ch has a name in unicode_attributes and either its
   combining field is not the string "0" or one of is_category_Mc,
   is_category_Me, is_category_Mn holds. The combining field is
   dereferenced after only the name has been checked against NULL. */
3357 is_property_combining (unsigned int ch)
3359 return (unicode_attributes[ch].name != NULL
3360 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3361 || is_category_Mc (ch)
3362 || is_category_Me (ch)
3363 || is_category_Mn (ch)));
/* Disabled with #if 0; the condition's own comment states that the result
   is the same as is_property_bidi_non_spacing_mark. */
3366 #if 0 /* same as is_property_bidi_non_spacing_mark */
3367 /* See PropList-3.0.1.txt. */
/* Would return nonzero iff ch has a name in unicode_attributes and its bidi
   category is UC_BIDI_NSM. */
3369 is_property_non_spacing (unsigned int ch)
3371 return (unicode_attributes[ch].name != NULL
3372 && get_bidi_category (ch) == UC_BIDI_NSM);
3376 /* See PropList-3.0.1.txt. */
/* Two tests are visible: a fixed range labelled Hangul Syllables, and an
   inspection of the decomposition string of characters that have both a
   name and a decomposition. The statements taken when a test succeeds or
   fails are partly on lines not visible in this excerpt. */
3378 is_property_composite (unsigned int ch)
3380 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3381 logical in some sense. */
/* NOTE(review): the precomposed Hangul syllables end at U+D7A3, so the upper
   bound 0xD7A4 below looks off by one -- confirm against UnicodeData.txt. */
3382 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3384 if (unicode_attributes[ch].name != NULL
3385 && unicode_attributes[ch].decomposition != NULL)
3387 /* Test whether the decomposition contains more than one character,
3388 and the first is not a space. */
3389 const char *decomp = unicode_attributes[ch].decomposition;
/* A leading '<' is skipped up to and including the next '>'.
   NOTE(review): the strchr result is used without a NULL check, so a '<'
   with no matching '>' would yield an invalid pointer. */
3390 if (decomp[0] == '<')
3392 decomp = strchr (decomp, '>') + 1;
3393 if (decomp[0] == ' ')
/* Composite means: the remaining text contains a space (so more than one
   code is listed) and does not start with "0020 ". */
3396 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3401 /* See PropList-3.0.1.txt. */
/* Returns whatever is_category_Nd reports for ch. */
3403 is_property_decimal_digit (unsigned int ch)
3405 return is_category_Nd (ch);
3408 /* See PropList-3.0.1.txt. */
/* Returns nonzero iff the value returned by get_numeric_value (ch) has a
   denominator greater than 0, or ch is one of the two code points listed
   explicitly below. */
3410 is_property_numeric (unsigned int ch)
3412 return ((get_numeric_value (ch)).denominator > 0)
3413 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3414 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3417 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_DIACRITIC bit is set in
   unicode_properties[ch]. */
3419 is_property_diacritic (unsigned int ch)
3421 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3424 /* See PropList.txt, UCD.html. */
/* Returns nonzero iff the PROP_EXTENDER bit is set in
   unicode_properties[ch]. */
3426 is_property_extender (unsigned int ch)
3428 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3431 /* See PropList-3.0.1.txt. */
/* The visible part accepts category Cc characters whose bidi category is
   UC_BIDI_BN, and all category Cf characters. The expression continues on
   a line not visible in this excerpt, so a further condition may apply. */
3433 is_property_ignorable_control (unsigned int ch)
3435 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3436 || is_category_Cf (ch))
3440 /* ------------------------------------------------------------------------- */
3442 /* Output all properties. */
/* For each property P named below, the PROPERTY macro calls
   debug_output_predicate, output_predicate_test and output_predicate with
   file names derived from P and with the predicate is_property_P. VERSION
   is forwarded to output_predicate only. Some PROPERTY invocations of the
   original list are not visible in this excerpt. */
3444 output_properties (const char *version)
/* #P pastes the property name into the string literals; ## builds the
   predicate's identifier. */
3446 #define PROPERTY(P) \
3447 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3448 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3449 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3450 PROPERTY(white_space)
3451 PROPERTY(alphabetic)
3452 PROPERTY(other_alphabetic)
3453 PROPERTY(not_a_character)
3454 PROPERTY(default_ignorable_code_point)
3455 PROPERTY(other_default_ignorable_code_point)
3456 PROPERTY(deprecated)
3457 PROPERTY(logical_order_exception)
3458 PROPERTY(variation_selector)
3459 PROPERTY(private_use)
3460 PROPERTY(unassigned_code_value)
3462 PROPERTY(other_uppercase)
3464 PROPERTY(other_lowercase)
3466 PROPERTY(soft_dotted)
3468 PROPERTY(other_id_start)
3469 PROPERTY(id_continue)
3470 PROPERTY(other_id_continue)
3472 PROPERTY(xid_continue)
3473 PROPERTY(pattern_white_space)
3474 PROPERTY(pattern_syntax)
3475 PROPERTY(join_control)
3476 PROPERTY(grapheme_base)
3477 PROPERTY(grapheme_extend)
3478 PROPERTY(other_grapheme_extend)
3479 PROPERTY(grapheme_link)
3480 PROPERTY(bidi_control)
3481 PROPERTY(bidi_left_to_right)
3482 PROPERTY(bidi_hebrew_right_to_left)
3483 PROPERTY(bidi_arabic_right_to_left)
3484 PROPERTY(bidi_european_digit)
3485 PROPERTY(bidi_eur_num_separator)
3486 PROPERTY(bidi_eur_num_terminator)
3487 PROPERTY(bidi_arabic_digit)
3488 PROPERTY(bidi_common_separator)
3489 PROPERTY(bidi_block_separator)
3490 PROPERTY(bidi_segment_separator)
3491 PROPERTY(bidi_whitespace)
3492 PROPERTY(bidi_non_spacing_mark)
3493 PROPERTY(bidi_boundary_neutral)
3495 PROPERTY(bidi_embedding_or_override)
3496 PROPERTY(bidi_other_neutral)
3498 PROPERTY(ascii_hex_digit)
3499 PROPERTY(ideographic)
3500 PROPERTY(unified_ideograph)
3502 PROPERTY(ids_binary_operator)
3503 PROPERTY(ids_trinary_operator)
3504 PROPERTY(zero_width)
3507 PROPERTY(iso_control)
3508 PROPERTY(format_control)
3511 PROPERTY(punctuation)
3512 PROPERTY(line_separator)
3513 PROPERTY(paragraph_separator)
3514 PROPERTY(quotation_mark)
3515 PROPERTY(sentence_terminal)
3516 PROPERTY(terminal_punctuation)
3517 PROPERTY(currency_symbol)
3519 PROPERTY(other_math)
3520 PROPERTY(paired_punctuation)
3521 PROPERTY(left_of_pair)
3524 PROPERTY(decimal_digit)
3528 PROPERTY(ignorable_control)
3532 /* ========================================================================= */
/* Script names in order of first appearance; numscripts counts the entries
   in use. */
3536 static const char *scripts[256];
3537 static unsigned int numscripts;
/* Per code point: index into scripts[], or (uint8_t)~(uint8_t)0 when
   fill_scripts assigned no script. */
3539 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME and records a script index for every listed code
   point in unicode_scripts, growing scripts[] as new names appear. Problems
   are reported on stderr; what follows each message is on lines not visible
   in this excerpt. */
3542 fill_scripts (const char *scripts_filename)
3547 stream = fopen (scripts_filename, "r");
3550 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Every code point starts out with the all-ones marker. */
3556 for (i = 0; i < 0x110000; i++)
3557 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3562 unsigned int i1, i2;
3563 char padding[200+1];
3564 char scriptname[200+1];
/* One line of at most 200 characters per iteration; empty lines and lines
   starting with '#' get separate treatment (not visible here). */
3567 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3570 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form, then the single code point form. The script
   name is everything up to the next space. */
3573 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3575 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3577 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look for an already known script, newest first.
   NOTE(review): the test script >= 0 only terminates the loop if script has
   a signed type; its declaration is not visible here -- confirm. */
3587 for (script = numscripts - 1; script >= 0; script--)
3588 if (strcmp (scripts[script], scriptname) == 0)
/* Unknown name: store a copy in the next free slot. The strdup result is
   not checked on any visible line. */
3592 scripts[numscripts] = strdup (scriptname);
3593 script = numscripts;
3595 if (numscripts == 256)
/* A code point that already carries a script is reported and then
   overwritten. */
3599 for (i = i1; i <= i2; i++)
3601 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3602 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3603 unicode_scripts[i] = script;
3607 if (ferror (stream) || fclose (stream))
3609 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3614 /* Construction of sparse 3-level tables. */
/* Parameters for a generic table template: the table type is named
   script_table, elements are uint8_t, and the default element is the same
   all-ones marker that fill_scripts stores for code points without a
   script. xmalloc and xrealloc are mapped to plain malloc and realloc. The
   line that consumes these macros is not visible in this excerpt. */
3615 #define TABLE script_table
3616 #define ELEMENT uint8_t
3617 #define DEFAULT (uint8_t)~(uint8_t)0
3618 #define xmalloc malloc
3619 #define xrealloc realloc
/* Writes "unictype/scripts.h": one interval array per script, the scripts[]
   table referring to them, and a 3-level lookup table named u_script built
   from unicode_scripts. VERSION appears in the generated banner. */
3623 output_scripts (const char *version)
3625 const char *filename = "unictype/scripts.h";
3627 unsigned int ch, s, i;
3628 struct script_table t;
3629 unsigned int level1_offset, level2_offset, level3_offset;
3633 const char *lowercase_name;
3636 scriptinfo_t scriptinfo[256];
3638 stream = fopen (filename, "w");
3641 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* NOTE(review): the banner below names gen-ctype.c, while the usage text at
   the top of this file calls the program gen-uni-tables -- confirm which
   name the generated files should carry. */
3645 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3646 fprintf (stream, "/* Unicode scripts. */\n");
3647 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* A copy of each script name is made for use in C identifiers; upper case
   ASCII letters are singled out (the statement applied to them is not
   visible here). The copies are not released on any visible line. */
3650 for (s = 0; s < numscripts; s++)
3652 char *lcp = strdup (scripts[s]);
3655 for (cp = lcp; *cp != '\0'; cp++)
3656 if (*cp >= 'A' && *cp <= 'Z')
3659 scriptinfo[s].lowercase_name = lcp;
/* Interval arrays: for each script, maximal runs of consecutive code points
   are located and written as interval entries. */
3662 for (s = 0; s < numscripts; s++)
3664 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3665 scriptinfo[s].lowercase_name);
3666 fprintf (stream, "{\n");
3668 for (ch = 0; ch < 0x110000; ch++)
3669 if (unicode_scripts[ch] == s)
3675 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3680 fprintf (stream, ",\n");
/* Two output shapes: a single entry with both flags set, or a pair of
   entries marking start and end separately. */
3682 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3684 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3688 fprintf (stream, "\n");
3689 fprintf (stream, "};\n");
/* The scripts[] table: interval count, interval array and original name. */
3692 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3693 fprintf (stream, "{\n");
3694 for (s = 0; s < numscripts; s++)
3696 fprintf (stream, " {\n");
3697 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3698 scriptinfo[s].lowercase_name);
3699 fprintf (stream, " script_%s_intervals,\n",
3700 scriptinfo[s].lowercase_name);
3701 fprintf (stream, " \"%s\"\n", scripts[s]);
3702 fprintf (stream, " }");
3703 if (s+1 < numscripts)
3704 fprintf (stream, ",");
3705 fprintf (stream, "\n");
3707 fprintf (stream, "};\n");
/* Build the 3-level table from every code point that has a script. */
3711 script_table_init (&t);
3713 for (ch = 0; ch < 0x110000; ch++)
/* This inner s shadows the s declared at the top of the function. */
3715 unsigned int s = unicode_scripts[ch];
3716 if (s != (uint8_t)~(uint8_t)0)
3717 script_table_add (&t, ch, s);
3720 script_table_finalize (&t);
3722 /* Offsets in t.result, in memory of this process. */
/* t.result is laid out as 5 header words, then level1, then level2, then
   level3; the sums below are byte offsets of the three levels. */
3724 5 * sizeof (uint32_t);
3726 5 * sizeof (uint32_t)
3727 + t.level1_size * sizeof (uint32_t);
3729 5 * sizeof (uint32_t)
3730 + t.level1_size * sizeof (uint32_t)
3731 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The 5 header words become preprocessor constants in the output. */
3733 for (i = 0; i < 5; i++)
3734 fprintf (stream, "#define script_header_%d %d\n", i,
3735 ((uint32_t *) t.result)[i]);
3736 fprintf (stream, "static const\n");
3737 fprintf (stream, "struct\n");
3738 fprintf (stream, " {\n");
3739 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3740 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3741 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3742 fprintf (stream, " }\n");
3743 fprintf (stream, "u_script =\n");
3744 fprintf (stream, "{\n");
/* Level 1: eight values per output line. An entry is written either as -1
   or as an index relative to the start of level 2 (the condition choosing
   between the two is not visible here). */
3745 fprintf (stream, " {");
3746 if (t.level1_size > 8)
3747 fprintf (stream, "\n ");
3748 for (i = 0; i < t.level1_size; i++)
3751 if (i > 0 && (i % 8) == 0)
3752 fprintf (stream, "\n ");
3753 offset = ((uint32_t *) (t.result + level1_offset))[i];
3755 fprintf (stream, " %5d", -1);
/* NOTE(review): the division by sizeof makes the argument unsigned, while
   %zd is the signed conversion; %zu would match -- confirm. */
3757 fprintf (stream, " %5zd",
3758 (offset - level2_offset) / sizeof (uint32_t));
3759 if (i+1 < t.level1_size)
3760 fprintf (stream, ",");
3762 if (t.level1_size > 8)
3763 fprintf (stream, "\n ");
3764 fprintf (stream, " },\n");
/* Level 2: same scheme, with indices relative to the start of level 3. */
3765 fprintf (stream, " {");
3766 if (t.level2_size << t.q > 8)
3767 fprintf (stream, "\n ");
3768 for (i = 0; i < t.level2_size << t.q; i++)
3771 if (i > 0 && (i % 8) == 0)
3772 fprintf (stream, "\n ");
3773 offset = ((uint32_t *) (t.result + level2_offset))[i];
3775 fprintf (stream, " %5d", -1);
3777 fprintf (stream, " %5zd",
3778 (offset - level3_offset) / sizeof (uint8_t));
3779 if (i+1 < t.level2_size << t.q)
3780 fprintf (stream, ",");
3782 if (t.level2_size << t.q > 8)
3783 fprintf (stream, "\n ");
3784 fprintf (stream, " },\n");
/* Level 3: the raw uint8_t script indices. */
3785 fprintf (stream, " {");
3786 if (t.level3_size << t.p > 8)
3787 fprintf (stream, "\n ");
3788 for (i = 0; i < t.level3_size << t.p; i++)
3790 if (i > 0 && (i % 8) == 0)
3791 fprintf (stream, "\n ");
3792 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3793 if (i+1 < t.level3_size << t.p)
3794 fprintf (stream, ",");
3796 if (t.level3_size << t.p > 8)
3797 fprintf (stream, "\n ");
3798 fprintf (stream, " }\n");
3799 fprintf (stream, "};\n");
/* Both a write error and a failing fclose are reported. */
3801 if (ferror (stream) || fclose (stream))
3803 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes "unictype/scripts_byname.gperf": a fixed set of gperf declarations
   followed by one "name, index" line per entry of scripts[]. VERSION
   appears in the generated banner. */
3809 output_scripts_byname (const char *version)
3811 const char *filename = "unictype/scripts_byname.gperf";
3815 stream = fopen (filename, "w");
3818 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* NOTE(review): the banner below names gen-ctype.c, while the usage text at
   the top of this file calls the program gen-uni-tables -- confirm. */
3822 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3823 fprintf (stream, "/* Unicode scripts. */\n");
3824 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* In the format strings, %% produces a literal percent sign in the output. */
3826 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3827 fprintf (stream, "%%struct-type\n");
3828 fprintf (stream, "%%language=ANSI-C\n");
3829 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3830 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3831 fprintf (stream, "%%readonly-tables\n");
3832 fprintf (stream, "%%global-table\n");
3833 fprintf (stream, "%%define word-array-name script_names\n");
3834 fprintf (stream, "%%%%\n");
/* Keyword section: each script name paired with its index in scripts[]. */
3835 for (s = 0; s < numscripts; s++)
3836 fprintf (stream, "%s, %u\n", scripts[s], s);
3838 if (ferror (stream) || fclose (stream))
3840 fprintf (stderr, "error writing to '%s'\n", filename);
3845 /* ========================================================================= */
/* One block: first code point, last code point and name. The typedef's
   name follows on a line not visible in this excerpt; the array below uses
   block_t. */
3849 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* Blocks in file order; numblocks counts the entries in use. */
3851 static block_t blocks[256];
3852 static unsigned int numblocks;
/* Reads BLOCKS_FILENAME and appends one entry to blocks[] per data line.
   Problems are reported on stderr; what follows each message is on lines
   not visible in this excerpt. */
3855 fill_blocks (const char *blocks_filename)
3859 stream = fopen (blocks_filename, "r");
3862 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3869 unsigned int i1, i2;
3870 char padding[200+1];
3871 char blockname[200+1];
/* One line of at most 200 characters per iteration; empty lines and lines
   starting with '#' get separate treatment (not visible here). */
3873 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3876 if (buf[0] == '\0' || buf[0] == '#')
/* Expected shape: START..END, then spaces or semicolons, then the block
   name, which extends up to a carriage return or the end of the line. */
3879 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3881 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* The name is copied with strdup; the result is not checked on any visible
   line. */
3884 blocks[numblocks].start = i1;
3885 blocks[numblocks].end = i2;
3886 blocks[numblocks].name = strdup (blockname);
3887 /* It must be sorted. */
/* That is: each block must start after the end of the previous one. */
3888 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3891 if (numblocks == 256)
3895 if (ferror (stream) || fclose (stream))
3897 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3902 /* Return the smallest block index among the blocks for characters >= ch. */
/* The search compares ch against blocks[].end, so it relies on blocks[]
   being sorted. The loop condition, the updates of lo and hi and the
   return statement are on lines not visible in this excerpt. */
3904 block_first_index (unsigned int ch)
3906 /* Binary search. */
3907 unsigned int lo = 0;
3908 unsigned int hi = numblocks;
/* NOTE(review): the two lines below are the body of an invariant comment
   whose opening line is missing from this copy; restore it before
   compiling. */
3910 All blocks[i], i < lo, have blocks[i].end < ch,
3911 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3914 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3915 if (blocks[mid].end < ch)
3923 /* Return the largest block index among the blocks for characters <= ch. */
/* The search compares ch against blocks[].start, so it relies on blocks[]
   being sorted. The loop condition, the updates of lo and hi and the
   return statement are on lines not visible in this excerpt. */
3926 block_last_index (unsigned int ch)
3928 /* Binary search. */
3929 unsigned int lo = 0;
3930 unsigned int hi = numblocks;
/* NOTE(review): the two lines below are the body of an invariant comment
   whose opening line is missing from this copy; restore it before
   compiling. */
3932 All blocks[i], i < lo, have blocks[i].start <= ch,
3933 all blocks[i], i >= hi, have blocks[i].start > ch. */
3936 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3937 if (blocks[mid].start <= ch)
/* Writes "unictype/blocks.h": the blocks[] table, a first-level index
   holding for every 256 code points below the threshold the first and last
   candidate block index, and the index range to use above the threshold.
   VERSION appears in the generated banner. */
3946 output_blocks (const char *version)
3948 const char *filename = "unictype/blocks.h";
3949 const unsigned int shift = 8; /* bits to shift away for array access */
3950 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3955 stream = fopen (filename, "w");
3958 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* NOTE(review): the banner below names gen-ctype.c, while the usage text at
   the top of this file calls the program gen-uni-tables -- confirm. */
3962 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3963 fprintf (stream, "/* Unicode blocks. */\n");
3964 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* The block list itself, in the order stored in blocks[]. */
3967 fprintf (stream, "static const uc_block_t blocks[] =\n");
3968 fprintf (stream, "{\n");
3969 for (i = 0; i < numblocks; i++)
3971 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3972 blocks[i].end, blocks[i].name);
3973 if (i+1 < numblocks)
3974 fprintf (stream, ",");
3975 fprintf (stream, "\n");
3977 fprintf (stream, "};\n");
/* The first-level table is declared as uint8_t, so block indices have to
   fit into 8 bits; fill_blocks tests numblocks against 256. */
3978 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3979 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3980 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3981 threshold >> shift);
3982 fprintf (stream, "{\n");
/* One pair per group of (1 << shift) code points: the index from
   block_first_index for the group's first code point and the index from
   block_last_index for its last one. */
3983 for (i1 = 0; i1 < (threshold >> shift); i1++)
3985 unsigned int first_index = block_first_index (i1 << shift);
3986 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3987 fprintf (stream, " %3d, %3d", first_index, last_index);
3988 if (i1+1 < (threshold >> shift))
3989 fprintf (stream, ",");
3990 fprintf (stream, "\n");
3992 fprintf (stream, "};\n");
/* Code points from the threshold up to 0x10FFFF share one index range. */
3993 fprintf (stream, "#define blocks_upper_first_index %d\n",
3994 block_first_index (threshold));
3995 fprintf (stream, "#define blocks_upper_last_index %d\n",
3996 block_last_index (0x10FFFF));
3998 if (ferror (stream) || fclose (stream))
4000 fprintf (stderr, "error writing to '%s'\n", filename);
4005 /* ========================================================================= */
4007 /* C and Java syntax. */
/* Identifier syntax categories; c_ident_category returns
   UC_IDENTIFIER_VALID for ASCII digits and UC_IDENTIFIER_START for ASCII
   letters and the underscore. The enclosing declaration is on lines not
   visible in this excerpt. */
4011 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4012 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4013 UC_IDENTIFIER_INVALID, /* not valid */
4014 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* Tells whether CH is one of the white-space characters of C source text:
   space, horizontal tab, new-line (line feed or carriage return),
   vertical tab or form-feed.  See ISO C 99 section 6.4.(3).  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
4028 /* ISO C 99 section 6.4.2.1 and appendix D. */
4030 c_ident_category (unsigned int ch)
4032 /* Section 6.4.2.1. */
4033 if (ch >= '0' && ch <= '9')
4034 return UC_IDENTIFIER_VALID;
4035 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4036 return UC_IDENTIFIER_START;
4042 || (ch >= 0x00C0 && ch <= 0x00D6)
4043 || (ch >= 0x00D8 && ch <= 0x00F6)
4044 || (ch >= 0x00F8 && ch <= 0x01F5)
4045 || (ch >= 0x01FA && ch <= 0x0217)
4046 || (ch >= 0x0250 && ch <= 0x02A8)
4047 || (ch >= 0x1E00 && ch <= 0x1E9B)
4048 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4052 || (ch >= 0x0388 && ch <= 0x038A)
4054 || (ch >= 0x038E && ch <= 0x03A1)
4055 || (ch >= 0x03A3 && ch <= 0x03CE)
4056 || (ch >= 0x03D0 && ch <= 0x03D6)
4061 || (ch >= 0x03E2 && ch <= 0x03F3)
4062 || (ch >= 0x1F00 && ch <= 0x1F15)
4063 || (ch >= 0x1F18 && ch <= 0x1F1D)
4064 || (ch >= 0x1F20 && ch <= 0x1F45)
4065 || (ch >= 0x1F48 && ch <= 0x1F4D)
4066 || (ch >= 0x1F50 && ch <= 0x1F57)
4070 || (ch >= 0x1F5F && ch <= 0x1F7D)
4071 || (ch >= 0x1F80 && ch <= 0x1FB4)
4072 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4073 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4074 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4075 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4076 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4077 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4078 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4079 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4081 || (ch >= 0x0401 && ch <= 0x040C)
4082 || (ch >= 0x040E && ch <= 0x044F)
4083 || (ch >= 0x0451 && ch <= 0x045C)
4084 || (ch >= 0x045E && ch <= 0x0481)
4085 || (ch >= 0x0490 && ch <= 0x04C4)
4086 || (ch >= 0x04C7 && ch <= 0x04C8)
4087 || (ch >= 0x04CB && ch <= 0x04CC)
4088 || (ch >= 0x04D0 && ch <= 0x04EB)
4089 || (ch >= 0x04EE && ch <= 0x04F5)
4090 || (ch >= 0x04F8 && ch <= 0x04F9)
4092 || (ch >= 0x0531 && ch <= 0x0556)
4093 || (ch >= 0x0561 && ch <= 0x0587)
4095 || (ch >= 0x05B0 && ch <= 0x05B9)
4096 || (ch >= 0x05BB && ch <= 0x05BD)
4098 || (ch >= 0x05C1 && ch <= 0x05C2)
4099 || (ch >= 0x05D0 && ch <= 0x05EA)
4100 || (ch >= 0x05F0 && ch <= 0x05F2)
4102 || (ch >= 0x0621 && ch <= 0x063A)
4103 || (ch >= 0x0640 && ch <= 0x0652)
4104 || (ch >= 0x0670 && ch <= 0x06B7)
4105 || (ch >= 0x06BA && ch <= 0x06BE)
4106 || (ch >= 0x06C0 && ch <= 0x06CE)
4107 || (ch >= 0x06D0 && ch <= 0x06DC)
4108 || (ch >= 0x06E5 && ch <= 0x06E8)
4109 || (ch >= 0x06EA && ch <= 0x06ED)
4111 || (ch >= 0x0901 && ch <= 0x0903)
4112 || (ch >= 0x0905 && ch <= 0x0939)
4113 || (ch >= 0x093E && ch <= 0x094D)
4114 || (ch >= 0x0950 && ch <= 0x0952)
4115 || (ch >= 0x0958 && ch <= 0x0963)
4117 || (ch >= 0x0981 && ch <= 0x0983)
4118 || (ch >= 0x0985 && ch <= 0x098C)
4119 || (ch >= 0x098F && ch <= 0x0990)
4120 || (ch >= 0x0993 && ch <= 0x09A8)
4121 || (ch >= 0x09AA && ch <= 0x09B0)
4123 || (ch >= 0x09B6 && ch <= 0x09B9)
4124 || (ch >= 0x09BE && ch <= 0x09C4)
4125 || (ch >= 0x09C7 && ch <= 0x09C8)
4126 || (ch >= 0x09CB && ch <= 0x09CD)
4127 || (ch >= 0x09DC && ch <= 0x09DD)
4128 || (ch >= 0x09DF && ch <= 0x09E3)
4129 || (ch >= 0x09F0 && ch <= 0x09F1)
4132 || (ch >= 0x0A05 && ch <= 0x0A0A)
4133 || (ch >= 0x0A0F && ch <= 0x0A10)
4134 || (ch >= 0x0A13 && ch <= 0x0A28)
4135 || (ch >= 0x0A2A && ch <= 0x0A30)
4136 || (ch >= 0x0A32 && ch <= 0x0A33)
4137 || (ch >= 0x0A35 && ch <= 0x0A36)
4138 || (ch >= 0x0A38 && ch <= 0x0A39)
4139 || (ch >= 0x0A3E && ch <= 0x0A42)
4140 || (ch >= 0x0A47 && ch <= 0x0A48)
4141 || (ch >= 0x0A4B && ch <= 0x0A4D)
4142 || (ch >= 0x0A59 && ch <= 0x0A5C)
4146 || (ch >= 0x0A81 && ch <= 0x0A83)
4147 || (ch >= 0x0A85 && ch <= 0x0A8B)
4149 || (ch >= 0x0A8F && ch <= 0x0A91)
4150 || (ch >= 0x0A93 && ch <= 0x0AA8)
4151 || (ch >= 0x0AAA && ch <= 0x0AB0)
4152 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4153 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4154 || (ch >= 0x0ABD && ch <= 0x0AC5)
4155 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4156 || (ch >= 0x0ACB && ch <= 0x0ACD)
4160 || (ch >= 0x0B01 && ch <= 0x0B03)
4161 || (ch >= 0x0B05 && ch <= 0x0B0C)
4162 || (ch >= 0x0B0F && ch <= 0x0B10)
4163 || (ch >= 0x0B13 && ch <= 0x0B28)
4164 || (ch >= 0x0B2A && ch <= 0x0B30)
4165 || (ch >= 0x0B32 && ch <= 0x0B33)
4166 || (ch >= 0x0B36 && ch <= 0x0B39)
4167 || (ch >= 0x0B3E && ch <= 0x0B43)
4168 || (ch >= 0x0B47 && ch <= 0x0B48)
4169 || (ch >= 0x0B4B && ch <= 0x0B4D)
4170 || (ch >= 0x0B5C && ch <= 0x0B5D)
4171 || (ch >= 0x0B5F && ch <= 0x0B61)
4173 || (ch >= 0x0B82 && ch <= 0x0B83)
4174 || (ch >= 0x0B85 && ch <= 0x0B8A)
4175 || (ch >= 0x0B8E && ch <= 0x0B90)
4176 || (ch >= 0x0B92 && ch <= 0x0B95)
4177 || (ch >= 0x0B99 && ch <= 0x0B9A)
4179 || (ch >= 0x0B9E && ch <= 0x0B9F)
4180 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4181 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4182 || (ch >= 0x0BAE && ch <= 0x0BB5)
4183 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4184 || (ch >= 0x0BBE && ch <= 0x0BC2)
4185 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4186 || (ch >= 0x0BCA && ch <= 0x0BCD)
4188 || (ch >= 0x0C01 && ch <= 0x0C03)
4189 || (ch >= 0x0C05 && ch <= 0x0C0C)
4190 || (ch >= 0x0C0E && ch <= 0x0C10)
4191 || (ch >= 0x0C12 && ch <= 0x0C28)
4192 || (ch >= 0x0C2A && ch <= 0x0C33)
4193 || (ch >= 0x0C35 && ch <= 0x0C39)
4194 || (ch >= 0x0C3E && ch <= 0x0C44)
4195 || (ch >= 0x0C46 && ch <= 0x0C48)
4196 || (ch >= 0x0C4A && ch <= 0x0C4D)
4197 || (ch >= 0x0C60 && ch <= 0x0C61)
4199 || (ch >= 0x0C82 && ch <= 0x0C83)
4200 || (ch >= 0x0C85 && ch <= 0x0C8C)
4201 || (ch >= 0x0C8E && ch <= 0x0C90)
4202 || (ch >= 0x0C92 && ch <= 0x0CA8)
4203 || (ch >= 0x0CAA && ch <= 0x0CB3)
4204 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4205 || (ch >= 0x0CBE && ch <= 0x0CC4)
4206 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4207 || (ch >= 0x0CCA && ch <= 0x0CCD)
4209 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4211 || (ch >= 0x0D02 && ch <= 0x0D03)
4212 || (ch >= 0x0D05 && ch <= 0x0D0C)
4213 || (ch >= 0x0D0E && ch <= 0x0D10)
4214 || (ch >= 0x0D12 && ch <= 0x0D28)
4215 || (ch >= 0x0D2A && ch <= 0x0D39)
4216 || (ch >= 0x0D3E && ch <= 0x0D43)
4217 || (ch >= 0x0D46 && ch <= 0x0D48)
4218 || (ch >= 0x0D4A && ch <= 0x0D4D)
4219 || (ch >= 0x0D60 && ch <= 0x0D61)
4221 || (ch >= 0x0E01 && ch <= 0x0E3A)
4222 || (ch >= 0x0E40 && ch <= 0x0E5B)
4224 || (ch >= 0x0E81 && ch <= 0x0E82)
4226 || (ch >= 0x0E87 && ch <= 0x0E88)
4229 || (ch >= 0x0E94 && ch <= 0x0E97)
4230 || (ch >= 0x0E99 && ch <= 0x0E9F)
4231 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4234 || (ch >= 0x0EAA && ch <= 0x0EAB)
4235 || (ch >= 0x0EAD && ch <= 0x0EAE)
4236 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4237 || (ch >= 0x0EBB && ch <= 0x0EBD)
4238 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4240 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4241 || (ch >= 0x0EDC && ch <= 0x0EDD)
4244 || (ch >= 0x0F18 && ch <= 0x0F19)
4248 || (ch >= 0x0F3E && ch <= 0x0F47)
4249 || (ch >= 0x0F49 && ch <= 0x0F69)
4250 || (ch >= 0x0F71 && ch <= 0x0F84)
4251 || (ch >= 0x0F86 && ch <= 0x0F8B)
4252 || (ch >= 0x0F90 && ch <= 0x0F95)
4254 || (ch >= 0x0F99 && ch <= 0x0FAD)
4255 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4258 || (ch >= 0x10A0 && ch <= 0x10C5)
4259 || (ch >= 0x10D0 && ch <= 0x10F6)
4261 || (ch >= 0x3041 && ch <= 0x3093)
4262 || (ch >= 0x309B && ch <= 0x309C)
4264 || (ch >= 0x30A1 && ch <= 0x30F6)
4265 || (ch >= 0x30FB && ch <= 0x30FC)
4267 || (ch >= 0x3105 && ch <= 0x312C)
4268 /* CJK Unified Ideographs */
4269 || (ch >= 0x4E00 && ch <= 0x9FA5)
4271 || (ch >= 0xAC00 && ch <= 0xD7A3)
4273 || (ch >= 0x0660 && ch <= 0x0669)
4274 || (ch >= 0x06F0 && ch <= 0x06F9)
4275 || (ch >= 0x0966 && ch <= 0x096F)
4276 || (ch >= 0x09E6 && ch <= 0x09EF)
4277 || (ch >= 0x0A66 && ch <= 0x0A6F)
4278 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4279 || (ch >= 0x0B66 && ch <= 0x0B6F)
4280 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4281 || (ch >= 0x0C66 && ch <= 0x0C6F)
4282 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4283 || (ch >= 0x0D66 && ch <= 0x0D6F)
4284 || (ch >= 0x0E50 && ch <= 0x0E59)
4285 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4286 || (ch >= 0x0F20 && ch <= 0x0F33)
4287 /* Special characters */
4290 || (ch >= 0x02B0 && ch <= 0x02B8)
4292 || (ch >= 0x02BD && ch <= 0x02C1)
4293 || (ch >= 0x02D0 && ch <= 0x02D1)
4294 || (ch >= 0x02E0 && ch <= 0x02E4)
4300 || (ch >= 0x203F && ch <= 0x2040)
4303 || (ch >= 0x210A && ch <= 0x2113)
4305 || (ch >= 0x2118 && ch <= 0x211D)
4309 || (ch >= 0x212A && ch <= 0x2131)
4310 || (ch >= 0x2133 && ch <= 0x2138)
4311 || (ch >= 0x2160 && ch <= 0x2182)
4312 || (ch >= 0x3005 && ch <= 0x3007)
4313 || (ch >= 0x3021 && ch <= 0x3029)
4315 return UC_IDENTIFIER_START;
4316 return UC_IDENTIFIER_INVALID;
/* Tells whether CH is white space in Java source text: space, horizontal
   tab, form feed, line feed or carriage return.
   See The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
4328 /* The Java Language Specification, 3rd edition, §3.8.
4329 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4330 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4332 java_ident_category (unsigned int ch)
4334 /* FIXME: Check this against Sun's JDK implementation. */
4335 if (is_category_L (ch) /* = Character.isLetter(ch) */
4336 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4337 || is_category_Sc (ch) /* currency symbol */
4338 || is_category_Pc (ch) /* connector punctuation */
4340 return UC_IDENTIFIER_START;
4341 if (is_category_Nd (ch) /* digit */
4342 || is_category_Mc (ch) /* combining mark */
4343 || is_category_Mn (ch) /* non-spacing mark */
4345 return UC_IDENTIFIER_VALID;
4346 if ((ch >= 0x0000 && ch <= 0x0008)
4347 || (ch >= 0x000E && ch <= 0x001B)
4348 || (ch >= 0x007F && ch <= 0x009F)
4349 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4351 return UC_IDENTIFIER_IGNORABLE;
4352 return UC_IDENTIFIER_INVALID;
4355 /* Construction of sparse 3-level tables. */
4356 #define TABLE identsyntax_table
4357 #define ELEMENT uint8_t
4358 #define DEFAULT UC_IDENTIFIER_INVALID
4359 #define xmalloc malloc
4360 #define xrealloc realloc
4363 /* Output an identifier syntax categorization in a three-level bitmap. */
4365 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4369 struct identsyntax_table t;
4370 unsigned int level1_offset, level2_offset, level3_offset;
4372 stream = fopen (filename, "w");
4375 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4379 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4380 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4381 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4386 identsyntax_table_init (&t);
4388 for (ch = 0; ch < 0x110000; ch++)
4390 int syntaxcode = predicate (ch);
4391 if (syntaxcode != UC_IDENTIFIER_INVALID)
4392 identsyntax_table_add (&t, ch, syntaxcode);
4395 identsyntax_table_finalize (&t);
4397 /* Offsets in t.result, in memory of this process. */
4399 5 * sizeof (uint32_t);
4401 5 * sizeof (uint32_t)
4402 + t.level1_size * sizeof (uint32_t);
4404 5 * sizeof (uint32_t)
4405 + t.level1_size * sizeof (uint32_t)
4406 + (t.level2_size << t.q) * sizeof (uint32_t);
4408 for (i = 0; i < 5; i++)
4409 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4410 ((uint32_t *) t.result)[i]);
4411 fprintf (stream, "static const\n");
4412 fprintf (stream, "struct\n");
4413 fprintf (stream, " {\n");
4414 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4415 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4416 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4417 (1 << t.p) * 2 / 16);
4418 fprintf (stream, " }\n");
4419 fprintf (stream, "%s =\n", name);
4420 fprintf (stream, "{\n");
4421 fprintf (stream, " {");
4422 if (t.level1_size > 8)
4423 fprintf (stream, "\n ");
4424 for (i = 0; i < t.level1_size; i++)
4427 if (i > 0 && (i % 8) == 0)
4428 fprintf (stream, "\n ");
4429 offset = ((uint32_t *) (t.result + level1_offset))[i];
4431 fprintf (stream, " %5d", -1);
4433 fprintf (stream, " %5zd",
4434 (offset - level2_offset) / sizeof (uint32_t));
4435 if (i+1 < t.level1_size)
4436 fprintf (stream, ",");
4438 if (t.level1_size > 8)
4439 fprintf (stream, "\n ");
4440 fprintf (stream, " },\n");
4441 fprintf (stream, " {");
4442 if (t.level2_size << t.q > 8)
4443 fprintf (stream, "\n ");
4444 for (i = 0; i < t.level2_size << t.q; i++)
4447 if (i > 0 && (i % 8) == 0)
4448 fprintf (stream, "\n ");
4449 offset = ((uint32_t *) (t.result + level2_offset))[i];
4451 fprintf (stream, " %5d", -1);
4453 fprintf (stream, " %5zd",
4454 (offset - level3_offset) / sizeof (uint8_t));
4455 if (i+1 < t.level2_size << t.q)
4456 fprintf (stream, ",");
4458 if (t.level2_size << t.q > 8)
4459 fprintf (stream, "\n ");
4460 fprintf (stream, " },\n");
4461 /* Pack the level3 array. Each entry needs 2 bits only. */
4462 fprintf (stream, " {");
4463 if ((t.level3_size << t.p) * 2 / 16 > 8)
4464 fprintf (stream, "\n ");
4465 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4467 if (i > 0 && (i % 8) == 0)
4468 fprintf (stream, "\n ");
4469 fprintf (stream, " 0x%04x",
4470 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4478 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4479 fprintf (stream, ",");
4481 if ((t.level3_size << t.p) * 2 / 16 > 8)
4482 fprintf (stream, "\n ");
4483 fprintf (stream, " }\n");
4484 fprintf (stream, "};\n");
4486 if (ferror (stream) || fclose (stream))
4488 fprintf (stderr, "error writing to '%s'\n", filename);
4494 output_ident_properties (const char *version)
4496 #define PROPERTY(P) \
4497 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4498 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4499 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4500 PROPERTY(c_whitespace)
4501 PROPERTY(java_whitespace)
4504 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4505 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4508 /* ========================================================================= */
4510 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4511 glibc/localedata/locales/i18n file, generated by
4512 glibc/localedata/gen-unicode-ctype.c. */
4514 /* Character mappings. */
4517 to_upper (unsigned int ch)
4519 if (unicode_attributes[ch].name != NULL
4520 && unicode_attributes[ch].upper != NONE)
4521 return unicode_attributes[ch].upper;
4527 to_lower (unsigned int ch)
4529 if (unicode_attributes[ch].name != NULL
4530 && unicode_attributes[ch].lower != NONE)
4531 return unicode_attributes[ch].lower;
4537 to_title (unsigned int ch)
4539 if (unicode_attributes[ch].name != NULL
4540 && unicode_attributes[ch].title != NONE)
4541 return unicode_attributes[ch].title;
4546 /* Character class properties. */
/* A character counts as uppercase exactly when to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character counts as lowercase when to_upper changes it.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
4563 is_alpha (unsigned int ch)
4565 return (unicode_attributes[ch].name != NULL
4566 && ((unicode_attributes[ch].category[0] == 'L'
4567 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4568 <U0E2F>, <U0E46> should belong to is_punct. */
4569 && (ch != 0x0E2F) && (ch != 0x0E46))
4570 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4571 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4573 || (ch >= 0x0E34 && ch <= 0x0E3A)
4574 || (ch >= 0x0E47 && ch <= 0x0E4E)
4575 /* Avoid warning for <U0345>. */
4577 /* Avoid warnings for <U2160>..<U217F>. */
4578 || (unicode_attributes[ch].category[0] == 'N'
4579 && unicode_attributes[ch].category[1] == 'l')
4580 /* Avoid warnings for <U24B6>..<U24E9>. */
4581 || (unicode_attributes[ch].category[0] == 'S'
4582 && unicode_attributes[ch].category[1] == 'o'
4583 && strstr (unicode_attributes[ch].name, " LETTER ")
4585 /* Consider all the non-ASCII digits as alphabetic.
4586 ISO C 99 forbids us to have them in category "digit",
4587 but we want iswalnum to return true on them. */
4588 || (unicode_attributes[ch].category[0] == 'N'
4589 && unicode_attributes[ch].category[1] == 'd'
4590 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Tells whether CH belongs to the "digit" class, which here is exactly
   the ASCII digits U+0030..U+0039.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  /* Disabled alternative: every character of general category Nd.
     Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
  if (unicode_attributes[ch].name == NULL)
    return false;
  return (unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     does not: the iswdigit function tests for any wide character that
     corresponds to a decimal-digit character (as defined in 5.2.1), that
     is, one of the 10 decimal digits 0 1 2 3 4 5 6 7 8 9.  */
  return !(ch < 0x0030 || ch > 0x0039);
#endif
}
/* Tells whether CH belongs to the "outdigit" class: the ASCII digits
   U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
/* Tells whether CH belongs to the "alnum" class: the union of the "alpha"
   and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
4628 is_blank (unsigned int ch)
4630 return (ch == 0x0009 /* '\t' */
4631 /* Category Zs without mention of "<noBreak>" */
4632 || (unicode_attributes[ch].name != NULL
4633 && unicode_attributes[ch].category[0] == 'Z'
4634 && unicode_attributes[ch].category[1] == 's'
4635 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4639 is_space (unsigned int ch)
4641 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4642 should treat it like a punctuation character, not like a space. */
4643 return (ch == 0x0020 /* ' ' */
4644 || ch == 0x000C /* '\f' */
4645 || ch == 0x000A /* '\n' */
4646 || ch == 0x000D /* '\r' */
4647 || ch == 0x0009 /* '\t' */
4648 || ch == 0x000B /* '\v' */
4649 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4650 || (unicode_attributes[ch].name != NULL
4651 && unicode_attributes[ch].category[0] == 'Z'
4652 && (unicode_attributes[ch].category[1] == 'l'
4653 || unicode_attributes[ch].category[1] == 'p'
4654 || (unicode_attributes[ch].category[1] == 's'
4655 && !strstr (unicode_attributes[ch].decomposition,
4660 is_cntrl (unsigned int ch)
4662 return (unicode_attributes[ch].name != NULL
4663 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4664 /* Categories Zl and Zp */
4665 || (unicode_attributes[ch].category[0] == 'Z'
4666 && (unicode_attributes[ch].category[1] == 'l'
4667 || unicode_attributes[ch].category[1] == 'p'))));
/* Tells whether CH belongs to the "xdigit" class, which here is exactly
   the ASCII hexadecimal digits 0-9, A-F and a-f.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  /* Disabled alternative: build on is_digit.  */
  if (is_digit (ch))
    return true;
  return ((ch >= 0x0041 && ch <= 0x0046)
          || (ch >= 0x0061 && ch <= 0x0066));
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     does not: the iswxdigit function tests for any wide character that
     corresponds to a hexadecimal-digit character (as defined in 6.4.4.1),
     that is, one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F.  */
  if (ch >= 0x0030 && ch <= 0x0039)   /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046)   /* A..F */
    return true;
  return (ch >= 0x0061 && ch <= 0x0066);   /* a..f */
#endif
}
4693 is_graph (unsigned int ch)
4695 return (unicode_attributes[ch].name != NULL
4696 && strcmp (unicode_attributes[ch].name, "<control>")
4701 is_print (unsigned int ch)
4703 return (unicode_attributes[ch].name != NULL
4704 && strcmp (unicode_attributes[ch].name, "<control>")
4705 /* Categories Zl and Zp */
4706 && !(unicode_attributes[ch].name != NULL
4707 && unicode_attributes[ch].category[0] == 'Z'
4708 && (unicode_attributes[ch].category[1] == 'l'
4709 || unicode_attributes[ch].category[1] == 'p')));
/* Tells whether CH belongs to the "punct" class.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  /* Disabled alternative: the characters whose general category starts
     with 'P'.  */
  if (unicode_attributes[ch].name == NULL)
    return false;
  return unicode_attributes[ch].category[0] == 'P';
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
#endif
}
4725 /* Output all properties. */
4727 output_old_ctype (const char *version)
4729 #define PROPERTY(P) \
4730 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4731 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4732 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4751 is_combining (unsigned int ch)
4753 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4754 file. In 3.0.1 it was identical to the union of the general categories
4755 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4756 PropList.txt file, so we take the latter definition. */
4757 return (unicode_attributes[ch].name != NULL
4758 && unicode_attributes[ch].category[0] == 'M'
4759 && (unicode_attributes[ch].category[1] == 'n'
4760 || unicode_attributes[ch].category[1] == 'c'
4761 || unicode_attributes[ch].category[1] == 'e'));
4765 is_combining_level3 (unsigned int ch)
4767 return is_combining (ch)
4768 && !(unicode_attributes[ch].combining[0] != '\0'
4769 && unicode_attributes[ch].combining[0] != '0'
4770 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four uppercase hexadecimal digits for values below 0x10000, and
   "<Uxxxxxxxx>" with eight digits otherwise.
   The result points to a static buffer that is overwritten by the next
   call, so a caller that needs two symbols at once must copy the first
   one before requesting the second.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded formatting: the buffer is sized for at most eight hexadecimal
     digits, so never write past it even if unsigned int is wider.  */
  if (i < 0x10000)
    snprintf (buf, sizeof (buf), "<U%04X>", i);
  else
    snprintf (buf, sizeof (buf), "<U%08X>", i);
  return buf;
}
/* Return the UCS symbol range string for an interval of Unicode
   characters: the symbol of LOW, then "..", then the symbol of HIGH.
   The result points to a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  size_t len;

  /* ucs_symbol hands back its own static buffer, so the two endpoints
     must be formatted one after the other; passing both calls to a single
     snprintf would print the same symbol twice.  */
  snprintf (buf, sizeof (buf), "%s..", ucs_symbol (low));
  len = strlen (buf);
  snprintf (buf + len, sizeof (buf) - len, "%s", ucs_symbol (high));
  return buf;
}
4795 /* Output a character class (= property) table. */
4798 output_charclass (FILE *stream, const char *classname,
4799 bool (*func) (unsigned int))
4801 char table[0x110000];
4803 bool need_semicolon;
4804 const int max_column = 75;
4807 for (i = 0; i < 0x110000; i++)
4808 table[i] = (int) func (i);
4810 fprintf (stream, "%s ", classname);
4811 need_semicolon = false;
4813 for (i = 0; i < 0x110000; )
4819 unsigned int low, high;
4825 while (i < 0x110000 && table[i]);
4829 strcpy (buf, ucs_symbol (low));
4831 strcpy (buf, ucs_symbol_range (low, high));
4835 fprintf (stream, ";");
4839 if (column + strlen (buf) > max_column)
4841 fprintf (stream, "/\n ");
4845 fprintf (stream, "%s", buf);
4846 column += strlen (buf);
4847 need_semicolon = true;
4850 fprintf (stream, "\n");
4853 /* Output a character mapping table. */
4856 output_charmap (FILE *stream, const char *mapname,
4857 unsigned int (*func) (unsigned int))
4859 char table[0x110000];
4861 bool need_semicolon;
4862 const int max_column = 75;
4865 for (i = 0; i < 0x110000; i++)
4866 table[i] = (func (i) != i);
4868 fprintf (stream, "%s ", mapname);
4869 need_semicolon = false;
4871 for (i = 0; i < 0x110000; i++)
4877 strcat (buf, ucs_symbol (i));
4879 strcat (buf, ucs_symbol (func (i)));
4884 fprintf (stream, ";");
4888 if (column + strlen (buf) > max_column)
4890 fprintf (stream, "/\n ");
4894 fprintf (stream, "%s", buf);
4895 column += strlen (buf);
4896 need_semicolon = true;
4898 fprintf (stream, "\n");
4901 /* Output the width table. */
4904 output_widthmap (FILE *stream)
4908 /* Output the tables to the given file. */
4911 output_tables (const char *filename, const char *version)
4916 stream = fopen (filename, "w");
4919 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4923 fprintf (stream, "escape_char /\n");
4924 fprintf (stream, "comment_char %%\n");
4925 fprintf (stream, "\n");
4926 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4928 fprintf (stream, "\n");
4930 fprintf (stream, "LC_IDENTIFICATION\n");
4931 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4932 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4933 fprintf (stream, "address \"\"\n");
4934 fprintf (stream, "contact \"\"\n");
4935 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4936 fprintf (stream, "tel \"\"\n");
4937 fprintf (stream, "fax \"\"\n");
4938 fprintf (stream, "language \"\"\n");
4939 fprintf (stream, "territory \"Earth\"\n");
4940 fprintf (stream, "revision \"%s\"\n", version);
4945 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4946 fprintf (stream, "date \"%s\"\n", date);
4948 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4949 fprintf (stream, "END LC_IDENTIFICATION\n");
4950 fprintf (stream, "\n");
4952 /* Verifications. */
4953 for (ch = 0; ch < 0x110000; ch++)
4955 /* toupper restriction: "Only characters specified for the keywords
4956 lower and upper shall be specified." */
4957 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4959 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4960 ucs_symbol (ch), ch, to_upper (ch));
4962 /* tolower restriction: "Only characters specified for the keywords
4963 lower and upper shall be specified." */
4964 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4966 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4967 ucs_symbol (ch), ch, to_lower (ch));
4969 /* alpha restriction: "Characters classified as either upper or lower
4970 shall automatically belong to this class." */
4971 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4972 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4974 /* alpha restriction: "No character specified for the keywords cntrl,
4975 digit, punct or space shall be specified." */
4976 if (is_alpha (ch) && is_cntrl (ch))
4977 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4978 if (is_alpha (ch) && is_digit (ch))
4979 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4980 if (is_alpha (ch) && is_punct (ch))
4981 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4982 if (is_alpha (ch) && is_space (ch))
4983 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4985 /* space restriction: "No character specified for the keywords upper,
4986 lower, alpha, digit, graph or xdigit shall be specified."
4987 upper, lower, alpha already checked above. */
4988 if (is_space (ch) && is_digit (ch))
4989 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4990 if (is_space (ch) && is_graph (ch))
4991 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4992 if (is_space (ch) && is_xdigit (ch))
4993 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4995 /* cntrl restriction: "No character specified for the keywords upper,
4996 lower, alpha, digit, punct, graph, print or xdigit shall be
4997 specified." upper, lower, alpha already checked above. */
4998 if (is_cntrl (ch) && is_digit (ch))
4999 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5000 if (is_cntrl (ch) && is_punct (ch))
5001 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5002 if (is_cntrl (ch) && is_graph (ch))
5003 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_print (ch))
5005 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5006 if (is_cntrl (ch) && is_xdigit (ch))
5007 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5009 /* punct restriction: "No character specified for the keywords upper,
5010 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5011 be specified." upper, lower, alpha, cntrl already checked above. */
5012 if (is_punct (ch) && is_digit (ch))
5013 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5014 if (is_punct (ch) && is_xdigit (ch))
5015 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5016 if (is_punct (ch) && (ch == 0x0020))
5017 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5019 /* graph restriction: "No character specified for the keyword cntrl
5020 shall be specified." Already checked above. */
5022 /* print restriction: "No character specified for the keyword cntrl
5023 shall be specified." Already checked above. */
5025 /* graph - print relation: differ only in the <space> character.
5026 How is this possible if there are more than one space character?!
5027 I think susv2/xbd/locale.html should speak of "space characters",
5028 not "space character". */
5029 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5031 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5032 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5034 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5037 fprintf (stream, "LC_CTYPE\n");
5038 output_charclass (stream, "upper", is_upper);
5039 output_charclass (stream, "lower", is_lower);
5040 output_charclass (stream, "alpha", is_alpha);
5041 output_charclass (stream, "digit", is_digit);
5042 output_charclass (stream, "outdigit", is_outdigit);
5043 output_charclass (stream, "blank", is_blank);
5044 output_charclass (stream, "space", is_space);
5045 output_charclass (stream, "cntrl", is_cntrl);
5046 output_charclass (stream, "punct", is_punct);
5047 output_charclass (stream, "xdigit", is_xdigit);
5048 output_charclass (stream, "graph", is_graph);
5049 output_charclass (stream, "print", is_print);
5050 output_charclass (stream, "class \"combining\";", is_combining);
5051 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5052 output_charmap (stream, "toupper", to_upper);
5053 output_charmap (stream, "tolower", to_lower);
5054 output_charmap (stream, "map \"totitle\";", to_title);
5055 output_widthmap (stream);
5056 fprintf (stream, "END LC_CTYPE\n");
5058 if (ferror (stream) || fclose (stream))
5060 fprintf (stderr, "error writing to '%s'\n", filename);
5067 /* ========================================================================= */
5069 /* The width property from the EastAsianWidth.txt file.
5070 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
5071 const char * unicode_width[0x110000];
5073 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5076 fill_width (const char *width_filename)
5080 char field0[FIELDLEN];
5081 char field1[FIELDLEN];
5082 char field2[FIELDLEN];
5085 for (i = 0; i < 0x110000; i++)
5086 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5088 stream = fopen (width_filename, "r");
5091 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5106 do c = getc (stream); while (c != EOF && c != '\n');
5110 n = getfield (stream, field0, ';');
5111 n += getfield (stream, field1, ' ');
5112 n += getfield (stream, field2, '\n');
5117 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5120 i = strtoul (field0, NULL, 16);
5121 if (strstr (field0, "..") != NULL)
5123 /* Deal with a range. */
5124 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5126 unicode_width[i] = strdup (field1);
5130 /* Single character line. */
5131 unicode_width[i] = strdup (field1);
5134 if (ferror (stream) || fclose (stream))
5136 fprintf (stderr, "error reading from '%s'\n", width_filename);
5141 /* Line breaking classification. */
/* Each LBP_* constant serves two purposes in this file: it is the bit
   position of its class in the mask built by get_lbp (1 << LBP_xx), and it
   is the plain per-character value stored in unicode_org_lbp[].
   The values 0..23 and 24..31 are interleaved below; the declaration order
   is kept as is and only the numeric value matters.
   NOTE(review): LBP_XX is 31, so "1 << LBP_XX" shifts a signed 1 into the
   sign bit of a 32-bit int, which ISO C leaves undefined; an unsigned
   constant (1U << ...) would avoid that.  */
5145 /* Values >= 24 are resolved at run time. */
5146 LBP_BK = 24, /* mandatory break */
5147 /*LBP_CR, carriage return - not used here because it's a DOSism */
5148 /*LBP_LF, line feed - not used here because it's a DOSism */
5149 LBP_CM = 25, /* attached characters and combining marks */
5150 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
5151 /*LBP_SG, surrogates - not used here because they are not characters */
5152 LBP_WJ = 0, /* word joiner */
5153 LBP_ZW = 26, /* zero width space */
5154 LBP_GL = 1, /* non-breaking (glue) */
5155 LBP_SP = 27, /* space */
5156 LBP_B2 = 2, /* break opportunity before and after */
5157 LBP_BA = 3, /* break opportunity after */
5158 LBP_BB = 4, /* break opportunity before */
5159 LBP_HY = 5, /* hyphen */
5160 LBP_CB = 28, /* contingent break opportunity */
5161 LBP_CL = 6, /* closing punctuation */
5162 LBP_EX = 7, /* exclamation/interrogation */
5163 LBP_IN = 8, /* inseparable */
5164 LBP_NS = 9, /* non starter */
5165 LBP_OP = 10, /* opening punctuation */
5166 LBP_QU = 11, /* ambiguous quotation */
5167 LBP_IS = 12, /* infix separator (numeric) */
5168 LBP_NU = 13, /* numeric */
5169 LBP_PO = 14, /* postfix (numeric) */
5170 LBP_PR = 15, /* prefix (numeric) */
5171 LBP_SY = 16, /* symbols allowing breaks */
5172 LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
5173 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
5174 LBP_H2 = 18, /* Hangul LV syllable */
5175 LBP_H3 = 19, /* Hangul LVT syllable */
5176 LBP_ID = 20, /* ideographic */
5177 LBP_JL = 21, /* Hangul L Jamo */
5178 LBP_JV = 22, /* Hangul V Jamo */
5179 LBP_JT = 23, /* Hangul T Jamo */
5180 LBP_SA = 30, /* complex context (South East Asian) */
5181 LBP_XX = 31 /* unknown */
5184 /* Returns the line breaking classification for ch, as a bit mask.
Bit number LBP_xx of the result is set when ch is put into class xx.
The class tests appear to be guarded by unicode_attributes[ch].name being
non-NULL.  Most tests are explicit code point lists; others look at the
general category, at the character name, or at unicode_width[ch].
Some classes (PR, CM, ID/AI, AL/AI) are added only when certain bits set
by earlier tests are absent, so the order of the tests matters.
NOTE(review): short lines (braces, "else", a few conditions, the return
statement) are missing from this listing, so the exact nesting is not
visible here.
NOTE(review): "1 << LBP_XX" with LBP_XX == 31 overflows a 32-bit signed
int, which ISO C leaves undefined. */
5186 get_lbp (unsigned int ch)
5190 if (unicode_attributes[ch].name != NULL)
5192 /* mandatory break */
5193 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5194 || ch == 0x000C /* form feed */
5195 || ch == 0x000B /* line tabulation */
5196 || ch == 0x2028 /* LINE SEPARATOR */
5197 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5198 attr |= 1 << LBP_BK;
/* word joiner */
5200 if (ch == 0x2060 /* WORD JOINER */
5201 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5202 attr |= 1 << LBP_WJ;
5204 /* zero width space */
5205 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5206 attr |= 1 << LBP_ZW;
5208 /* non-breaking (glue) */
5209 if (ch == 0x00A0 /* NO-BREAK SPACE */
5210 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5211 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5212 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5213 || ch == 0x2007 /* FIGURE SPACE */
5214 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5215 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5216 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5217 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5218 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5219 attr |= 1 << LBP_GL;
/* space */
5222 if (ch == 0x0020 /* SPACE */)
5223 attr |= 1 << LBP_SP;
5225 /* break opportunity before and after */
5226 if (ch == 0x2014 /* EM DASH */)
5227 attr |= 1 << LBP_B2;
5229 /* break opportunity after */
5230 if (ch == 0x1680 /* OGHAM SPACE MARK */
5231 || ch == 0x2000 /* EN QUAD */
5232 || ch == 0x2001 /* EM QUAD */
5233 || ch == 0x2002 /* EN SPACE */
5234 || ch == 0x2003 /* EM SPACE */
5235 || ch == 0x2004 /* THREE-PER-EM SPACE */
5236 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5237 || ch == 0x2006 /* SIX-PER-EM SPACE */
5238 || ch == 0x2008 /* PUNCTUATION SPACE */
5239 || ch == 0x2009 /* THIN SPACE */
5240 || ch == 0x200A /* HAIR SPACE */
5241 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5242 || ch == 0x0009 /* tab */
5243 || ch == 0x00AD /* SOFT HYPHEN */
5244 || ch == 0x058A /* ARMENIAN HYPHEN */
5245 || ch == 0x2010 /* HYPHEN */
5246 || ch == 0x2012 /* FIGURE DASH */
5247 || ch == 0x2013 /* EN DASH */
5248 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5249 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5250 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5251 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5252 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5253 || ch == 0x2027 /* HYPHENATION POINT */
5254 || ch == 0x007C /* VERTICAL LINE */
5255 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5256 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5257 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5258 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5259 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5260 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5261 || ch == 0x205A /* TWO DOT PUNCTUATION */
5262 || ch == 0x205B /* FOUR DOT MARK */
5263 || ch == 0x205D /* TRICOLON */
5264 || ch == 0x205E /* VERTICAL FOUR DOTS */
5265 || ch == 0x2E19 /* PALM BRANCH */
5266 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5267 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5268 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5269 || ch == 0x2E2D /* FIVE DOT MARK */
5270 || ch == 0x2E30 /* RING POINT */
5271 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5272 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5273 || ch == 0x10102 /* AEGEAN CHECK MARK */
5274 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5275 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5276 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5277 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5278 || ch == 0x0964 /* DEVANAGARI DANDA */
5279 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5280 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5281 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5282 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5283 || ch == 0x104B /* MYANMAR SIGN SECTION */
5284 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5285 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5286 || ch == 0x17D4 /* KHMER SIGN KHAN */
5287 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5288 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5289 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5290 || ch == 0xA8CE /* SAURASHTRA DANDA */
5291 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5292 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5293 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5294 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5295 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5296 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5297 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5298 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5299 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5300 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5301 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5302 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5303 || ch == 0x1804 /* MONGOLIAN COLON */
5304 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5305 || ch == 0x1B5A /* BALINESE PANTI */
5306 || ch == 0x1B5B /* BALINESE PAMADA */
5307 || ch == 0x1B5C /* BALINESE WINDU */
5308 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5309 || ch == 0x1B60 /* BALINESE PAMENENG */
5310 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5311 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5312 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5313 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5314 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5315 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5316 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5317 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5318 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5319 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5320 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5321 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5322 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5323 || ch == 0xA60D /* VAI COMMA */
5324 || ch == 0xA60F /* VAI QUESTION MARK */
5325 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5326 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5327 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5328 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5329 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5330 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5331 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5332 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5333 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5334 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5335 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5336 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5337 attr |= 1 << LBP_BA;
5339 /* break opportunity before */
5340 if (ch == 0x00B4 /* ACUTE ACCENT */
5341 || ch == 0x1FFD /* GREEK OXIA */
5342 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5343 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5344 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5345 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5346 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5347 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5348 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5349 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5350 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5351 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5352 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5353 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5354 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5355 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5356 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5357 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5358 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5359 attr |= 1 << LBP_BB;
/* hyphen */
5362 if (ch == 0x002D /* HYPHEN-MINUS */)
5363 attr |= 1 << LBP_HY;
5365 /* contingent break opportunity */
5366 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5367 attr |= 1 << LBP_CB;
5369 /* closing punctuation */
5370 if ((unicode_attributes[ch].category[0] == 'P'
5371 && unicode_attributes[ch].category[1] == 'e')
5372 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5373 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5374 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5375 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5376 || ch == 0xFE50 /* SMALL COMMA */
5377 || ch == 0xFE52 /* SMALL FULL STOP */
5378 || ch == 0xFF0C /* FULLWIDTH COMMA */
5379 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5380 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5381 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5382 attr |= 1 << LBP_CL;
5384 /* exclamation/interrogation */
5385 if (ch == 0x0021 /* EXCLAMATION MARK */
5386 || ch == 0x003F /* QUESTION MARK */
5387 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5388 || ch == 0x061B /* ARABIC SEMICOLON */
5389 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5390 || ch == 0x061F /* ARABIC QUESTION MARK */
5391 || ch == 0x06D4 /* ARABIC FULL STOP */
5392 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5393 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5394 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5395 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5396 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5397 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5398 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5399 || ch == 0x1802 /* MONGOLIAN COMMA */
5400 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5401 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5402 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5403 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5404 || ch == 0x1945 /* LIMBU QUESTION MARK */
5405 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5406 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5407 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5408 || ch == 0x2CFE /* COPTIC FULL STOP */
5409 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5411 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5413 || ch == 0xA60E /* VAI FULL STOP */
5414 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5415 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5416 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5417 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5418 || ch == 0xFE56 /* SMALL QUESTION MARK */
5419 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5420 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5421 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5422 attr |= 1 << LBP_EX;
/* inseparable */
5425 if (ch == 0x2024 /* ONE DOT LEADER */
5426 || ch == 0x2025 /* TWO DOT LEADER */
5427 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5428 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5429 attr |= 1 << LBP_IN;
/* non starter */
5432 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5433 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5434 || ch == 0x203D /* INTERROBANG */
5435 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5436 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5437 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5438 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5439 || ch == 0x301C /* WAVE DASH */
5440 || ch == 0x303C /* MASU MARK */
5441 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5442 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5443 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5444 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5445 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5446 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5447 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5448 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5449 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5450 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5451 || ch == 0xA015 /* YI SYLLABLE WU */
5452 || ch == 0xFE54 /* SMALL SEMICOLON */
5453 || ch == 0xFE55 /* SMALL COLON */
5454 || ch == 0xFF1A /* FULLWIDTH COLON */
5455 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5456 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5457 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5458 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5459 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5460 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5461 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5462 attr |= 1 << LBP_NS;
5464 /* opening punctuation */
5465 if ((unicode_attributes[ch].category[0] == 'P'
5466 && unicode_attributes[ch].category[1] == 's')
5467 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5468 || ch == 0x00BF /* INVERTED QUESTION MARK */
5469 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5470 attr |= 1 << LBP_OP;
5472 /* ambiguous quotation */
5473 if ((unicode_attributes[ch].category[0] == 'P'
5474 && (unicode_attributes[ch].category[1] == 'f'
5475 || unicode_attributes[ch].category[1] == 'i'))
5476 || ch == 0x0022 /* QUOTATION MARK */
5477 || ch == 0x0027 /* APOSTROPHE */
5478 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5479 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5480 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5481 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5482 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5483 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5484 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5485 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5486 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5487 || ch == 0x2E0B /* RAISED SQUARE */)
5488 attr |= 1 << LBP_QU;
5490 /* infix separator (numeric) */
5491 if (ch == 0x002C /* COMMA */
5492 || ch == 0x002E /* FULL STOP */
5493 || ch == 0x003A /* COLON */
5494 || ch == 0x003B /* SEMICOLON */
5495 || ch == 0x037E /* GREEK QUESTION MARK */
5496 || ch == 0x0589 /* ARMENIAN FULL STOP */
5497 || ch == 0x060C /* ARABIC COMMA */
5498 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5499 || ch == 0x07F8 /* NKO COMMA */
5500 || ch == 0x2044 /* FRACTION SLASH */
5501 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5502 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5503 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5504 attr |= 1 << LBP_IS;
/* numeric: decimal digits (category Nd) whose name lacks "FULLWIDTH",
   plus two Arabic separators */
5507 if ((unicode_attributes[ch].category[0] == 'N'
5508 && unicode_attributes[ch].category[1] == 'd'
5509 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5510 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5511 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5512 attr |= 1 << LBP_NU;
5514 /* postfix (numeric) */
5515 if (ch == 0x0025 /* PERCENT SIGN */
5516 || ch == 0x00A2 /* CENT SIGN */
5517 || ch == 0x00B0 /* DEGREE SIGN */
5518 || ch == 0x060B /* AFGHANI SIGN */
5519 || ch == 0x066A /* ARABIC PERCENT SIGN */
5520 || ch == 0x2030 /* PER MILLE SIGN */
5521 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5522 || ch == 0x2032 /* PRIME */
5523 || ch == 0x2033 /* DOUBLE PRIME */
5524 || ch == 0x2034 /* TRIPLE PRIME */
5525 || ch == 0x2035 /* REVERSED PRIME */
5526 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5527 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5528 || ch == 0x20A7 /* PESETA SIGN */
5529 || ch == 0x2103 /* DEGREE CELSIUS */
5530 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5531 || ch == 0xFDFC /* RIAL SIGN */
5532 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5533 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5534 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5535 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5536 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5537 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5538 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5539 attr |= 1 << LBP_PO;
5541 /* prefix (numeric) */
5542 if ((unicode_attributes[ch].category[0] == 'S'
5543 && unicode_attributes[ch].category[1] == 'c')
5544 || ch == 0x002B /* PLUS SIGN */
5545 || ch == 0x005C /* REVERSE SOLIDUS */
5546 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5547 || ch == 0x2116 /* NUMERO SIGN */
5548 || ch == 0x2212 /* MINUS SIGN */
5549 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* PO wins over PR: characters already marked postfix above (several of
   them are currency symbols) are not also marked prefix. */
5550 if (!(attr & (1 << LBP_PO)))
5551 attr |= 1 << LBP_PR;
5553 /* symbols allowing breaks */
5554 if (ch == 0x002F /* SOLIDUS */)
5555 attr |= 1 << LBP_SY;
/* Hangul LV syllable: every 28th code point of U+AC00..U+D7A3 */
5557 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5558 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: the remaining code points of that range */
5560 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5561 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5563 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5564 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5566 if (ch >= 0x1160 && ch <= 0x11A2)
5567 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5569 if (ch >= 0x11A8 && ch <= 0x11F9)
5570 attr |= 1 << LBP_JT;
5572 /* complex context (South East Asian) */
/* Requires both a matching category (or one of the four listed extras)
   and membership in one of the four code point ranges at the end. */
5573 if (((unicode_attributes[ch].category[0] == 'C'
5574 && unicode_attributes[ch].category[1] == 'f')
5575 || (unicode_attributes[ch].category[0] == 'L'
5576 && (unicode_attributes[ch].category[1] == 'm'
5577 || unicode_attributes[ch].category[1] == 'o'))
5578 || (unicode_attributes[ch].category[0] == 'M'
5579 && (unicode_attributes[ch].category[1] == 'c'
5580 || unicode_attributes[ch].category[1] == 'n'))
5581 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5582 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5583 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5584 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5585 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5586 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5587 || (ch >= 0x1000 && ch <= 0x109F)
5588 || (ch >= 0x1780 && ch <= 0x17FF)
5589 || (ch >= 0x1950 && ch <= 0x19DF)))
5590 attr |= 1 << LBP_SA;
5592 /* attached characters and combining marks */
5593 if ((unicode_attributes[ch].category[0] == 'M'
5594 && (unicode_attributes[ch].category[1] == 'c'
5595 || unicode_attributes[ch].category[1] == 'e'
5596 || unicode_attributes[ch].category[1] == 'n'))
5597 || (unicode_attributes[ch].category[0] == 'C'
5598 && (unicode_attributes[ch].category[1] == 'c'
5599 || unicode_attributes[ch].category[1] == 'f')))
/* CM is added only when none of BK, BA, GL, SA, WJ, ZW was set above. */
5600 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5601 attr |= 1 << LBP_CM;
/* ideographic */
5604 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5605 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5606 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5607 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5608 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5609 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5610 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5611 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5612 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5613 || ch == 0xFE62 /* SMALL PLUS SIGN */
5614 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5615 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5616 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5617 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5618 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5619 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5620 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5621 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5622 || (ch >= 0x3000 && ch <= 0x33FF
5623 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5624 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5625 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5626 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5627 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5628 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5629 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5630 || ch == 0xFE45 /* SESAME DOT */
5631 || ch == 0xFE46 /* WHITE SESAME DOT */
5632 || ch == 0xFE49 /* DASHED OVERLINE */
5633 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5634 || ch == 0xFE4B /* WAVY OVERLINE */
5635 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5636 || ch == 0xFE4D /* DASHED LOW LINE */
5637 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5638 || ch == 0xFE4F /* WAVY LOW LINE */
5639 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5640 || ch == 0xFE58 /* SMALL EM DASH */
5641 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5642 || ch == 0xFE60 /* SMALL AMPERSAND */
5643 || ch == 0xFE61 /* SMALL ASTERISK */
5644 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5645 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5646 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5647 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5648 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5649 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5650 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5651 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5652 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5653 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5654 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5655 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5656 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5657 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5658 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5659 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5660 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5661 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5662 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5663 || ch == 0xFF5E /* FULLWIDTH TILDE */
5664 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5665 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5666 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Characters already classified NS or CM are left alone.
   NOTE(review): the lines that close the AI condition below and separate
   the AI and ID assignments are not visible in this listing. */
5667 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5669 /* ambiguous (ideograph) ? */
5670 if ((unicode_width[ch] != NULL
5671 && unicode_width[ch][0] == 'A'
5673 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5674 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5675 attr |= 1 << LBP_AI;
5677 attr |= 1 << LBP_ID;
5680 /* ordinary alphabetic and symbol characters */
5681 if ((unicode_attributes[ch].category[0] == 'L'
5682 && (unicode_attributes[ch].category[1] == 'u'
5683 || unicode_attributes[ch].category[1] == 'l'
5684 || unicode_attributes[ch].category[1] == 't'
5685 || unicode_attributes[ch].category[1] == 'm'
5686 || unicode_attributes[ch].category[1] == 'o'))
5687 || (unicode_attributes[ch].category[0] == 'S'
5688 && (unicode_attributes[ch].category[1] == 'm'
5689 || unicode_attributes[ch].category[1] == 'k'
5690 || unicode_attributes[ch].category[1] == 'o'))
5691 || (unicode_attributes[ch].category[0] == 'N'
5692 && (unicode_attributes[ch].category[1] == 'l'
5693 || unicode_attributes[ch].category[1] == 'o'))
5694 || (unicode_attributes[ch].category[0] == 'P'
5695 && (unicode_attributes[ch].category[1] == 'c'
5696 || unicode_attributes[ch].category[1] == 'd'
5697 || unicode_attributes[ch].category[1] == 'o'))
5698 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5699 || ch == 0x0601 /* ARABIC SIGN SANAH */
5700 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5701 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5702 || ch == 0x06DD /* ARABIC END OF AYAH */
5703 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5704 || ch == 0x2061 /* FUNCTION APPLICATION */
5705 || ch == 0x2062 /* INVISIBLE TIMES */
5706 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5707 || ch == 0x2064 /* INVISIBLE PLUS */)
/* The alphabetic classes act as a fallback: they apply only when none of
   the more specific classes in the mask below has been assigned. */
5708 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5710 /* ambiguous (alphabetic) ? */
5711 if ((unicode_width[ch] != NULL
5712 && unicode_width[ch][0] == 'A'
5714 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5715 && ch != 0x2022 /* BULLET */
5716 && ch != 0x203E /* OVERLINE */
5717 && ch != 0x2126 /* OHM SIGN */
5718 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5719 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5720 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5721 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5722 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5723 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5724 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5725 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5727 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5728 || ch == 0x00A7 /* SECTION SIGN */
5729 || ch == 0x00A8 /* DIAERESIS */
5730 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5731 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5732 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5733 || ch == 0x00B6 /* PILCROW SIGN */
5734 || ch == 0x00B7 /* MIDDLE DOT */
5735 || ch == 0x00B8 /* CEDILLA */
5736 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5737 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5738 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5739 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5740 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5741 || ch == 0x00BF /* INVERTED QUESTION MARK */
5742 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5743 || ch == 0x00F7 /* DIVISION SIGN */
5744 || ch == 0x02C7 /* CARON */
5745 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5746 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5747 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5748 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5749 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5750 || ch == 0x02D8 /* BREVE */
5751 || ch == 0x02D9 /* DOT ABOVE */
5752 || ch == 0x02DA /* RING ABOVE */
5753 || ch == 0x02DB /* OGONEK */
5754 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5756 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5757 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5758 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5759 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5760 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5761 || ch == 0x2616 /* WHITE SHOGI PIECE */
5762 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5763 attr |= 1 << LBP_AI;
5765 attr |= 1 << LBP_AL;
/* Drop a CM bit set earlier.  NOTE(review): presumably this runs for both
   the AI and the AL outcome; the enclosing braces are not visible. */
5766 attr &= ~(1 << LBP_CM);
/* NOTE(review): presumably the fallback for characters that matched no
   class (and for unassigned code points); the guarding condition is not
   visible in this listing. */
5772 attr |= 1 << LBP_XX;
5777 /* Output the line breaking properties in a human readable format.
Walks all 0x110000 code points, calls get_lbp on each, and writes to
stream the code point as "0x%04X" followed by " LBP_xx" for every bit set
in the mask, then a newline.  Code points whose mask is exactly the XX bit
are tested for by the "attr != 1 << LBP_XX" check (presumably to skip
them; the braces are not visible in this listing). */
5779 debug_output_lbp (FILE *stream)
5783 for (i = 0; i < 0x110000; i++)
5785 int attr = get_lbp (i);
5786 if (attr != 1 << LBP_XX)
5788 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints the stringized constant name when its bit is set.
   NOTE(review): the same macro name is defined again further down with a
   different body; confirm that an #undef sits between the two. */
5789 #define PRINT_BIT(attr,bit) \
5790 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5791 PRINT_BIT(attr,LBP_BK);
5792 PRINT_BIT(attr,LBP_CM);
5793 PRINT_BIT(attr,LBP_WJ);
5794 PRINT_BIT(attr,LBP_ZW);
5795 PRINT_BIT(attr,LBP_GL);
5796 PRINT_BIT(attr,LBP_SP);
5797 PRINT_BIT(attr,LBP_B2);
5798 PRINT_BIT(attr,LBP_BA);
5799 PRINT_BIT(attr,LBP_BB);
5800 PRINT_BIT(attr,LBP_HY);
5801 PRINT_BIT(attr,LBP_CB);
5802 PRINT_BIT(attr,LBP_CL);
5803 PRINT_BIT(attr,LBP_EX);
5804 PRINT_BIT(attr,LBP_IN);
5805 PRINT_BIT(attr,LBP_NS);
5806 PRINT_BIT(attr,LBP_OP);
5807 PRINT_BIT(attr,LBP_QU);
5808 PRINT_BIT(attr,LBP_IS);
5809 PRINT_BIT(attr,LBP_NU);
5810 PRINT_BIT(attr,LBP_PO);
5811 PRINT_BIT(attr,LBP_PR);
5812 PRINT_BIT(attr,LBP_SY);
5813 PRINT_BIT(attr,LBP_AI);
5814 PRINT_BIT(attr,LBP_AL);
5815 PRINT_BIT(attr,LBP_H2);
5816 PRINT_BIT(attr,LBP_H3);
5817 PRINT_BIT(attr,LBP_ID);
5818 PRINT_BIT(attr,LBP_JL);
5819 PRINT_BIT(attr,LBP_JV);
5820 PRINT_BIT(attr,LBP_JT);
5821 PRINT_BIT(attr,LBP_SA);
5822 PRINT_BIT(attr,LBP_XX);
5824 fprintf (stream, "\n");
/* debug_output_lbrk_tables: opens filename for writing, dumps the computed
   line breaking classes into it with debug_output_lbp, and closes it.
   Open and write failures are reported on stderr.
   NOTE(review): the return type, the declaration of stream, the tests
   around the error messages and what follows them are not visible in this
   listing. */
5830 debug_output_lbrk_tables (const char *filename)
5834 stream = fopen (filename, "w");
5837 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5841 debug_output_lbp (stream);
/* fclose() is part of the check so that a failure while closing the file
   is reported as a write error too. */
5843 if (ferror (stream) || fclose (stream))
5845 fprintf (stderr, "error writing to '%s'\n", filename);
5850 /* The line breaking property from the LineBreak.txt file.
One entry per code point.  Each entry holds a single LBP_* value (not a
bit mask); fill_org_lbp sets every entry to LBP_XX before reading the
file. */
5851 int unicode_org_lbp[0x110000];
5853 /* Stores in unicode_org_lbp[] the line breaking property from the
5854 LineBreak.txt file.
linebreak_filename names the file to read.  Every entry starts as LBP_XX.
For each data line, field1 is matched against the known class names to
obtain an LBP_* value, which is stored for the code point or
"first..last" range given in hexadecimal by field0.
NOTE(review): the return type, local declarations, loop headers, braces,
the TRY(...) rows and the statements following each error message are not
visible in this listing. */
5856 fill_org_lbp (const char *linebreak_filename)
5860 char field0[FIELDLEN];
5861 char field1[FIELDLEN];
5862 char field2[FIELDLEN];
5865 for (i = 0; i < 0x110000; i++)
5866 unicode_org_lbp[i] = LBP_XX;
5868 stream = fopen (linebreak_filename, "r");
5871 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume the rest of the current line, up to '\n' or EOF.
   NOTE(review): the test selecting this path is not visible; presumably
   it recognizes comment lines -- confirm. */
5887 do c = getc (stream); while (c != EOF && c != '\n');
/* n accumulates the return values of the three getfield() calls. */
5891 n = getfield (stream, field0, ';');
5892 n += getfield (stream, field1, ' ');
5893 n += getfield (stream, field2, '\n');
/* NOTE(review): this call is visibly truncated in the listing; the
   argument for %d and the closing parenthesis are on a missing line. */
5898 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit stringizes the constant name and "+ 4" steps over its four
   character "LBP_" prefix, so field1 is compared with the bare class
   name (for example "BK" for LBP_BK). */
5902 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Class names without a constant of their own are folded: LF, CR and NL
   become LBP_BK, SG becomes LBP_XX. */
5937 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5938 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5939 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5940 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5943 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5944 field1, linebreak_filename, lineno);
/* field0 begins with the (first) code point in hexadecimal. */
5947 i = strtoul (field0, NULL, 16);
5948 if (strstr (field0, "..") != NULL)
5950 /* Deal with a range. */
5951 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
/* NOTE(review): the loop header iterating i up to j is not visible. */
5953 unicode_org_lbp[i] = value;
5957 /* Single character line. */
5958 unicode_org_lbp[i] = value;
5961 if (ferror (stream) || fclose (stream))
5963 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5968 /* Output the line breaking properties read from LineBreak.txt
(unicode_org_lbp[]) in a human readable format.
For a code point, writes "0x%04X", then " LBP_xx" for the constant equal
to its stored value, then a newline.  Unlike the get_lbp based dump, attr
is a single value here and is compared with ==, not tested as a bit.
NOTE(review): any condition that filters code points before printing is
not visible in this listing. */
5970 debug_output_org_lbp (FILE *stream)
5974 for (i = 0; i < 0x110000; i++)
5976 int attr = unicode_org_lbp[i];
5979 fprintf (stream, "0x%04X", i);
5980 #define PRINT_BIT(attr,bit) \
5981 if (attr == bit) fprintf (stream, " " #bit);
5982 PRINT_BIT(attr,LBP_BK);
5983 PRINT_BIT(attr,LBP_CM);
5984 PRINT_BIT(attr,LBP_WJ);
5985 PRINT_BIT(attr,LBP_ZW);
5986 PRINT_BIT(attr,LBP_GL);
5987 PRINT_BIT(attr,LBP_SP);
5988 PRINT_BIT(attr,LBP_B2);
5989 PRINT_BIT(attr,LBP_BA);
5990 PRINT_BIT(attr,LBP_BB);
5991 PRINT_BIT(attr,LBP_HY);
5992 PRINT_BIT(attr,LBP_CB);
5993 PRINT_BIT(attr,LBP_CL);
5994 PRINT_BIT(attr,LBP_EX);
5995 PRINT_BIT(attr,LBP_IN);
5996 PRINT_BIT(attr,LBP_NS);
5997 PRINT_BIT(attr,LBP_OP);
5998 PRINT_BIT(attr,LBP_QU);
5999 PRINT_BIT(attr,LBP_IS);
6000 PRINT_BIT(attr,LBP_NU);
6001 PRINT_BIT(attr,LBP_PO);
6002 PRINT_BIT(attr,LBP_PR);
6003 PRINT_BIT(attr,LBP_SY);
6004 PRINT_BIT(attr,LBP_AI);
6005 PRINT_BIT(attr,LBP_AL);
6006 PRINT_BIT(attr,LBP_H2);
6007 PRINT_BIT(attr,LBP_H3);
6008 PRINT_BIT(attr,LBP_ID);
6009 PRINT_BIT(attr,LBP_JL);
6010 PRINT_BIT(attr,LBP_JV);
6011 PRINT_BIT(attr,LBP_JT);
6012 PRINT_BIT(attr,LBP_SA);
6013 PRINT_BIT(attr,LBP_XX);
6015 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_lbp to the file FILENAME.
   The file is opened with mode "w".  A message goes to stderr when the
   file cannot be opened, and when an error is pending on the stream or
   closing it fails.  */
6021 debug_output_org_lbrk_tables (const char *filename)
6025 stream = fopen (filename, "w");
6028 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6032 debug_output_org_lbp (stream);
/* fclose is only attempted when ferror reports no pending error.  */
6034 if (ferror (stream) || fclose (stream))
6036 fprintf (stderr, "error writing to '%s'\n", filename);
6041 /* Construction of sparse 3-level tables. */
/* Parameters for the line break property table.  TABLE supplies the name
   prefix: output_lbp calls lbp_table_init, lbp_table_add and
   lbp_table_finalize.  ELEMENT is the type of one stored value.
   NOTE(review): DEFAULT is presumably the value assumed for code points
   that are never added -- confirm against the table template.  */
6042 #define TABLE lbp_table
6043 #define ELEMENT unsigned char
6044 #define DEFAULT LBP_XX
/* xmalloc/xrealloc are mapped straight onto malloc/realloc here, i.e. no
   checking wrapper is interposed by these definitions.  */
6045 #define xmalloc malloc
6046 #define xrealloc realloc
/* Emit the line break property lookup table as C source text.
   STREAM1 receives the lbrkprop_header_N macros, the lbrkprop_t typedef
   and the extern declaration of unilbrkprop.
   STREAM2 receives the definition of unilbrkprop with its three
   brace-enclosed levels.  */
6050 output_lbp (FILE *stream1, FILE *stream2)
6054 unsigned int level1_offset, level2_offset, level3_offset;
6058 lbp_table_init (&t);
/* Feed the table: get_lbp returns a bit mask per code point, which is
   converted to the index of its single set bit before being stored.
   Code points whose mask is 1 << LBP_XX are not added.  */
6060 for (i = 0; i < 0x110000; i++)
6062 int attr = get_lbp (i);
6064 /* Now attr should contain exactly one bit. */
/* True when attr is zero or has more than one bit set.  */
6065 if (attr == 0 || ((attr & (attr - 1)) != 0))
6068 if (attr != 1 << LBP_XX)
6070 unsigned int log2_attr;
/* Empty-bodied loop: shifts attr down to 1 while counting the shifts.  */
6071 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6073 lbp_table_add (&t, i, log2_attr);
6077 lbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   five uint32_t header words, then additionally past level 1, then
   additionally past level 2.  */
6080 5 * sizeof (uint32_t);
6082 5 * sizeof (uint32_t)
6083 + t.level1_size * sizeof (uint32_t);
6085 5 * sizeof (uint32_t)
6086 + t.level1_size * sizeof (uint32_t)
6087 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Export the five header words of t.result as preprocessor macros.  */
6089 for (i = 0; i < 5; i++)
6090 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6091 ((uint32_t *) t.result)[i]);
6092 fprintf (stream1, "\n");
/* Declare the table type with the array sizes computed above.  */
6093 fprintf (stream1, "typedef struct\n");
6094 fprintf (stream1, " {\n");
6095 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6096 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6097 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6098 fprintf (stream1, " }\n");
6099 fprintf (stream1, "lbrkprop_t;\n");
6100 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6102 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6103 fprintf (stream2, "{\n");
/* Level 1, eight entries per output line.  A stored offset of 0 is
   printed as -1; any other offset is rebased to an element index
   relative to the start of level 2.  */
6104 fprintf (stream2, " {");
6105 if (t.level1_size > 8)
6106 fprintf (stream2, "\n ");
6107 for (i = 0; i < t.level1_size; i++)
6110 if (i > 0 && (i % 8) == 0)
6111 fprintf (stream2, "\n ");
6112 offset = ((uint32_t *) (t.result + level1_offset))[i];
6113 fprintf (stream2, " %5zd%s",
6114 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6115 (i+1 < t.level1_size ? "," : ""));
6117 if (t.level1_size > 8)
6118 fprintf (stream2, "\n ");
6119 fprintf (stream2, " },\n");
/* Level 2, same layout; non-zero offsets are rebased to an index
   relative to the start of level 3.  */
6120 fprintf (stream2, " {");
6121 if (t.level2_size << t.q > 8)
6122 fprintf (stream2, "\n ");
6123 for (i = 0; i < t.level2_size << t.q; i++)
6126 if (i > 0 && (i % 8) == 0)
6127 fprintf (stream2, "\n ");
6128 offset = ((uint32_t *) (t.result + level2_offset))[i];
6129 fprintf (stream2, " %5zd%s",
6130 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6131 (i+1 < t.level2_size << t.q ? "," : ""));
6133 if (t.level2_size << t.q > 8)
6134 fprintf (stream2, "\n ");
6135 fprintf (stream2, " },\n");
/* Level 3 holds the property values themselves.  They are written by
   name rather than by number.  */
6136 fprintf (stream2, " {");
6137 if (t.level3_size << t.p > 8)
6138 fprintf (stream2, "\n ");
6139 for (i = 0; i < t.level3_size << t.p; i++)
6141 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6142 const char *value_string;
/* CASE(x) maps the numeric value x to the spelling of its identifier.  */
6145 #define CASE(x) case x: value_string = #x; break;
6182 if (i > 0 && (i % 8) == 0)
6183 fprintf (stream2, "\n ");
6184 fprintf (stream2, " %s%s", value_string,
6185 (i+1 < t.level3_size << t.p ? "," : ""));
6187 if (t.level3_size << t.p > 8)
6188 fprintf (stream2, "\n ");
6189 fprintf (stream2, " }\n");
6190 fprintf (stream2, "};\n");
/* Write the line break property tables as two generated C files.
   FILENAME1 receives the declarations and FILENAME2 the table data, in
   the split that output_lbp (stream 1, stream 2) produces.  Both files
   start with the same generated-file banner and license text.  VERSION
   is the Unicode version string that is mentioned in the banner.
   Failures to open or to write a file are reported on stderr.  */
6194 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6196 const char *filenames[2];
6200 filenames[0] = filename1;
6201 filenames[1] = filename2;
/* Open both output files before anything is written.  */
6203 for (i = 0; i < 2; i++)
6205 streams[i] = fopen (filenames[i], "w");
6206 if (streams[i] == NULL)
6208 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical banner and license header for both files.  */
6213 for (i = 0; i < 2; i++)
6215 FILE *stream = streams[i];
6217 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6218 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
/* Name the generator consistently with output_wbrk_tables: the program
   that writes this file is gen-uni-tables.  */
6219 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6221 fprintf (stream, "\n");
6223 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6224 still carries the GPL header), and it's gnulib-tool which replaces the
6225 GPL header with an LGPL header. */
6226 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6227 fprintf (stream, "\n");
6228 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6229 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6230 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6231 fprintf (stream, " (at your option) any later version.\n");
6232 fprintf (stream, "\n");
6233 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6234 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6235 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6236 fprintf (stream, " GNU General Public License for more details.\n");
6237 fprintf (stream, "\n");
6238 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6239 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6240 fprintf (stream, "\n");
/* The table proper: declarations to file 1, data to file 2.  */
6243 output_lbp (streams[0], streams[1]);
/* fclose is only attempted on a stream without a pending error.  */
6245 for (i = 0; i < 2; i++)
6247 if (ferror (streams[i]) || fclose (streams[i]))
6249 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6255 /* ========================================================================= */
6257 /* Word break property. */
6259 /* Possible values of the Word_Break property.
   get_wbp combines them into a mask as 1 << WBP_xxx, whereas
   unicode_org_wbp[] stores the plain value.  */
6274 WBP_EXTENDNUMLET = 7
6277 /* Returns the word breaking property for ch, as a bit mask.
   Every value WBP_xxx that applies to CH contributes the bit
   1 << WBP_xxx.  The value is computed from the character attributes,
   properties, scripts and line break classes loaded elsewhere in this
   program, not from WordBreakProperty.txt.  */
6279 get_wbp (unsigned int ch)
/* Is there a UnicodeData.txt entry (a name) for ch?  */
6283 if (unicode_attributes[ch].name != NULL)
6286 attr |= 1 << WBP_CR;
6289 attr |= 1 << WBP_LF;
/* Newline: explicit list of line-terminating code points.  */
6291 if (ch == 0x000B || ch == 0x000C
6293 || ch == 0x2028 || ch == 0x2029)
6294 attr |= 1 << WBP_NEWLINE;
/* Extend: the Grapheme_Extend property, or general category Mc.  */
6296 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6297 || (unicode_attributes[ch].category != NULL
6298 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6299 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D.  */
6301 if (unicode_attributes[ch].category != NULL
6302 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6303 && ch != 0x200C && ch != 0x200D)
6304 attr |= 1 << WBP_FORMAT;
/* Katakana: script Katakana, plus a few individually listed code
   points.  */
6306 if ((unicode_scripts[ch] < numscripts
6307 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6308 || (ch >= 0x3031 && ch <= 0x3035)
6309 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6311 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic characters that are not ideographic, not already
   Katakana, not of line break class SA, not of script Hiragana and not
   already Extend.  */
6313 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6315 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6316 && (attr & (1 << WBP_KATAKANA)) == 0
6317 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6318 && !(unicode_scripts[ch] < numscripts
6319 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6320 && (attr & (1 << WBP_EXTEND)) == 0)
6321 attr |= 1 << WBP_ALETTER;
/* MidNumLet: explicit list of apostrophe-like and full-stop-like code
   points.  */
6323 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6324 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6325 attr |= 1 << WBP_MIDNUMLET;
/* MidLetter: explicit list of code points.  */
6327 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6328 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6329 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or one of the listed code points, minus
   U+003A, U+FE13 and U+002E, which are classified above.  */
6331 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6332 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6334 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6335 attr |= 1 << WBP_MIDNUM;
/* Numeric: derived from line break class NU.  */
6337 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6339 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6341 if (unicode_attributes[ch].category != NULL
6342 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6343 attr |= 1 << WBP_EXTENDNUMLET;
/* NOTE(review): WBP_OTHER is presumably the fallback for characters
   that matched none of the tests above -- confirm the guarding
   condition.  */
6348 attr |= 1 << WBP_OTHER;
6353 /* Output the word break property in a human readable format.
   The values are computed by get_wbp.  For every code point whose mask
   is not exactly 1 << WBP_OTHER, STREAM receives the code point as
   0x%04X followed by the name of each value whose bit is set.  */
6355 debug_output_wbp (FILE *stream)
6359 for (i = 0; i < 0x110000; i++)
6361 int attr = get_wbp (i);
6362 if (attr != 1 << WBP_OTHER)
6364 fprintf (stream, "0x%04X", i);
/* The tests are independent, so a mask with several bits set prints
   several names on the same line.  */
6365 if (attr & (1 << WBP_CR))
6366 fprintf (stream, " CR");
6367 if (attr & (1 << WBP_LF))
6368 fprintf (stream, " LF");
6369 if (attr & (1 << WBP_NEWLINE))
6370 fprintf (stream, " Newline");
6371 if (attr & (1 << WBP_EXTEND))
6372 fprintf (stream, " Extend");
6373 if (attr & (1 << WBP_FORMAT))
6374 fprintf (stream, " Format");
6375 if (attr & (1 << WBP_KATAKANA))
6376 fprintf (stream, " Katakana");
6377 if (attr & (1 << WBP_ALETTER))
6378 fprintf (stream, " ALetter");
6379 if (attr & (1 << WBP_MIDNUMLET))
6380 fprintf (stream, " MidNumLet");
6381 if (attr & (1 << WBP_MIDLETTER))
6382 fprintf (stream, " MidLetter");
6383 if (attr & (1 << WBP_MIDNUM))
6384 fprintf (stream, " MidNum");
6385 if (attr & (1 << WBP_NUMERIC))
6386 fprintf (stream, " Numeric");
6387 if (attr & (1 << WBP_EXTENDNUMLET))
6388 fprintf (stream, " ExtendNumLet");
6389 fprintf (stream, "\n");
/* Write the dump produced by debug_output_wbp to the file FILENAME.
   The file is opened with mode "w".  A message goes to stderr when the
   file cannot be opened, and when an error is pending on the stream or
   closing it fails.  */
6395 debug_output_wbrk_tables (const char *filename)
6399 stream = fopen (filename, "w");
6402 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6406 debug_output_wbp (stream);
/* fclose is only attempted when ferror reports no pending error.  */
6408 if (ferror (stream) || fclose (stream))
6410 fprintf (stderr, "error writing to '%s'\n", filename);
6415 /* The word break property from the WordBreakProperty.txt file.
   Indexed by code point (0 .. 0x10FFFF).  Each element holds a plain
   WBP_xxx value, not a bit mask; fill_org_wbp initializes every element
   to WBP_OTHER before reading the file.  */
6416 int unicode_org_wbp[0x110000];
6418 /* Stores in unicode_org_wbp[] the word break property from the
6419 WordBreakProperty.txt file. */
/* WORDBREAKPROPERTY_FILENAME is the path of that file.  Problems (open
   failure, unparsable line, unknown property name, read error) are
   reported on stderr.  */
6421 fill_org_wbp (const char *wordbreakproperty_filename)
/* Everything not mentioned in the file keeps the value WBP_OTHER.  */
6426 for (i = 0; i < 0x110000; i++)
6427 unicode_org_wbp[i] = WBP_OTHER;
6429 stream = fopen (wordbreakproperty_filename, "r");
6432 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6439 unsigned int i1, i2;
6440 char padding[200+1];
6441 char propname[200+1];
/* Read up to 200 characters of one line.  The trailing "\n" directive in
   the format consumes the line terminator together with any following
   white space.  */
6444 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and comment lines carry no data.  */
6447 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form "XXXX..YYYY ; Name", then the single code
   point form "XXXX ; Name".  padding swallows the run of blanks and
   semicolons; propname stops at the next blank.  */
6450 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6452 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6454 fprintf (stderr, "parse error in '%s'\n",
6455 wordbreakproperty_filename);
/* PROP expands to one link of an if / else-if chain that translates the
   property name into its WBP_xxx value.  The statement following the
   last PROP is the final else branch.  */
6460 #define PROP(name,value) \
6461 if (strcmp (propname, name) == 0) propvalue = value; else
6464 PROP ("Newline", WBP_NEWLINE)
6465 PROP ("Extend", WBP_EXTEND)
6466 PROP ("Format", WBP_FORMAT)
6467 PROP ("Katakana", WBP_KATAKANA)
6468 PROP ("ALetter", WBP_ALETTER)
6469 PROP ("MidNumLet", WBP_MIDNUMLET)
6470 PROP ("MidLetter", WBP_MIDLETTER)
6471 PROP ("MidNum", WBP_MIDNUM)
6472 PROP ("Numeric", WBP_NUMERIC)
6473 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6476 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6477 wordbreakproperty_filename);
/* Sanity check of the range before it is used to index the array.  */
6480 if (!(i1 <= i2 && i2 < 0x110000))
6483 for (i = i1; i <= i2; i++)
6484 unicode_org_wbp[i] = propvalue;
/* fclose is only attempted when ferror reports no pending error.  */
6487 if (ferror (stream) || fclose (stream))
6489 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6494 /* Output the word break property in a human readable format.
   The values are taken from unicode_org_wbp[], i.e. from the
   WordBreakProperty.txt file.  For every code point whose value is not
   WBP_OTHER, STREAM receives the code point as 0x%04X followed by the
   property name.  */
6496 debug_output_org_wbp (FILE *stream)
6500 for (i = 0; i < 0x110000; i++)
6502 int propvalue = unicode_org_wbp[i];
6503 if (propvalue != WBP_OTHER)
6505 fprintf (stream, "0x%04X", i);
/* PROP expands to one link of an if / else-if chain that prints the name
   belonging to propvalue.  The statement following the last PROP is the
   final else branch.  */
6506 #define PROP(name,value) \
6507 if (propvalue == value) fprintf (stream, " " name); else
6510 PROP ("Newline", WBP_NEWLINE)
6511 PROP ("Extend", WBP_EXTEND)
6512 PROP ("Format", WBP_FORMAT)
6513 PROP ("Katakana", WBP_KATAKANA)
6514 PROP ("ALetter", WBP_ALETTER)
6515 PROP ("MidNumLet", WBP_MIDNUMLET)
6516 PROP ("MidLetter", WBP_MIDLETTER)
6517 PROP ("MidNum", WBP_MIDNUM)
6518 PROP ("Numeric", WBP_NUMERIC)
6519 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
/* Marker for a value that none of the names above matched.  */
6521 fprintf (stream, " ??");
6522 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_wbp to the file FILENAME.
   The file is opened with mode "w".  A message goes to stderr when the
   file cannot be opened, and when an error is pending on the stream or
   closing it fails.  */
6528 debug_output_org_wbrk_tables (const char *filename)
6532 stream = fopen (filename, "w");
6535 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6539 debug_output_org_wbp (stream);
/* fclose is only attempted when ferror reports no pending error.  */
6541 if (ferror (stream) || fclose (stream))
6543 fprintf (stderr, "error writing to '%s'\n", filename);
6548 /* Construction of sparse 3-level tables. */
/* Parameters for the word break property table.  TABLE supplies the name
   prefix: output_wbp calls wbp_table_init, wbp_table_add and
   wbp_table_finalize.  ELEMENT is the type of one stored value.
   NOTE(review): DEFAULT is presumably the value assumed for code points
   that are never added -- confirm against the table template.  */
6549 #define TABLE wbp_table
6550 #define ELEMENT unsigned char
6551 #define DEFAULT WBP_OTHER
/* xmalloc/xrealloc are mapped straight onto malloc/realloc here, i.e. no
   checking wrapper is interposed by these definitions.  */
6552 #define xmalloc malloc
6553 #define xrealloc realloc
/* Emit the word break property lookup table as C source text on STREAM:
   the wbrkprop_header_N macros, the wbrkprop_t typedef, and the static
   definition of uniwbrkprop with its three brace-enclosed levels.  */
6557 output_wbp (FILE *stream)
6561 unsigned int level1_offset, level2_offset, level3_offset;
6565 wbp_table_init (&t);
/* Feed the table: get_wbp returns a bit mask per code point, which is
   converted to the index of its single set bit before being stored.
   Code points whose mask is 1 << WBP_OTHER are not added.  */
6567 for (i = 0; i < 0x110000; i++)
6569 int attr = get_wbp (i);
6571 /* Now attr should contain exactly one bit. */
/* True when attr is zero or has more than one bit set.  */
6572 if (attr == 0 || ((attr & (attr - 1)) != 0))
6575 if (attr != 1 << WBP_OTHER)
6577 unsigned int log2_attr;
/* Empty-bodied loop: shifts attr down to 1 while counting the shifts.  */
6578 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6580 wbp_table_add (&t, i, log2_attr);
6584 wbp_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   five uint32_t header words, then additionally past level 1, then
   additionally past level 2.  */
6587 5 * sizeof (uint32_t);
6589 5 * sizeof (uint32_t)
6590 + t.level1_size * sizeof (uint32_t);
6592 5 * sizeof (uint32_t)
6593 + t.level1_size * sizeof (uint32_t)
6594 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Export the five header words of t.result as preprocessor macros.  */
6596 for (i = 0; i < 5; i++)
6597 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6598 ((uint32_t *) t.result)[i]);
6599 fprintf (stream, "\n");
/* Declare the table type with the array sizes computed above.  */
6600 fprintf (stream, "typedef struct\n");
6601 fprintf (stream, " {\n");
6602 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6603 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6604 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6605 fprintf (stream, " }\n");
6606 fprintf (stream, "wbrkprop_t;\n");
6607 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6608 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  A stored offset of 0 is
   printed as -1; any other offset is rebased to an element index
   relative to the start of level 2.  */
6609 fprintf (stream, " {");
6610 if (t.level1_size > 8)
6611 fprintf (stream, "\n ");
6612 for (i = 0; i < t.level1_size; i++)
6615 if (i > 0 && (i % 8) == 0)
6616 fprintf (stream, "\n ");
6617 offset = ((uint32_t *) (t.result + level1_offset))[i];
6618 fprintf (stream, " %5zd%s",
6619 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6620 (i+1 < t.level1_size ? "," : ""));
6622 if (t.level1_size > 8)
6623 fprintf (stream, "\n ");
6624 fprintf (stream, " },\n");
/* Level 2, same layout; non-zero offsets are rebased to an index
   relative to the start of level 3.  */
6625 fprintf (stream, " {");
6626 if (t.level2_size << t.q > 8)
6627 fprintf (stream, "\n ");
6628 for (i = 0; i < t.level2_size << t.q; i++)
6631 if (i > 0 && (i % 8) == 0)
6632 fprintf (stream, "\n ");
6633 offset = ((uint32_t *) (t.result + level2_offset))[i];
6634 fprintf (stream, " %5zd%s",
6635 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6636 (i+1 < t.level2_size << t.q ? "," : ""));
6638 if (t.level2_size << t.q > 8)
6639 fprintf (stream, "\n ");
6640 fprintf (stream, " },\n");
/* Level 3 holds the property values themselves, written by name, four
   entries per output line.  */
6641 fprintf (stream, " {");
6642 if (t.level3_size << t.p > 4)
6643 fprintf (stream, "\n ");
6644 for (i = 0; i < t.level3_size << t.p; i++)
6646 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6647 const char *value_string;
/* CASE(x) maps the numeric value x to the spelling of its identifier.  */
6650 #define CASE(x) case x: value_string = #x; break;
6659 CASE(WBP_MIDNUMLET);
6660 CASE(WBP_MIDLETTER);
6663 CASE(WBP_EXTENDNUMLET);
6668 if (i > 0 && (i % 4) == 0)
6669 fprintf (stream, "\n ");
6670 fprintf (stream, " %s%s", value_string,
6671 (i+1 < t.level3_size << t.p ? "," : ""));
6673 if (t.level3_size << t.p > 4)
6674 fprintf (stream, "\n ");
6675 fprintf (stream, " }\n");
6676 fprintf (stream, "};\n");
/* Write the word break property table as a generated C file.
   FILENAME is the file to create (mode "w").  VERSION is the Unicode
   version string that is mentioned in the banner.  The file consists of
   the generated-file banner, the license text and the table written by
   output_wbp.  Failures to open or to write the file are reported on
   stderr.  */
6680 output_wbrk_tables (const char *filename, const char *version)
6684 stream = fopen (filename, "w");
6687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
/* This file carries the Word_Break tables, so the banner must say "Word
   breaking"; the former "Line breaking" text was carried over from
   output_lbrk_tables.  */
6692 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
6693 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6695 fprintf (stream, "\n");
6697 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6698 still carries the GPL header), and it's gnulib-tool which replaces the
6699 GPL header with an LGPL header. */
6700 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
6701 fprintf (stream, "\n");
6702 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6703 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6704 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6705 fprintf (stream, " (at your option) any later version.\n");
6706 fprintf (stream, "\n");
6707 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6708 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6709 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6710 fprintf (stream, " GNU General Public License for more details.\n");
6711 fprintf (stream, "\n");
6712 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6713 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6714 fprintf (stream, "\n");
/* The table proper.  */
6716 output_wbp (stream);
/* fclose is only attempted when ferror reports no pending error.  */
6718 if (ferror (stream) || fclose (stream))
6720 fprintf (stderr, "error writing to '%s'\n", filename);
6725 /* ========================================================================= */
6727 /* Output the test for a simple character mapping table to the given file.
   FILENAME is the C file to create (mode "w").  FUNCTION_NAME is the
   name of the function under test; it is written into the MAP(c) macro
   of the generated file.  FUNC is the reference mapping used to compute
   the expected { code point, value } pairs.  VERSION is the Unicode
   version string mentioned in the banner.  The generated file includes
   "test-mapping-part1.h" before the pairs and "test-mapping-part2.h"
   after them.  */
6730 output_simple_mapping_test (const char *filename,
6731 const char *function_name,
6732 unsigned int (*func) (unsigned int),
6733 const char *version)
6739 stream = fopen (filename, "w");
6742 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6746 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6747 fprintf (stream, "/* Test the Unicode character mapping functions.\n");
6748 fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
6749 fprintf (stream, "\n");
6750 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6751 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6752 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6753 fprintf (stream, " (at your option) any later version.\n");
6754 fprintf (stream, "\n");
6755 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6756 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6757 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6758 fprintf (stream, " GNU General Public License for more details.\n");
6759 fprintf (stream, "\n");
6760 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6761 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6762 fprintf (stream, "\n");
/* Name the program that really writes this file (gen-uni-tables), as
   output_wbrk_tables does, instead of the stale "gen-case.c".  */
6763 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6765 fprintf (stream, "\n");
6766 fprintf (stream, "#include \"test-mapping-part1.h\"\n");
6767 fprintf (stream, "\n");
/* One initializer per reported code point; entries are separated by
   ",\n" so that the last one has no trailing comma.  */
6770 for (ch = 0; ch < 0x110000; ch++)
6772 unsigned int value = func (ch);
6777 fprintf (stream, ",\n");
6778 fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
6783 fprintf (stream, "\n");
6785 fprintf (stream, "\n");
6786 fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
6787 fprintf (stream, "#include \"test-mapping-part2.h\"\n");
/* fclose is only attempted when ferror reports no pending error.  */
6789 if (ferror (stream) || fclose (stream))
6791 fprintf (stderr, "error writing to '%s'\n", filename);
6796 /* Construction of sparse 3-level tables. */
/* Parameters for the character mapping table.  TABLE supplies the name
   prefix: output_simple_mapping uses struct mapping_table together with
   mapping_table_init, mapping_table_add and mapping_table_finalize.
   ELEMENT is the type of one stored value, a signed 32-bit difference
   between mapped character and character.  */
6797 #define TABLE mapping_table
6798 #define ELEMENT int32_t
/* xmalloc/xrealloc are mapped straight onto malloc/realloc here, i.e. no
   checking wrapper is interposed by these definitions.  */
6800 #define xmalloc malloc
6801 #define xrealloc realloc
6804 /* Output a simple character mapping table to the given file. */
6807 output_simple_mapping (const char *filename,
6808 unsigned int (*func) (unsigned int),
6809 const char *version)
6813 struct mapping_table t;
6814 unsigned int level1_offset, level2_offset, level3_offset;
6816 stream = fopen (filename, "w");
6819 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6823 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6824 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
6825 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
6830 mapping_table_init (&t);
6832 for (ch = 0; ch < 0x110000; ch++)
6834 int value = (int) func (ch) - (int) ch;
6836 mapping_table_add (&t, ch, value);
6839 mapping_table_finalize (&t);
6841 /* Offsets in t.result, in memory of this process. */
6843 5 * sizeof (uint32_t);
6845 5 * sizeof (uint32_t)
6846 + t.level1_size * sizeof (uint32_t);
6848 5 * sizeof (uint32_t)
6849 + t.level1_size * sizeof (uint32_t)
6850 + (t.level2_size << t.q) * sizeof (uint32_t);
6852 for (i = 0; i < 5; i++)
6853 fprintf (stream, "#define mapping_header_%d %d\n", i,
6854 ((uint32_t *) t.result)[i]);
6855 fprintf (stream, "static const\n");
6856 fprintf (stream, "struct\n");
6857 fprintf (stream, " {\n");
6858 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6859 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6860 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
6861 fprintf (stream, " }\n");
6862 fprintf (stream, "u_mapping =\n");
6863 fprintf (stream, "{\n");
6864 fprintf (stream, " {");
6865 if (t.level1_size > 8)
6866 fprintf (stream, "\n ");
6867 for (i = 0; i < t.level1_size; i++)
6870 if (i > 0 && (i % 8) == 0)
6871 fprintf (stream, "\n ");
6872 offset = ((uint32_t *) (t.result + level1_offset))[i];
6874 fprintf (stream, " %5d", -1);
6876 fprintf (stream, " %5zd",
6877 (offset - level2_offset) / sizeof (uint32_t));
6878 if (i+1 < t.level1_size)
6879 fprintf (stream, ",");
6881 if (t.level1_size > 8)
6882 fprintf (stream, "\n ");
6883 fprintf (stream, " },\n");
6884 fprintf (stream, " {");
6885 if (t.level2_size << t.q > 8)
6886 fprintf (stream, "\n ");
6887 for (i = 0; i < t.level2_size << t.q; i++)
6890 if (i > 0 && (i % 8) == 0)
6891 fprintf (stream, "\n ");
6892 offset = ((uint32_t *) (t.result + level2_offset))[i];
6894 fprintf (stream, " %5d", -1);
6896 fprintf (stream, " %5zd",
6897 (offset - level3_offset) / sizeof (int32_t));
6898 if (i+1 < t.level2_size << t.q)
6899 fprintf (stream, ",");
6901 if (t.level2_size << t.q > 8)
6902 fprintf (stream, "\n ");
6903 fprintf (stream, " },\n");
6904 fprintf (stream, " {");
6905 if (t.level3_size << t.p > 8)
6906 fprintf (stream, "\n ");
6907 for (i = 0; i < t.level3_size << t.p; i++)
6909 if (i > 0 && (i % 8) == 0)
6910 fprintf (stream, "\n ");
6911 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
6912 if (i+1 < t.level3_size << t.p)
6913 fprintf (stream, ",");
6915 if (t.level3_size << t.p > 8)
6916 fprintf (stream, "\n ");
6917 fprintf (stream, " }\n");
6918 fprintf (stream, "};\n");
6920 if (ferror (stream) || fclose (stream))
6922 fprintf (stderr, "error writing to '%s'\n", filename);
6927 /* ========================================================================= */
/* Program entry point.  The command line names, in this order, the
   files UnicodeData.txt, PropList.txt, DerivedCoreProperties.txt,
   Scripts.txt, Blocks.txt, PropList-3.0.1.txt, EastAsianWidth.txt,
   LineBreak.txt and WordBreakProperty.txt, followed by the Unicode
   version (see the usage message).  All input tables are loaded first;
   then every generated file is written.  The output paths are relative
   (unictype/, unilbrk/, uniwbrk/, unicase/, ../tests/...), so they are
   resolved against the current working directory.  */
6930 main (int argc, char * argv[])
6932 const char *unicodedata_filename;
6933 const char *proplist_filename;
6934 const char *derivedproplist_filename;
6935 const char *scripts_filename;
6936 const char *blocks_filename;
6937 const char *proplist30_filename;
6938 const char *eastasianwidth_filename;
6939 const char *linebreak_filename;
6940 const char *wordbreakproperty_filename;
6941 const char *version;
6945 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt version\n",
/* Positional arguments, in the order given in the usage message.  */
6950 unicodedata_filename = argv[1];
6951 proplist_filename = argv[2];
6952 derivedproplist_filename = argv[3];
6953 scripts_filename = argv[4];
6954 blocks_filename = argv[5];
6955 proplist30_filename = argv[6];
6956 eastasianwidth_filename = argv[7];
6957 linebreak_filename = argv[8];
6958 wordbreakproperty_filename = argv[9];
/* Phase 1: load every input table into memory.  fill_properties is run
   on both PropList.txt and DerivedCoreProperties.txt, after
   clear_properties.  */
6961 fill_attributes (unicodedata_filename);
6962 clear_properties ();
6963 fill_properties (proplist_filename);
6964 fill_properties (derivedproplist_filename);
6965 fill_properties30 (proplist30_filename);
6966 fill_scripts (scripts_filename);
6967 fill_blocks (blocks_filename);
6968 fill_width (eastasianwidth_filename);
6969 fill_org_lbp (linebreak_filename);
6970 fill_org_wbp (wordbreakproperty_filename);
/* Phase 2: character classification tables (unictype) and the tests
   that go with them.  */
6972 output_categories (version);
6973 output_category ("unictype/categ_of.h", version);
6974 output_combclass ("unictype/combining.h", version);
6975 output_bidi_category ("unictype/bidi_of.h", version);
6976 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
6977 output_decimal_digit ("unictype/decdigit.h", version);
6978 output_digit_test ("../tests/unictype/test-digit.h", version);
6979 output_digit ("unictype/digit.h", version);
6980 output_numeric_test ("../tests/unictype/test-numeric.h", version);
6981 output_numeric ("unictype/numeric.h", version);
6982 output_mirror ("unictype/mirror.h", version);
6983 output_properties (version);
6984 output_scripts (version);
6985 output_scripts_byname (version);
6986 output_blocks (version);
6987 output_ident_properties (version);
6988 output_old_ctype (version);
/* Line break tables: the computed values and the values from
   LineBreak.txt as text dumps, then the two generated headers.  */
6990 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
6991 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
6992 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
/* Word break tables: the computed values and the values from
   WordBreakProperty.txt as text dumps, then the generated header.  */
6994 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
6995 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
6996 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
/* Case mapping: generated tests first, then the mapping tables.  */
6998 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
6999 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
7000 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
7001 output_simple_mapping ("unicase/toupper.h", to_upper, version);
7002 output_simple_mapping ("unicase/tolower.h", to_lower, version);
7003 output_simple_mapping ("unicase/totitle.h", to_title, version);
7009 * For Emacs M-x compile
7011 * compile-command: "
7012 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
7014 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
7015 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
7016 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
7017 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
7018 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
7019 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
7020 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
7021 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
7022 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \