/* Generate Unicode conforming character classification tables from a
   UnicodeData file.
   Copyright (C) 2000-2002, 2007 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

/* Usage example:
     $ gen-ctype /usr/local/share/Unidata/UnicodeData.txt \
                 /usr/local/share/Unidata/PropList.txt \
                 /usr/local/share/Unidata/DerivedCoreProperties.txt \
                 /usr/local/share/Unidata/Scripts.txt \
                 /usr/local/share/Unidata/Blocks.txt \
                 /usr/local/share/Unidata/PropList-3.0.1.txt \
                 VERSION
   where VERSION is the Unicode version string that is written into the
   generated files.
 */

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ========================================================================= */

/* Reading UnicodeData.txt.  */

/* This structure represents one line in the UnicodeData.txt file.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file.
   Indexed by code point; an entry whose name is NULL has not been filled.  */
struct unicode_attribute unicode_attributes [0x110000];
68 /* Stores in unicode_attributes[i] the values from the given fields. */
70 fill_attribute (unsigned int i,
71 const char *field1, const char *field2,
72 const char *field3, const char *field4,
73 const char *field5, const char *field6,
74 const char *field7, const char *field8,
75 const char *field9, const char *field10,
76 const char *field11, const char *field12,
77 const char *field13, const char *field14)
79 struct unicode_attribute * uni;
83 fprintf (stderr, "index too large\n");
86 if (strcmp (field2, "Cs") == 0)
87 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
89 uni = &unicode_attributes[i];
90 /* Copy the strings. */
91 uni->name = strdup (field1);
92 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
93 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
94 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
95 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
96 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
97 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
98 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
99 uni->mirrored = (field9[0] == 'Y');
100 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
101 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
102 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
103 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
104 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));

/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   was reached before DELIM).  Exits with status 1 if the field does not
   fit into BUFFER.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate in every case, so that BUFFER is a valid string even when
     the field was cut short by end of file.  */
  *buffer = '\0';

  return (c == EOF ? 0 : 1);
}
142 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
145 fill_attributes (const char *unicodedata_filename)
149 char field0[FIELDLEN];
150 char field1[FIELDLEN];
151 char field2[FIELDLEN];
152 char field3[FIELDLEN];
153 char field4[FIELDLEN];
154 char field5[FIELDLEN];
155 char field6[FIELDLEN];
156 char field7[FIELDLEN];
157 char field8[FIELDLEN];
158 char field9[FIELDLEN];
159 char field10[FIELDLEN];
160 char field11[FIELDLEN];
161 char field12[FIELDLEN];
162 char field13[FIELDLEN];
163 char field14[FIELDLEN];
166 for (i = 0; i < 0x110000; i++)
167 unicode_attributes[i].name = NULL;
169 stream = fopen (unicodedata_filename, "r");
172 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
181 n = getfield (stream, field0, ';');
182 n += getfield (stream, field1, ';');
183 n += getfield (stream, field2, ';');
184 n += getfield (stream, field3, ';');
185 n += getfield (stream, field4, ';');
186 n += getfield (stream, field5, ';');
187 n += getfield (stream, field6, ';');
188 n += getfield (stream, field7, ';');
189 n += getfield (stream, field8, ';');
190 n += getfield (stream, field9, ';');
191 n += getfield (stream, field10, ';');
192 n += getfield (stream, field11, ';');
193 n += getfield (stream, field12, ';');
194 n += getfield (stream, field13, ';');
195 n += getfield (stream, field14, '\n');
200 fprintf (stderr, "short line in '%s':%d\n",
201 unicodedata_filename, lineno);
204 i = strtoul (field0, NULL, 16);
206 && strlen (field1) >= 9
207 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
209 /* Deal with a range. */
211 n = getfield (stream, field0, ';');
212 n += getfield (stream, field1, ';');
213 n += getfield (stream, field2, ';');
214 n += getfield (stream, field3, ';');
215 n += getfield (stream, field4, ';');
216 n += getfield (stream, field5, ';');
217 n += getfield (stream, field6, ';');
218 n += getfield (stream, field7, ';');
219 n += getfield (stream, field8, ';');
220 n += getfield (stream, field9, ';');
221 n += getfield (stream, field10, ';');
222 n += getfield (stream, field11, ';');
223 n += getfield (stream, field12, ';');
224 n += getfield (stream, field13, ';');
225 n += getfield (stream, field14, '\n');
228 fprintf (stderr, "missing end range in '%s':%d\n",
229 unicodedata_filename, lineno);
232 if (!(field1[0] == '<'
233 && strlen (field1) >= 8
234 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
236 fprintf (stderr, "missing end range in '%s':%d\n",
237 unicodedata_filename, lineno);
240 field1[strlen (field1) - 7] = '\0';
241 j = strtoul (field0, NULL, 16);
243 fill_attribute (i, field1+1, field2, field3, field4, field5,
244 field6, field7, field8, field9, field10,
245 field11, field12, field13, field14);
249 /* Single character line */
250 fill_attribute (i, field1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
255 if (ferror (stream) || fclose (stream))
257 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
262 /* ========================================================================= */
264 /* General category. */
265 /* See Unicode 3.0 book, section 4.5,
269 is_category_L (unsigned int ch)
271 return (unicode_attributes[ch].name != NULL
272 && unicode_attributes[ch].category[0] == 'L');
276 is_category_Lu (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L'
280 && unicode_attributes[ch].category[1] == 'u');
284 is_category_Ll (unsigned int ch)
286 return (unicode_attributes[ch].name != NULL
287 && unicode_attributes[ch].category[0] == 'L'
288 && unicode_attributes[ch].category[1] == 'l');
292 is_category_Lt (unsigned int ch)
294 return (unicode_attributes[ch].name != NULL
295 && unicode_attributes[ch].category[0] == 'L'
296 && unicode_attributes[ch].category[1] == 't');
300 is_category_Lm (unsigned int ch)
302 return (unicode_attributes[ch].name != NULL
303 && unicode_attributes[ch].category[0] == 'L'
304 && unicode_attributes[ch].category[1] == 'm');
308 is_category_Lo (unsigned int ch)
310 return (unicode_attributes[ch].name != NULL
311 && unicode_attributes[ch].category[0] == 'L'
312 && unicode_attributes[ch].category[1] == 'o');
316 is_category_M (unsigned int ch)
318 return (unicode_attributes[ch].name != NULL
319 && unicode_attributes[ch].category[0] == 'M');
323 is_category_Mn (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M'
327 && unicode_attributes[ch].category[1] == 'n');
331 is_category_Mc (unsigned int ch)
333 return (unicode_attributes[ch].name != NULL
334 && unicode_attributes[ch].category[0] == 'M'
335 && unicode_attributes[ch].category[1] == 'c');
339 is_category_Me (unsigned int ch)
341 return (unicode_attributes[ch].name != NULL
342 && unicode_attributes[ch].category[0] == 'M'
343 && unicode_attributes[ch].category[1] == 'e');
347 is_category_N (unsigned int ch)
349 return (unicode_attributes[ch].name != NULL
350 && unicode_attributes[ch].category[0] == 'N');
354 is_category_Nd (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N'
358 && unicode_attributes[ch].category[1] == 'd');
362 is_category_Nl (unsigned int ch)
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'N'
366 && unicode_attributes[ch].category[1] == 'l');
370 is_category_No (unsigned int ch)
372 return (unicode_attributes[ch].name != NULL
373 && unicode_attributes[ch].category[0] == 'N'
374 && unicode_attributes[ch].category[1] == 'o');
378 is_category_P (unsigned int ch)
380 return (unicode_attributes[ch].name != NULL
381 && unicode_attributes[ch].category[0] == 'P');
385 is_category_Pc (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P'
389 && unicode_attributes[ch].category[1] == 'c');
393 is_category_Pd (unsigned int ch)
395 return (unicode_attributes[ch].name != NULL
396 && unicode_attributes[ch].category[0] == 'P'
397 && unicode_attributes[ch].category[1] == 'd');
401 is_category_Ps (unsigned int ch)
403 return (unicode_attributes[ch].name != NULL
404 && unicode_attributes[ch].category[0] == 'P'
405 && unicode_attributes[ch].category[1] == 's');
409 is_category_Pe (unsigned int ch)
411 return (unicode_attributes[ch].name != NULL
412 && unicode_attributes[ch].category[0] == 'P'
413 && unicode_attributes[ch].category[1] == 'e');
417 is_category_Pi (unsigned int ch)
419 return (unicode_attributes[ch].name != NULL
420 && unicode_attributes[ch].category[0] == 'P'
421 && unicode_attributes[ch].category[1] == 'i');
425 is_category_Pf (unsigned int ch)
427 return (unicode_attributes[ch].name != NULL
428 && unicode_attributes[ch].category[0] == 'P'
429 && unicode_attributes[ch].category[1] == 'f');
433 is_category_Po (unsigned int ch)
435 return (unicode_attributes[ch].name != NULL
436 && unicode_attributes[ch].category[0] == 'P'
437 && unicode_attributes[ch].category[1] == 'o');
441 is_category_S (unsigned int ch)
443 return (unicode_attributes[ch].name != NULL
444 && unicode_attributes[ch].category[0] == 'S');
448 is_category_Sm (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S'
452 && unicode_attributes[ch].category[1] == 'm');
456 is_category_Sc (unsigned int ch)
458 return (unicode_attributes[ch].name != NULL
459 && unicode_attributes[ch].category[0] == 'S'
460 && unicode_attributes[ch].category[1] == 'c');
464 is_category_Sk (unsigned int ch)
466 return (unicode_attributes[ch].name != NULL
467 && unicode_attributes[ch].category[0] == 'S'
468 && unicode_attributes[ch].category[1] == 'k');
472 is_category_So (unsigned int ch)
474 return (unicode_attributes[ch].name != NULL
475 && unicode_attributes[ch].category[0] == 'S'
476 && unicode_attributes[ch].category[1] == 'o');
480 is_category_Z (unsigned int ch)
482 return (unicode_attributes[ch].name != NULL
483 && unicode_attributes[ch].category[0] == 'Z');
487 is_category_Zs (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z'
491 && unicode_attributes[ch].category[1] == 's');
495 is_category_Zl (unsigned int ch)
497 return (unicode_attributes[ch].name != NULL
498 && unicode_attributes[ch].category[0] == 'Z'
499 && unicode_attributes[ch].category[1] == 'l');
503 is_category_Zp (unsigned int ch)
505 return (unicode_attributes[ch].name != NULL
506 && unicode_attributes[ch].category[0] == 'Z'
507 && unicode_attributes[ch].category[1] == 'p');
511 is_category_C (unsigned int ch)
513 return (unicode_attributes[ch].name == NULL
514 || unicode_attributes[ch].category[0] == 'C');
518 is_category_Cc (unsigned int ch)
520 return (unicode_attributes[ch].name != NULL
521 && unicode_attributes[ch].category[0] == 'C'
522 && unicode_attributes[ch].category[1] == 'c');
526 is_category_Cf (unsigned int ch)
528 return (unicode_attributes[ch].name != NULL
529 && unicode_attributes[ch].category[0] == 'C'
530 && unicode_attributes[ch].category[1] == 'f');

/* Tests whether CH lies in the range 0xD800..0xDFFF.
   This is decided from the code point alone, without consulting
   unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
540 is_category_Co (unsigned int ch)
542 return (unicode_attributes[ch].name != NULL
543 && unicode_attributes[ch].category[0] == 'C'
544 && unicode_attributes[ch].category[1] == 'o');
548 is_category_Cn (unsigned int ch)
550 return (unicode_attributes[ch].name == NULL
551 && !(ch >= 0xd800 && ch < 0xe000));

/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE holds: "0xFIRST..0xLAST" for a run of several code
   points, "0xCH" for a single one.  Exits with status 1 on I/O errors.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output.  */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate keeps holding.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment: a license header, an include of
   "test-predicate-part1.h", one "{ first, last }" initializer per maximal
   run of code points for which PREDICATE holds, a definition of
   PREDICATE(c) as EXPRESSION, and an include of "test-predicate-part2.h".
   Exits with status 1 on I/O errors.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit the runs separated by ",\n"; the separator is written before each
     run except the first, so that no trailing comma is produced.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
662 /* Construction of sparse 3-level tables. */
663 #define TABLE predicate_table
664 #define xmalloc malloc
665 #define xrealloc realloc
666 #include "3levelbit.h"
668 /* Output a boolean property in a three-level bitmap. */
670 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
674 struct predicate_table t;
675 unsigned int level1_offset, level2_offset, level3_offset;
677 stream = fopen (filename, "w");
680 fprintf (stderr, "cannot open '%s' for writing\n", filename);
684 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
685 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
686 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
691 predicate_table_init (&t);
693 for (ch = 0; ch < 0x110000; ch++)
695 predicate_table_add (&t, ch);
697 predicate_table_finalize (&t);
699 /* Offsets in t.result, in memory of this process. */
701 5 * sizeof (uint32_t);
703 5 * sizeof (uint32_t)
704 + t.level1_size * sizeof (uint32_t);
706 5 * sizeof (uint32_t)
707 + t.level1_size * sizeof (uint32_t)
708 + (t.level2_size << t.q) * sizeof (uint32_t);
710 for (i = 0; i < 5; i++)
712 fprintf (stream, "#define header_%d %d\n", i,
713 ((uint32_t *) t.result)[i]);
715 fprintf (stream, "static const\n");
716 fprintf (stream, "struct\n");
717 fprintf (stream, " {\n");
718 fprintf (stream, " int header[1];\n");
719 fprintf (stream, " int level1[%d];\n", t.level1_size);
720 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
721 fprintf (stream, " /*unsigned*/ int level3[%d << %d];\n", t.level3_size, t.p);
722 fprintf (stream, " }\n");
723 fprintf (stream, "%s =\n", name);
724 fprintf (stream, "{\n");
725 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
726 fprintf (stream, " {");
727 if (t.level1_size > 1)
728 fprintf (stream, "\n ");
729 for (i = 0; i < t.level1_size; i++)
732 if (i > 0 && (i % 1) == 0)
733 fprintf (stream, "\n ");
734 offset = ((uint32_t *) (t.result + level1_offset))[i];
736 fprintf (stream, " %5d", -1);
738 fprintf (stream, " %5d * sizeof (int) / sizeof (short) + %5d",
739 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
740 if (i+1 < t.level1_size)
741 fprintf (stream, ",");
743 if (t.level1_size > 1)
744 fprintf (stream, "\n ");
745 fprintf (stream, " },\n");
746 fprintf (stream, " {");
747 if (t.level2_size << t.q > 1)
748 fprintf (stream, "\n ");
749 for (i = 0; i < t.level2_size << t.q; i++)
752 if (i > 0 && (i % 1) == 0)
753 fprintf (stream, "\n ");
754 offset = ((uint32_t *) (t.result + level2_offset))[i];
756 fprintf (stream, " %5d", -1);
758 fprintf (stream, " %5d + %5d * sizeof (short) / sizeof (int) + %5d",
759 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
760 if (i+1 < t.level2_size << t.q)
761 fprintf (stream, ",");
763 if (t.level2_size << t.q > 1)
764 fprintf (stream, "\n ");
765 fprintf (stream, " },\n");
766 fprintf (stream, " {");
767 if (t.level3_size << t.p > 4)
768 fprintf (stream, "\n ");
769 for (i = 0; i < t.level3_size << t.p; i++)
771 if (i > 0 && (i % 4) == 0)
772 fprintf (stream, "\n ");
773 fprintf (stream, " 0x%08X",
774 ((uint32_t *) (t.result + level3_offset))[i]);
775 if (i+1 < t.level3_size << t.p)
776 fprintf (stream, ",");
778 if (t.level3_size << t.p > 4)
779 fprintf (stream, "\n ");
780 fprintf (stream, " }\n");
781 fprintf (stream, "};\n");
783 if (ferror (stream) || fclose (stream))
785 fprintf (stderr, "error writing to '%s'\n", filename);
790 /* Output all categories. */
792 output_categories (const char *version)
794 #define CATEGORY(C) \
795 debug_output_predicate ("categ_" #C ".txt", is_category_ ## C); \
796 output_predicate_test ("test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
797 output_predicate ("categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);

/* Bit masks of the general categories.  Each two-letter category owns one
   bit; each one-letter major class is the union of the bits of its
   subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};

/* Returns the UC_CATEGORY_MASK_* value for CATEGORY_NAME, which must be a
   one- or two-letter general category name such as "L" or "Lu".
   Calls abort () for any other name.  */
static int
general_category_byname (const char *category_name)
{
  /* Only names of length 1 or 2 can be valid.  */
  if (category_name[0] != '\0'
      && (category_name[1] == '\0' || category_name[2] == '\0'))
    switch (category_name[0])
      {
      case 'L':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_L;
          case 'u': return UC_CATEGORY_MASK_Lu;
          case 'l': return UC_CATEGORY_MASK_Ll;
          case 't': return UC_CATEGORY_MASK_Lt;
          case 'm': return UC_CATEGORY_MASK_Lm;
          case 'o': return UC_CATEGORY_MASK_Lo;
          }
        break;
      case 'M':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_M;
          case 'n': return UC_CATEGORY_MASK_Mn;
          case 'c': return UC_CATEGORY_MASK_Mc;
          case 'e': return UC_CATEGORY_MASK_Me;
          }
        break;
      case 'N':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_N;
          case 'd': return UC_CATEGORY_MASK_Nd;
          case 'l': return UC_CATEGORY_MASK_Nl;
          case 'o': return UC_CATEGORY_MASK_No;
          }
        break;
      case 'P':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_P;
          case 'c': return UC_CATEGORY_MASK_Pc;
          case 'd': return UC_CATEGORY_MASK_Pd;
          case 's': return UC_CATEGORY_MASK_Ps;
          case 'e': return UC_CATEGORY_MASK_Pe;
          case 'i': return UC_CATEGORY_MASK_Pi;
          case 'f': return UC_CATEGORY_MASK_Pf;
          case 'o': return UC_CATEGORY_MASK_Po;
          }
        break;
      case 'S':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_S;
          case 'm': return UC_CATEGORY_MASK_Sm;
          case 'c': return UC_CATEGORY_MASK_Sc;
          case 'k': return UC_CATEGORY_MASK_Sk;
          case 'o': return UC_CATEGORY_MASK_So;
          }
        break;
      case 'Z':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_Z;
          case 's': return UC_CATEGORY_MASK_Zs;
          case 'l': return UC_CATEGORY_MASK_Zl;
          case 'p': return UC_CATEGORY_MASK_Zp;
          }
        break;
      case 'C':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_C;
          case 'c': return UC_CATEGORY_MASK_Cc;
          case 'f': return UC_CATEGORY_MASK_Cf;
          case 's': return UC_CATEGORY_MASK_Cs;
          case 'o': return UC_CATEGORY_MASK_Co;
          case 'n': return UC_CATEGORY_MASK_Cn;
          }
        break;
      }
  /* Invalid category name.  */
  abort ();
}
963 /* Construction of sparse 3-level tables. */
964 #define TABLE category_table
965 #define ELEMENT uint8_t
966 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
967 #define xmalloc malloc
968 #define xrealloc realloc
971 /* Output the per-character category table. */
973 output_category (const char *filename, const char *version)
977 struct category_table t;
978 unsigned int level1_offset, level2_offset, level3_offset;
979 uint16_t *level3_packed;
981 stream = fopen (filename, "w");
984 fprintf (stderr, "cannot open '%s' for writing\n", filename);
988 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
989 fprintf (stream, "/* Categories of Unicode characters. */\n");
990 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
995 category_table_init (&t);
997 for (ch = 0; ch < 0x110000; ch++)
1000 unsigned int log2_value;
1002 if (is_category_Cs (ch))
1003 value = UC_CATEGORY_MASK_Cs;
1004 else if (unicode_attributes[ch].name != NULL)
1005 value = general_category_byname (unicode_attributes[ch].category);
1009 /* Now value should contain exactly one bit. */
1010 if (value == 0 || ((value & (value - 1)) != 0))
1013 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1015 category_table_add (&t, ch, log2_value);
1018 category_table_finalize (&t);
1020 /* Offsets in t.result, in memory of this process. */
1022 5 * sizeof (uint32_t);
1024 5 * sizeof (uint32_t)
1025 + t.level1_size * sizeof (uint32_t);
1027 5 * sizeof (uint32_t)
1028 + t.level1_size * sizeof (uint32_t)
1029 + (t.level2_size << t.q) * sizeof (uint32_t);
1031 for (i = 0; i < 5; i++)
1032 fprintf (stream, "#define category_header_%d %d\n", i,
1033 ((uint32_t *) t.result)[i]);
1034 fprintf (stream, "static const\n");
1035 fprintf (stream, "struct\n");
1036 fprintf (stream, " {\n");
1037 fprintf (stream, " int level1[%d];\n", t.level1_size);
1038 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
1039 fprintf (stream, " unsigned short level3[%d * %d + 1];\n", t.level3_size,
1040 (1 << t.p) * 5 / 16);
1041 fprintf (stream, " }\n");
1042 fprintf (stream, "u_category =\n");
1043 fprintf (stream, "{\n");
1044 fprintf (stream, " {");
1045 if (t.level1_size > 8)
1046 fprintf (stream, "\n ");
1047 for (i = 0; i < t.level1_size; i++)
1050 if (i > 0 && (i % 8) == 0)
1051 fprintf (stream, "\n ");
1052 offset = ((uint32_t *) (t.result + level1_offset))[i];
1054 fprintf (stream, " %5d", -1);
1056 fprintf (stream, " %5d",
1057 (offset - level2_offset) / sizeof (uint32_t));
1058 if (i+1 < t.level1_size)
1059 fprintf (stream, ",");
1061 if (t.level1_size > 8)
1062 fprintf (stream, "\n ");
1063 fprintf (stream, " },\n");
1064 fprintf (stream, " {");
1065 if (t.level2_size << t.q > 8)
1066 fprintf (stream, "\n ");
1067 for (i = 0; i < t.level2_size << t.q; i++)
1070 if (i > 0 && (i % 8) == 0)
1071 fprintf (stream, "\n ");
1072 offset = ((uint32_t *) (t.result + level2_offset))[i];
1074 fprintf (stream, " %5d", -1);
1076 fprintf (stream, " %5d",
1077 (offset - level3_offset) / sizeof (uint8_t));
1078 if (i+1 < t.level2_size << t.q)
1079 fprintf (stream, ",");
1081 if (t.level2_size << t.q > 8)
1082 fprintf (stream, "\n ");
1083 fprintf (stream, " },\n");
1084 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1085 not 32-bit units, in order to make the lookup function easier. */
1088 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1089 for (i = 0; i < t.level3_size << t.p; i++)
1091 unsigned int j = (i * 5) / 16;
1092 unsigned int k = (i * 5) % 16;
1093 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1094 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1095 level3_packed[j] = value & 0xffff;
1096 level3_packed[j+1] = value >> 16;
1098 fprintf (stream, " {");
1099 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1100 fprintf (stream, "\n ");
1101 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1103 if (i > 0 && (i % 8) == 0)
1104 fprintf (stream, "\n ");
1105 fprintf (stream, " 0x%04x", level3_packed[i]);
1106 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1107 fprintf (stream, ",");
1109 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1110 fprintf (stream, "\n ");
1111 fprintf (stream, " }\n");
1112 free (level3_packed);
1113 fprintf (stream, "};\n");
1115 if (ferror (stream) || fclose (stream))
1117 fprintf (stderr, "error writing to '%s'\n", filename);
1122 /* ========================================================================= */
1124 /* Canonical combining class. */
1125 /* See Unicode 3.0 book, section 4.2,
1128 /* Construction of sparse 3-level tables. */
1129 #define TABLE combclass_table
1130 #define ELEMENT uint8_t
1132 #define xmalloc malloc
1133 #define xrealloc realloc
1136 /* Output the per-character combining class table. */
1138 output_combclass (const char *filename, const char *version)
1142 struct combclass_table t;
1143 unsigned int level1_offset, level2_offset, level3_offset;
1145 stream = fopen (filename, "w");
1148 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1152 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1153 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1154 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1159 combclass_table_init (&t);
1161 for (ch = 0; ch < 0x110000; ch++)
1162 if (unicode_attributes[ch].name != NULL)
1164 int value = atoi (unicode_attributes[ch].combining);
1165 if (!(value >= 0 && value <= 255))
1167 combclass_table_add (&t, ch, value);
1170 combclass_table_finalize (&t);
1172 /* Offsets in t.result, in memory of this process. */
1174 5 * sizeof (uint32_t);
1176 5 * sizeof (uint32_t)
1177 + t.level1_size * sizeof (uint32_t);
1179 5 * sizeof (uint32_t)
1180 + t.level1_size * sizeof (uint32_t)
1181 + (t.level2_size << t.q) * sizeof (uint32_t);
1183 for (i = 0; i < 5; i++)
1184 fprintf (stream, "#define combclass_header_%d %d\n", i,
1185 ((uint32_t *) t.result)[i]);
1186 fprintf (stream, "static const\n");
1187 fprintf (stream, "struct\n");
1188 fprintf (stream, " {\n");
1189 fprintf (stream, " int level1[%d];\n", t.level1_size);
1190 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
1191 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1192 fprintf (stream, " }\n");
1193 fprintf (stream, "u_combclass =\n");
1194 fprintf (stream, "{\n");
1195 fprintf (stream, " {");
1196 if (t.level1_size > 8)
1197 fprintf (stream, "\n ");
1198 for (i = 0; i < t.level1_size; i++)
1201 if (i > 0 && (i % 8) == 0)
1202 fprintf (stream, "\n ");
1203 offset = ((uint32_t *) (t.result + level1_offset))[i];
1205 fprintf (stream, " %5d", -1);
1207 fprintf (stream, " %5d",
1208 (offset - level2_offset) / sizeof (uint32_t));
1209 if (i+1 < t.level1_size)
1210 fprintf (stream, ",");
1212 if (t.level1_size > 8)
1213 fprintf (stream, "\n ");
1214 fprintf (stream, " },\n");
1215 fprintf (stream, " {");
1216 if (t.level2_size << t.q > 8)
1217 fprintf (stream, "\n ");
1218 for (i = 0; i < t.level2_size << t.q; i++)
1221 if (i > 0 && (i % 8) == 0)
1222 fprintf (stream, "\n ");
1223 offset = ((uint32_t *) (t.result + level2_offset))[i];
1225 fprintf (stream, " %5d", -1);
1227 fprintf (stream, " %5d",
1228 (offset - level3_offset) / sizeof (uint8_t));
1229 if (i+1 < t.level2_size << t.q)
1230 fprintf (stream, ",");
1232 if (t.level2_size << t.q > 8)
1233 fprintf (stream, "\n ");
1234 fprintf (stream, " },\n");
1235 fprintf (stream, " {");
1236 if (t.level3_size << t.p > 8)
1237 fprintf (stream, "\n ");
1238 for (i = 0; i < t.level3_size << t.p; i++)
1240 if (i > 0 && (i % 8) == 0)
1241 fprintf (stream, "\n ");
1242 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1243 if (i+1 < t.level3_size << t.p)
1244 fprintf (stream, ",");
1246 if (t.level3_size << t.p > 8)
1247 fprintf (stream, "\n ");
1248 fprintf (stream, " }\n");
1249 fprintf (stream, "};\n");
1251 if (ferror (stream) || fclose (stream))
1253 fprintf (stderr, "error writing to '%s'\n", filename);
1258 /* ========================================================================= */
/* Bidirectional category. */
/* See Unicode 3.0 book, section 4.3. */

/* Bidi categories.  The numeric value of each constant is its position in
   this list, starting at 0 for UC_BIDI_L; all values fit in 5 bits.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};

/* Returns the UC_BIDI_* constant for the bidi category abbreviation
   CATEGORY_NAME, e.g. "L", "AL" or "NSM".
   CATEGORY_NAME must be a NUL-terminated string.  The comparison is exact
   and case-sensitive: no prefix or suffix of an abbreviation is accepted.
   Aborts the program if CATEGORY_NAME is not one of the 19 known
   abbreviations.  */
static int
bidi_category_byname (const char *category_name)
{
  /* One row per category; the order is irrelevant for correctness.  */
  static const struct
    {
      const char *abbrev;
      int category;
    }
  known_categories[] =
    {
      { "AL",  UC_BIDI_AL },
      { "AN",  UC_BIDI_AN },
      { "B",   UC_BIDI_B },
      { "BN",  UC_BIDI_BN },
      { "CS",  UC_BIDI_CS },
      { "EN",  UC_BIDI_EN },
      { "ES",  UC_BIDI_ES },
      { "ET",  UC_BIDI_ET },
      { "L",   UC_BIDI_L },
      { "LRE", UC_BIDI_LRE },
      { "LRO", UC_BIDI_LRO },
      { "NSM", UC_BIDI_NSM },
      { "ON",  UC_BIDI_ON },
      { "PDF", UC_BIDI_PDF },
      { "R",   UC_BIDI_R },
      { "RLE", UC_BIDI_RLE },
      { "RLO", UC_BIDI_RLO },
      { "S",   UC_BIDI_S },
      { "WS",  UC_BIDI_WS }
    };
  size_t k;

  for (k = 0; k < sizeof (known_categories) / sizeof (known_categories[0]); k++)
    if (strcmp (category_name, known_categories[k].abbrev) == 0)
      return known_categories[k].category;

  /* Invalid bidi category name. */
  abort ();
}
/* Returns the bidi category of the code point CH.
   If unicode_attributes[ch] has a non-NULL name, the result is the category
   named by its bidi field, translated by bidi_category_byname.  Otherwise
   the result is chosen from the code point ranges tested below.
   NOTE(review): the value returned for each group of ranges, and the
   fallback for code points outside all groups, are on lines not visible in
   this excerpt - confirm against the full file.  */
1438 get_bidi_category (unsigned int ch)
1440 if (unicode_attributes[ch].name != NULL)
1441 return bidi_category_byname (unicode_attributes[ch].bidi);
1444 /* The bidi category of unassigned characters depends on the range.
1445 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges. */
1446 if ((ch >= 0x0590 && ch <= 0x05FF)
1447 || (ch >= 0x07FB && ch <= 0x08FF)
1448 || (ch >= 0xFB37 && ch <= 0xFB45)
1449 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges. */
1451 else if ((ch >= 0x0600 && ch <= 0x07BF)
1452 || (ch >= 0x2064 && ch <= 0x2069)
1453 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1454 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group of ranges.  The two masked comparisons match the last two
   code points (xxFFFE and xxFFFF) of every 64K plane. */
1456 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1457 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1458 || (ch & 0xFFFF) == 0xFFFE
1459 || (ch & 0xFFFF) == 0xFFFF
1460 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1467 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table code: TABLE is the name stem of the
   table type and functions (struct bidi_category_table and
   bidi_category_table_init/_add/_finalize are used below), ELEMENT is the
   type of one stored value, DEFAULT is the value of entries never added,
   and xmalloc/xrealloc are mapped to the plain, unchecked allocators.
   NOTE(review): the template these macros parameterize is presumably
   included on a line not visible in this excerpt. */
1468 #define TABLE bidi_category_table
1469 #define ELEMENT uint8_t
1470 #define DEFAULT UC_BIDI_L
1471 #define xmalloc malloc
1472 #define xrealloc realloc
1475 /* Output the per-character bidi category table. */
/* Writes to FILENAME a C fragment consisting of five
   "#define bidi_category_header_N" lines followed by a static const struct
   u_bidi_category with arrays level1, level2 and a bit-packed level3.
   The table is filled with get_bidi_category (ch) for every code point
   0 <= ch < 0x110000.
   NOTE(review): VERSION is presumably the argument of the "%s" in the
   banner; that argument line is not visible in this excerpt. */
1477 output_bidi_category (const char *filename, const char *version)
1481 struct bidi_category_table t;
1482 unsigned int level1_offset, level2_offset, level3_offset;
1483 uint16_t *level3_packed;
1485 stream = fopen (filename, "w");
1488 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1492 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1493 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1494 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1499 bidi_category_table_init (&t);
/* Record the category of every code point. */
1501 for (ch = 0; ch < 0x110000; ch++)
1503 int value = get_bidi_category (ch);
1505 bidi_category_table_add (&t, ch, value);
1508 bidi_category_table_finalize (&t);
1510 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the three expressions below: 5 uint32_t header words,
   then level1 (level1_size uint32_t), then level2 (level2_size << q
   uint32_t), then level3.
   NOTE(review): the assignment targets (presumably level1_offset,
   level2_offset, level3_offset, in that order) are on lines not shown. */
1512 5 * sizeof (uint32_t);
1514 5 * sizeof (uint32_t)
1515 + t.level1_size * sizeof (uint32_t);
1517 5 * sizeof (uint32_t)
1518 + t.level1_size * sizeof (uint32_t)
1519 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
1521 for (i = 0; i < 5; i++)
1522 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1523 ((uint32_t *) t.result)[i]);
1524 fprintf (stream, "static const\n");
1525 fprintf (stream, "struct\n");
1526 fprintf (stream, " {\n");
1527 fprintf (stream, " int level1[%d];\n", t.level1_size);
1528 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
1529 fprintf (stream, " unsigned short level3[%d * %d + 1];\n", t.level3_size,
1530 (1 << t.p) * 5 / 16);
1531 fprintf (stream, " }\n");
1532 fprintf (stream, "u_bidi_category =\n");
1533 fprintf (stream, "{\n");
/* level1: eight entries per output line.  Each stored entry is a byte
   offset into t.result; it is emitted as an index into level2.
   NOTE(review): the condition selecting the -1 output is not visible here.
   NOTE(review): the quotient below has type size_t but is printed with
   "%5d"; "%5zu" or a cast to int would match the argument type. */
1534 fprintf (stream, " {");
1535 if (t.level1_size > 8)
1536 fprintf (stream, "\n ");
1537 for (i = 0; i < t.level1_size; i++)
1540 if (i > 0 && (i % 8) == 0)
1541 fprintf (stream, "\n ");
1542 offset = ((uint32_t *) (t.result + level1_offset))[i];
1544 fprintf (stream, " %5d", -1);
1546 fprintf (stream, " %5d",
1547 (offset - level2_offset) / sizeof (uint32_t));
1548 if (i+1 < t.level1_size)
1549 fprintf (stream, ",");
1551 if (t.level1_size > 8)
1552 fprintf (stream, "\n ");
1553 fprintf (stream, " },\n");
/* level2: same scheme; entries become indices into the unpacked level3
   array (one byte per element).  The same size_t/"%5d" note applies. */
1554 fprintf (stream, " {");
1555 if (t.level2_size << t.q > 8)
1556 fprintf (stream, "\n ");
1557 for (i = 0; i < t.level2_size << t.q; i++)
1560 if (i > 0 && (i % 8) == 0)
1561 fprintf (stream, "\n ");
1562 offset = ((uint32_t *) (t.result + level2_offset))[i];
1564 fprintf (stream, " %5d", -1);
1566 fprintf (stream, " %5d",
1567 (offset - level3_offset) / sizeof (uint8_t));
1568 if (i+1 < t.level2_size << t.q)
1569 fprintf (stream, ",");
1571 if (t.level2_size << t.q > 8)
1572 fprintf (stream, "\n ");
1573 fprintf (stream, " },\n");
1574 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1575 not 32-bit units, in order to make the lookup function easier. */
/* The array has one 16-bit word more than (count * 5 / 16) so that word
   j+1 exists for every entry.
   NOTE(review): the calloc result is used below without a NULL check. */
1578 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed bit stream: j is the
   16-bit word holding its first bit, k the bit position inside that word.
   Words j and j+1 are combined into one 32-bit value so that an entry
   straddling a word boundary is OR-ed in with a single shift. */
1579 for (i = 0; i < t.level3_size << t.p; i++)
1581 unsigned int j = (i * 5) / 16;
1582 unsigned int k = (i * 5) % 16;
1583 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1584 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1585 level3_packed[j] = value & 0xffff;
1586 level3_packed[j+1] = value >> 16;
/* Emit the packed words, eight per output line, as 4-digit hex. */
1588 fprintf (stream, " {");
1589 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1590 fprintf (stream, "\n ");
1591 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1593 if (i > 0 && (i % 8) == 0)
1594 fprintf (stream, "\n ");
1595 fprintf (stream, " 0x%04x", level3_packed[i]);
1596 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1597 fprintf (stream, ",");
1599 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1600 fprintf (stream, "\n ");
1601 fprintf (stream, " }\n");
1602 free (level3_packed);
1603 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose is reported. */
1605 if (ferror (stream) || fclose (stream))
1607 fprintf (stderr, "error writing to '%s'\n", filename);
1612 /* ========================================================================= */
1614 /* Decimal digit value. */
1615 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH: when
   unicode_attributes[ch] has a non-NULL name and a non-empty decdigit
   field, the field converted with atoi.
   NOTE(review): the result for all other code points is returned on a line
   not visible in this excerpt; the callers below accept results in the
   range -1..9, so it is presumably -1. */
1618 get_decdigit_value (unsigned int ch)
1620 if (unicode_attributes[ch].name != NULL
1621 && unicode_attributes[ch].decdigit[0] != '\0')
1622 return atoi (unicode_attributes[ch].decdigit);
1626 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table code: struct decdigit_table and
   decdigit_table_init/_add/_finalize are used by output_decimal_digit and
   output_digit below; elements are uint8_t; xmalloc/xrealloc are mapped to
   the plain, unchecked allocators.
   NOTE(review): a DEFAULT definition and the inclusion of the template are
   presumably on lines not visible in this excerpt. */
1627 #define TABLE decdigit_table
1628 #define ELEMENT uint8_t
1630 #define xmalloc malloc
1631 #define xrealloc realloc
1634 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a banner comment followed by entries of the form
   "{ 0xXXXX, value }", separated by ",\n" and ended by a newline, where
   value is get_decdigit_value (ch).
   NOTE(review): the conditions deciding which code points are listed and
   when the separator is written are on lines not visible in this excerpt;
   VERSION is presumably the argument of the "%s" in the banner. */
1636 output_decimal_digit_test (const char *filename, const char *version)
1642 stream = fopen (filename, "w");
1645 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1649 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1650 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1651 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1655 for (ch = 0; ch < 0x110000; ch++)
1657 int value = get_decdigit_value (ch);
/* Sanity check: only -1 and the digits 0..9 are expected. */
1659 if (!(value >= -1 && value < 10))
1665 fprintf (stream, ",\n");
1666 fprintf (stream, " { 0x%04X, %d }", ch, value);
1671 fprintf (stream, "\n");
1673 if (ferror (stream) || fclose (stream))
1675 fprintf (stderr, "error writing to '%s'\n", filename);
1680 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME a C fragment consisting of five
   "#define decdigit_header_N" lines followed by a static const struct
   u_decdigit with arrays level1, level2 and a nibble-packed level3.
   Each table element is 1 + get_decdigit_value (ch), checked to lie in
   0..10, so it fits in 4 bits.
   NOTE(review): VERSION is presumably the argument of the "%s" in the
   banner; that argument line is not visible in this excerpt. */
1682 output_decimal_digit (const char *filename, const char *version)
1686 struct decdigit_table t;
1687 unsigned int level1_offset, level2_offset, level3_offset;
1689 stream = fopen (filename, "w");
1692 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1696 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1697 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1698 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1703 decdigit_table_init (&t);
1705 for (ch = 0; ch < 0x110000; ch++)
/* Store the value shifted up by one, so that the stored range is 0..10. */
1707 int value = 1 + get_decdigit_value (ch);
1709 if (!(value >= 0 && value <= 10))
1712 decdigit_table_add (&t, ch, value);
1715 decdigit_table_finalize (&t);
1717 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the three expressions below: 5 uint32_t header words,
   then level1, then level2 (level2_size << q uint32_t), then level3.
   NOTE(review): the assignment targets (presumably level1_offset,
   level2_offset, level3_offset, in that order) are on lines not shown. */
1719 5 * sizeof (uint32_t);
1721 5 * sizeof (uint32_t)
1722 + t.level1_size * sizeof (uint32_t);
1724 5 * sizeof (uint32_t)
1725 + t.level1_size * sizeof (uint32_t)
1726 + (t.level2_size << t.q) * sizeof (uint32_t);
1728 for (i = 0; i < 5; i++)
1729 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1730 ((uint32_t *) t.result)[i]);
1731 fprintf (stream, "static const\n");
1732 fprintf (stream, "struct\n");
1733 fprintf (stream, " {\n");
1734 fprintf (stream, " int level1[%d];\n", t.level1_size);
1735 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
1736 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size,
1738 fprintf (stream, " }\n");
1739 fprintf (stream, "u_decdigit =\n");
1740 fprintf (stream, "{\n");
/* level1: eight entries per output line; stored byte offsets are emitted
   as indices into level2.
   NOTE(review): the condition selecting the -1 output is not visible here.
   NOTE(review): the quotient below has type size_t but is printed with
   "%5d"; "%5zu" or a cast to int would match the argument type. */
1741 fprintf (stream, " {");
1742 if (t.level1_size > 8)
1743 fprintf (stream, "\n ");
1744 for (i = 0; i < t.level1_size; i++)
1747 if (i > 0 && (i % 8) == 0)
1748 fprintf (stream, "\n ");
1749 offset = ((uint32_t *) (t.result + level1_offset))[i];
1751 fprintf (stream, " %5d", -1);
1753 fprintf (stream, " %5d",
1754 (offset - level2_offset) / sizeof (uint32_t));
1755 if (i+1 < t.level1_size)
1756 fprintf (stream, ",");
1758 if (t.level1_size > 8)
1759 fprintf (stream, "\n ");
1760 fprintf (stream, " },\n");
/* level2: same scheme; entries become indices into the unpacked level3
   array.  The same size_t/"%5d" note applies. */
1761 fprintf (stream, " {");
1762 if (t.level2_size << t.q > 8)
1763 fprintf (stream, "\n ");
1764 for (i = 0; i < t.level2_size << t.q; i++)
1767 if (i > 0 && (i % 8) == 0)
1768 fprintf (stream, "\n ");
1769 offset = ((uint32_t *) (t.result + level2_offset))[i];
1771 fprintf (stream, " %5d", -1);
1773 fprintf (stream, " %5d",
1774 (offset - level3_offset) / sizeof (uint8_t));
1775 if (i+1 < t.level2_size << t.q)
1776 fprintf (stream, ",");
1778 if (t.level2_size << t.q > 8)
1779 fprintf (stream, "\n ");
1780 fprintf (stream, " },\n");
1781 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two elements per output byte: element 2*i in the low nibble, element
   2*i+1 in the high nibble, hence half as many bytes as elements. */
1782 fprintf (stream, " {");
1783 if (t.level3_size << (t.p - 1) > 8)
1784 fprintf (stream, "\n ");
1785 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1787 if (i > 0 && (i % 8) == 0)
1788 fprintf (stream, "\n ");
1789 fprintf (stream, " 0x%02x",
1790 ((uint8_t *) (t.result + level3_offset))[2*i]
1791 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1792 if (i+1 < t.level3_size << (t.p - 1))
1793 fprintf (stream, ",");
1795 if (t.level3_size << (t.p - 1) > 8)
1796 fprintf (stream, "\n ");
1797 fprintf (stream, " }\n");
1798 fprintf (stream, "};\n");
1800 if (ferror (stream) || fclose (stream))
1802 fprintf (stderr, "error writing to '%s'\n", filename);
1807 /* ========================================================================= */
1810 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH: when unicode_attributes[ch]
   has a non-NULL name and a non-empty digit field, the field converted with
   atoi.
   NOTE(review): the result for all other code points is returned on a line
   not visible in this excerpt; the callers below accept results in the
   range -1..9, so it is presumably -1. */
1813 get_digit_value (unsigned int ch)
1815 if (unicode_attributes[ch].name != NULL
1816 && unicode_attributes[ch].digit[0] != '\0')
1817 return atoi (unicode_attributes[ch].digit);
1821 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a banner comment followed by entries of the form
   "{ 0xXXXX, value }", separated by ",\n" and ended by a newline, where
   value is get_digit_value (ch).
   NOTE(review): the conditions deciding which code points are listed and
   when the separator is written are on lines not visible in this excerpt;
   VERSION is presumably the argument of the "%s" in the banner. */
1823 output_digit_test (const char *filename, const char *version)
1829 stream = fopen (filename, "w");
1832 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1836 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1837 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1838 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1842 for (ch = 0; ch < 0x110000; ch++)
1844 int value = get_digit_value (ch);
/* Sanity check: only -1 and the digits 0..9 are expected. */
1846 if (!(value >= -1 && value < 10))
1852 fprintf (stream, ",\n");
1853 fprintf (stream, " { 0x%04X, %d }", ch, value);
1858 fprintf (stream, "\n");
1860 if (ferror (stream) || fclose (stream))
1862 fprintf (stderr, "error writing to '%s'\n", filename);
1867 /* Output the per-character digit value table. */
/* Writes to FILENAME a C fragment consisting of five
   "#define digit_header_N" lines followed by a static const struct u_digit
   with arrays level1, level2 and a nibble-packed level3.
   Each table element is 1 + get_digit_value (ch), checked to lie in 0..10.
   The table type is struct decdigit_table, the same one that
   output_decimal_digit uses.
   NOTE(review): VERSION is presumably the argument of the "%s" in the
   banner; that argument line is not visible in this excerpt. */
1869 output_digit (const char *filename, const char *version)
1873 struct decdigit_table t;
1874 unsigned int level1_offset, level2_offset, level3_offset;
1876 stream = fopen (filename, "w");
1879 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1883 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1884 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1885 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1890 decdigit_table_init (&t);
1892 for (ch = 0; ch < 0x110000; ch++)
/* Store the value shifted up by one, so that the stored range is 0..10. */
1894 int value = 1 + get_digit_value (ch);
1896 if (!(value >= 0 && value <= 10))
1899 decdigit_table_add (&t, ch, value);
1902 decdigit_table_finalize (&t);
1904 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the three expressions below: 5 uint32_t header words,
   then level1, then level2 (level2_size << q uint32_t), then level3.
   NOTE(review): the assignment targets (presumably level1_offset,
   level2_offset, level3_offset, in that order) are on lines not shown. */
1906 5 * sizeof (uint32_t);
1908 5 * sizeof (uint32_t)
1909 + t.level1_size * sizeof (uint32_t);
1911 5 * sizeof (uint32_t)
1912 + t.level1_size * sizeof (uint32_t)
1913 + (t.level2_size << t.q) * sizeof (uint32_t);
1915 for (i = 0; i < 5; i++)
1916 fprintf (stream, "#define digit_header_%d %d\n", i,
1917 ((uint32_t *) t.result)[i]);
1918 fprintf (stream, "static const\n");
1919 fprintf (stream, "struct\n");
1920 fprintf (stream, " {\n");
1921 fprintf (stream, " int level1[%d];\n", t.level1_size);
1922 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
1923 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size,
1925 fprintf (stream, " }\n");
1926 fprintf (stream, "u_digit =\n");
1927 fprintf (stream, "{\n");
/* level1: eight entries per output line; stored byte offsets are emitted
   as indices into level2.
   NOTE(review): the condition selecting the -1 output is not visible here.
   NOTE(review): the quotient below has type size_t but is printed with
   "%5d"; "%5zu" or a cast to int would match the argument type. */
1928 fprintf (stream, " {");
1929 if (t.level1_size > 8)
1930 fprintf (stream, "\n ");
1931 for (i = 0; i < t.level1_size; i++)
1934 if (i > 0 && (i % 8) == 0)
1935 fprintf (stream, "\n ");
1936 offset = ((uint32_t *) (t.result + level1_offset))[i];
1938 fprintf (stream, " %5d", -1);
1940 fprintf (stream, " %5d",
1941 (offset - level2_offset) / sizeof (uint32_t));
1942 if (i+1 < t.level1_size)
1943 fprintf (stream, ",");
1945 if (t.level1_size > 8)
1946 fprintf (stream, "\n ");
1947 fprintf (stream, " },\n");
/* level2: same scheme; entries become indices into the unpacked level3
   array.  The same size_t/"%5d" note applies. */
1948 fprintf (stream, " {");
1949 if (t.level2_size << t.q > 8)
1950 fprintf (stream, "\n ");
1951 for (i = 0; i < t.level2_size << t.q; i++)
1954 if (i > 0 && (i % 8) == 0)
1955 fprintf (stream, "\n ");
1956 offset = ((uint32_t *) (t.result + level2_offset))[i];
1958 fprintf (stream, " %5d", -1);
1960 fprintf (stream, " %5d",
1961 (offset - level3_offset) / sizeof (uint8_t));
1962 if (i+1 < t.level2_size << t.q)
1963 fprintf (stream, ",");
1965 if (t.level2_size << t.q > 8)
1966 fprintf (stream, "\n ");
1967 fprintf (stream, " },\n");
1968 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two elements per output byte: element 2*i in the low nibble, element
   2*i+1 in the high nibble, hence half as many bytes as elements. */
1969 fprintf (stream, " {");
1970 if (t.level3_size << (t.p - 1) > 8)
1971 fprintf (stream, "\n ");
1972 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1974 if (i > 0 && (i % 8) == 0)
1975 fprintf (stream, "\n ");
1976 fprintf (stream, " 0x%02x",
1977 ((uint8_t *) (t.result + level3_offset))[2*i]
1978 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1979 if (i+1 < t.level3_size << (t.p - 1))
1980 fprintf (stream, ",");
1982 if (t.level3_size << (t.p - 1) > 8)
1983 fprintf (stream, "\n ");
1984 fprintf (stream, " }\n");
1985 fprintf (stream, "};\n");
1987 if (ferror (stream) || fclose (stream))
1989 fprintf (stderr, "error writing to '%s'\n", filename);
1994 /* ========================================================================= */
1996 /* Numeric value. */
1997 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value as a fraction numerator/denominator.  The pair 0/0 is
   what get_numeric_value stores when a character has no usable numeric
   field. */
1999 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Computes the numeric value of the code point CH from the numeric field of
   unicode_attributes[ch].  When the character has a non-NULL name and a
   non-empty numeric field, the numerator is the leading integer and the
   denominator is the integer after '/', or 1 when there is no '/'.
   Both conversions use atoi, so malformed text is not diagnosed.
   NOTE(review): the statements outside the visible branches, including the
   return, are on lines not shown; the 0/0 assignments below are presumably
   the else branch for characters without a numeric field. */
2001 static uc_fraction_t
2002 get_numeric_value (unsigned int ch)
2004 uc_fraction_t value;
2006 if (unicode_attributes[ch].name != NULL
2007 && unicode_attributes[ch].numeric[0] != '\0')
2009 const char *str = unicode_attributes[ch].numeric;
2010 /* str is of the form "integer" or "integer/posinteger". */
2011 value.numerator = atoi (str);
2012 if (strchr (str, '/') != NULL)
2013 value.denominator = atoi (strchr (str, '/') + 1);
2015 value.denominator = 1;
2019 value.numerator = 0;
2020 value.denominator = 0;
2025 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a banner comment followed by entries of the form
   "{ 0xXXXX, numerator, denominator }", separated by ",\n" and ended by a
   newline.  Only code points whose numeric value is not 0/0 pass the test
   on the fraction below.
   NOTE(review): the condition guarding the separator is on a line not
   visible in this excerpt; VERSION is presumably the argument of the "%s"
   in the banner. */
2027 output_numeric_test (const char *filename, const char *version)
2033 stream = fopen (filename, "w");
2036 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2040 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2041 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2042 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2046 for (ch = 0; ch < 0x110000; ch++)
2048 uc_fraction_t value = get_numeric_value (ch);
2050 if (value.numerator != 0 || value.denominator != 0)
2053 fprintf (stream, ",\n");
2054 fprintf (stream, " { 0x%04X, %d, %d }",
2055 ch, value.numerator, value.denominator);
2060 fprintf (stream, "\n");
2062 if (ferror (stream) || fclose (stream))
2064 fprintf (stderr, "error writing to '%s'\n", filename);
2069 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table code: struct numeric_table and
   numeric_table_init/_add/_finalize are used by output_numeric below;
   elements are uint8_t; xmalloc/xrealloc are mapped to the plain, unchecked
   allocators.
   NOTE(review): a DEFAULT definition and the inclusion of the template are
   presumably on lines not visible in this excerpt. */
2070 #define TABLE numeric_table
2071 #define ELEMENT uint8_t
2073 #define xmalloc malloc
2074 #define xrealloc realloc
2077 /* Output the per-character numeric value table. */
/* Writes to FILENAME a C fragment with two parts:
   1. u_numeric_values[], the distinct fractions returned by
      get_numeric_value over all code points (at most 128 of them);
   2. five "#define numeric_header_N" lines and a static const struct
      u_numeric whose elements are indices into u_numeric_values[], stored
      in a 3-level table with a bit-packed level3 (7 bits per element).
   NOTE(review): VERSION is presumably the argument of the "%s" in the
   banner; that argument line is not visible in this excerpt. */
2079 output_numeric (const char *filename, const char *version)
2082 uc_fraction_t fractions[128];
2083 unsigned int nfractions;
2084 unsigned int ch, i, j;
2085 struct numeric_table t;
2086 unsigned int level1_offset, level2_offset, level3_offset;
2087 uint16_t *level3_packed;
2089 stream = fopen (filename, "w");
2092 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2096 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2097 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2098 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2101 /* Create table of occurring fractions. */
/* NOTE(review): the initialization of nfractions and the statements that
   leave the two search loops are on lines not visible in this excerpt. */
2103 for (ch = 0; ch < 0x110000; ch++)
2105 uc_fraction_t value = get_numeric_value (ch);
/* Look for VALUE among the fractions collected so far. */
2107 for (i = 0; i < nfractions; i++)
2108 if (value.numerator == fractions[i].numerator
2109 && value.denominator == fractions[i].denominator)
/* i == nfractions here means the search ran to the end without a match. */
2111 if (i == nfractions)
/* The array holds 128 entries, the most a 7-bit index can address. */
2113 if (nfractions == 128)
/* Find the first entry that sorts after VALUE, ordering by denominator and
   then by numerator, then shift the tail up by one and insert VALUE there. */
2115 for (i = 0; i < nfractions; i++)
2116 if (value.denominator < fractions[i].denominator
2117 || (value.denominator == fractions[i].denominator
2118 && value.numerator < fractions[i].numerator))
2120 for (j = nfractions; j > i; j--)
2121 fractions[j] = fractions[j - 1];
2122 fractions[i] = value;
2127 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2129 fprintf (stream, "{\n");
2130 for (i = 0; i < nfractions; i++)
2132 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2133 fractions[i].denominator);
2134 if (i+1 < nfractions)
2135 fprintf (stream, ",");
2136 fprintf (stream, "\n");
2138 fprintf (stream, "};\n");
2142 numeric_table_init (&t);
/* Second pass: store, for every code point, the index of its fraction. */
2144 for (ch = 0; ch < 0x110000; ch++)
2146 uc_fraction_t value = get_numeric_value (ch);
2148 for (i = 0; i < nfractions; i++)
2149 if (value.numerator == fractions[i].numerator
2150 && value.denominator == fractions[i].denominator)
2152 if (i == nfractions)
2155 numeric_table_add (&t, ch, i);
2158 numeric_table_finalize (&t);
2160 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the three expressions below: 5 uint32_t header words,
   then level1, then level2 (level2_size << q uint32_t), then level3.
   NOTE(review): the assignment targets (presumably level1_offset,
   level2_offset, level3_offset, in that order) are on lines not shown. */
2162 5 * sizeof (uint32_t);
2164 5 * sizeof (uint32_t)
2165 + t.level1_size * sizeof (uint32_t);
2167 5 * sizeof (uint32_t)
2168 + t.level1_size * sizeof (uint32_t)
2169 + (t.level2_size << t.q) * sizeof (uint32_t);
2171 for (i = 0; i < 5; i++)
2172 fprintf (stream, "#define numeric_header_%d %d\n", i,
2173 ((uint32_t *) t.result)[i]);
2174 fprintf (stream, "static const\n");
2175 fprintf (stream, "struct\n");
2176 fprintf (stream, " {\n");
2177 fprintf (stream, " int level1[%d];\n", t.level1_size);
2178 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
2179 fprintf (stream, " unsigned short level3[%d * %d + 1];\n", t.level3_size,
2180 (1 << t.p) * 7 / 16);
2181 fprintf (stream, " }\n");
2182 fprintf (stream, "u_numeric =\n");
2183 fprintf (stream, "{\n");
/* level1: eight entries per output line; stored byte offsets are emitted
   as indices into level2.
   NOTE(review): the condition selecting the -1 output is not visible here.
   NOTE(review): the quotient below has type size_t but is printed with
   "%5d"; "%5zu" or a cast to int would match the argument type. */
2184 fprintf (stream, " {");
2185 if (t.level1_size > 8)
2186 fprintf (stream, "\n ");
2187 for (i = 0; i < t.level1_size; i++)
2190 if (i > 0 && (i % 8) == 0)
2191 fprintf (stream, "\n ");
2192 offset = ((uint32_t *) (t.result + level1_offset))[i];
2194 fprintf (stream, " %5d", -1);
2196 fprintf (stream, " %5d",
2197 (offset - level2_offset) / sizeof (uint32_t));
2198 if (i+1 < t.level1_size)
2199 fprintf (stream, ",");
2201 if (t.level1_size > 8)
2202 fprintf (stream, "\n ");
2203 fprintf (stream, " },\n");
/* level2: same scheme; entries become indices into the unpacked level3
   array.  The same size_t/"%5d" note applies. */
2204 fprintf (stream, " {");
2205 if (t.level2_size << t.q > 8)
2206 fprintf (stream, "\n ");
2207 for (i = 0; i < t.level2_size << t.q; i++)
2210 if (i > 0 && (i % 8) == 0)
2211 fprintf (stream, "\n ");
2212 offset = ((uint32_t *) (t.result + level2_offset))[i];
2214 fprintf (stream, " %5d", -1);
2216 fprintf (stream, " %5d",
2217 (offset - level3_offset) / sizeof (uint8_t));
2218 if (i+1 < t.level2_size << t.q)
2219 fprintf (stream, ",");
2221 if (t.level2_size << t.q > 8)
2222 fprintf (stream, "\n ");
2223 fprintf (stream, " },\n");
2224 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2225 not 32-bit units, in order to make the lookup function easier. */
/* The array has one 16-bit word more than (count * 7 / 16) so that word
   j+1 exists for every entry.
   NOTE(review): the calloc result is used below without a NULL check. */
2228 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*7 .. i*7+6 of the packed bit stream: j is the
   16-bit word holding its first bit, k the bit position inside that word.
   Words j and j+1 are combined into one 32-bit value so that an entry
   straddling a word boundary is OR-ed in with a single shift. */
2229 for (i = 0; i < t.level3_size << t.p; i++)
2231 unsigned int j = (i * 7) / 16;
2232 unsigned int k = (i * 7) % 16;
2233 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2234 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2235 level3_packed[j] = value & 0xffff;
2236 level3_packed[j+1] = value >> 16;
/* Emit the packed words, eight per output line, as 4-digit hex. */
2238 fprintf (stream, " {");
2239 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2240 fprintf (stream, "\n ");
2241 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2243 if (i > 0 && (i % 8) == 0)
2244 fprintf (stream, "\n ");
2245 fprintf (stream, " 0x%04x", level3_packed[i]);
2246 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2247 fprintf (stream, ",");
2249 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2250 fprintf (stream, "\n ");
2251 fprintf (stream, " }\n");
2252 free (level3_packed);
2253 fprintf (stream, "};\n");
2255 if (ferror (stream) || fclose (stream))
2257 fprintf (stderr, "error writing to '%s'\n", filename);
2262 /* ========================================================================= */
2265 /* See Unicode 3.0 book, section 4.7. */
2268 /* List of mirrored character pairs. This is a subset of the characters
2269 having the BidiMirrored property. */
/* Each row holds the two code points of one pair; get_mirror_value looks a
   character up in either column and takes the other column as its mirror
   image.
   NOTE(review): the initializer rows are on lines not visible in this
   excerpt. */
2270 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information of the code point CH.
   MIRRORED is true when unicode_attributes[ch] has a non-NULL name and its
   mirrored flag is set.  MIRROR_CHAR starts as 0xfffd, meaning that no
   partner was found, and is replaced by the partner of CH when CH occurs
   in either column of mirror_pairs[].  The visible return statement yields
   the signed distance from CH to its partner.
   NOTE(review): the conditions under which that return is reached, and the
   other return values, are on lines not visible in this excerpt. */
2327 get_mirror_value (unsigned int ch)
2330 unsigned int mirror_char;
2333 mirrored = (unicode_attributes[ch].name != NULL
2334 && unicode_attributes[ch].mirrored);
2335 mirror_char = 0xfffd;
/* Linear search through both columns of the pair list. */
2336 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2337 if (ch == mirror_pairs[i][0])
2339 mirror_char = mirror_pairs[i][1];
2342 else if (ch == mirror_pairs[i][1])
2344 mirror_char = mirror_pairs[i][0];
2348 return (int) mirror_char - (int) ch;
/* True when a partner was found in mirror_pairs[]. */
2351 if (mirror_char != 0xfffd)
2357 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table code: struct mirror_table and
   mirror_table_init/_add/_finalize are used by output_mirror below;
   elements are int32_t because they hold signed values; xmalloc/xrealloc
   are mapped to the plain, unchecked allocators.
   NOTE(review): a DEFAULT definition and the inclusion of the template are
   presumably on lines not visible in this excerpt. */
2358 #define TABLE mirror_table
2359 #define ELEMENT int32_t
2361 #define xmalloc malloc
2362 #define xrealloc realloc
2365 /* Output the per-character mirror table. */
/* Writes to FILENAME a C fragment consisting of five
   "#define mirror_header_N" lines followed by a static const struct
   u_mirror with arrays level1, level2 and level3.  The level3 elements are
   the int values returned by get_mirror_value (ch) for every code point
   0 <= ch < 0x110000, written unpacked.
   NOTE(review): VERSION is presumably the argument of the "%s" in the
   banner; that argument line is not visible in this excerpt. */
2367 output_mirror (const char *filename, const char *version)
2371 struct mirror_table t;
2372 unsigned int level1_offset, level2_offset, level3_offset;
2374 stream = fopen (filename, "w");
2377 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2381 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2382 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2383 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2388 mirror_table_init (&t);
2390 for (ch = 0; ch < 0x110000; ch++)
2392 int value = get_mirror_value (ch);
2394 mirror_table_add (&t, ch, value);
2397 mirror_table_finalize (&t);
2399 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the three expressions below: 5 uint32_t header words,
   then level1, then level2 (level2_size << q uint32_t), then level3.
   NOTE(review): the assignment targets (presumably level1_offset,
   level2_offset, level3_offset, in that order) are on lines not shown. */
2401 5 * sizeof (uint32_t);
2403 5 * sizeof (uint32_t)
2404 + t.level1_size * sizeof (uint32_t);
2406 5 * sizeof (uint32_t)
2407 + t.level1_size * sizeof (uint32_t)
2408 + (t.level2_size << t.q) * sizeof (uint32_t);
2410 for (i = 0; i < 5; i++)
2411 fprintf (stream, "#define mirror_header_%d %d\n", i,
2412 ((uint32_t *) t.result)[i]);
2413 fprintf (stream, "static const\n");
2414 fprintf (stream, "struct\n");
2415 fprintf (stream, " {\n");
2416 fprintf (stream, " int level1[%d];\n", t.level1_size);
2417 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
2418 fprintf (stream, " int level3[%d << %d];\n", t.level3_size, t.p);
2419 fprintf (stream, " }\n");
2420 fprintf (stream, "u_mirror =\n");
2421 fprintf (stream, "{\n");
/* level1: eight entries per output line; stored byte offsets are emitted
   as indices into level2.
   NOTE(review): the condition selecting the -1 output is not visible here.
   NOTE(review): the quotient below has type size_t but is printed with
   "%5d"; "%5zu" or a cast to int would match the argument type. */
2422 fprintf (stream, " {");
2423 if (t.level1_size > 8)
2424 fprintf (stream, "\n ");
2425 for (i = 0; i < t.level1_size; i++)
2428 if (i > 0 && (i % 8) == 0)
2429 fprintf (stream, "\n ");
2430 offset = ((uint32_t *) (t.result + level1_offset))[i];
2432 fprintf (stream, " %5d", -1);
2434 fprintf (stream, " %5d",
2435 (offset - level2_offset) / sizeof (uint32_t));
2436 if (i+1 < t.level1_size)
2437 fprintf (stream, ",");
2439 if (t.level1_size > 8)
2440 fprintf (stream, "\n ");
2441 fprintf (stream, " },\n");
/* level2: same scheme; the byte offset is divided by sizeof (int32_t)
   because level3 elements are 4 bytes wide here.  The same size_t/"%5d"
   note applies. */
2442 fprintf (stream, " {");
2443 if (t.level2_size << t.q > 8)
2444 fprintf (stream, "\n ");
2445 for (i = 0; i < t.level2_size << t.q; i++)
2448 if (i > 0 && (i % 8) == 0)
2449 fprintf (stream, "\n ");
2450 offset = ((uint32_t *) (t.result + level2_offset))[i];
2452 fprintf (stream, " %5d", -1);
2454 fprintf (stream, " %5d",
2455 (offset - level3_offset) / sizeof (int32_t));
2456 if (i+1 < t.level2_size << t.q)
2457 fprintf (stream, ",");
2459 if (t.level2_size << t.q > 8)
2460 fprintf (stream, "\n ");
2461 fprintf (stream, " },\n");
/* level3: the signed element values, eight per output line. */
2462 fprintf (stream, " {");
2463 if (t.level3_size << t.p > 8)
2464 fprintf (stream, "\n ");
2465 for (i = 0; i < t.level3_size << t.p; i++)
2467 if (i > 0 && (i % 8) == 0)
2468 fprintf (stream, "\n ");
2469 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2470 if (i+1 < t.level3_size << t.p)
2471 fprintf (stream, ",");
2473 if (t.level3_size << t.p > 8)
2474 fprintf (stream, "\n ");
2475 fprintf (stream, " }\n");
2476 fprintf (stream, "};\n");
2478 if (ferror (stream) || fclose (stream))
2480 fprintf (stderr, "error writing to '%s'\n", filename);
2485 /* ========================================================================= */
2489 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  Each PROP_* value is used as a bit position:
   fill_properties sets bit (1ULL << value) in unicode_properties[ch], and
   the is_property_* predicates test that bit.
   NOTE(review): the enum opener, its closing brace and several members
   (for example PROP_WHITE_SPACE, which is referenced further down) are on
   lines not visible in this excerpt. */
2498 PROP_QUOTATION_MARK,
2499 PROP_TERMINAL_PUNCTUATION,
2502 PROP_ASCII_HEX_DIGIT,
2503 PROP_OTHER_ALPHABETIC,
2507 PROP_OTHER_LOWERCASE,
2508 PROP_OTHER_UPPERCASE,
2509 PROP_NONCHARACTER_CODE_POINT,
2510 PROP_OTHER_GRAPHEME_EXTEND,
2511 PROP_IDS_BINARY_OPERATOR,
2512 PROP_IDS_TRINARY_OPERATOR,
2514 PROP_UNIFIED_IDEOGRAPH,
2515 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2518 PROP_LOGICAL_ORDER_EXCEPTION,
2519 PROP_OTHER_ID_START,
2520 PROP_OTHER_ID_CONTINUE,
2522 PROP_VARIATION_SELECTOR,
2523 PROP_PATTERN_WHITE_SPACE,
2524 PROP_PATTERN_SYNTAX,
2525 /* DerivedCoreProperties.txt */
2534 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2535 PROP_GRAPHEME_EXTEND,
/* Property bit sets, indexed by code point (0 .. 0x10FFFF).  Bit N of
   unicode_properties[ch] is set when CH has the property whose PROP_*
   identifier has the value N. */
unsigned long long unicode_properties[0x110000];

/* Resets unicode_properties[] so that no code point has any property.
   Takes no arguments and returns nothing. */
static void
clear_properties (void)
{
  /* All-bits-zero is the value 0 for an unsigned integer type, so one
     memset over the whole array is equivalent to zeroing each element. */
  memset (unicode_properties, 0, sizeof (unicode_properties));
}
2550 /* Stores in unicode_properties[] the properties from the
2551 PropList.txt or DerivedCoreProperties.txt file. */
/* Reads PROPLIST_FILENAME line by line.  Each data line is either
   "XXXX..YYYY ; Name" or "XXXX ; Name" with hexadecimal code points; the
   bit for the named property is OR-ed into unicode_properties[] for every
   code point of the range.  Bits already set are kept, so the function can
   be called for several files in turn.
   NOTE(review): the handling of the error paths, of skipped lines and of
   the single code point case (where i2 must receive a value) is on lines
   not visible in this excerpt. */
2553 fill_properties (const char *proplist_filename)
2558 stream = fopen (proplist_filename, "r");
2561 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2568 unsigned int i1, i2;
2569 char padding[200+1];
2570 char propname[200+1];
2571 unsigned int propvalue;
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes the newline and any following whitespace. */
2573 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Test for empty lines and for comment lines starting with '#'. */
2576 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The two
   unbounded conversions cannot overflow padding or propname as long as buf
   holds at most 200 characters, which the read above guarantees -
   NOTE(review): assuming buf is declared with that size. */
2579 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2581 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2583 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line below expands to one "if (...) propvalue = ...; else", so
   the list forms a single if/else-if chain over the property name. */
2588 #define PROP(name,value) \
2589 if (strcmp (propname, name) == 0) propvalue = value; else
2591 PROP ("White_Space", PROP_WHITE_SPACE)
2592 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2593 PROP ("Join_Control", PROP_JOIN_CONTROL)
2594 PROP ("Dash", PROP_DASH)
2595 PROP ("Hyphen", PROP_HYPHEN)
2596 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2597 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2598 PROP ("Other_Math", PROP_OTHER_MATH)
2599 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2600 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2601 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2602 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2603 PROP ("Diacritic", PROP_DIACRITIC)
2604 PROP ("Extender", PROP_EXTENDER)
2605 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2606 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2607 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2608 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2609 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2610 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2611 PROP ("Radical", PROP_RADICAL)
2612 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2613 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2614 PROP ("Deprecated", PROP_DEPRECATED)
2615 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2616 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2617 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2618 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2619 PROP ("STerm", PROP_STERM)
2620 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2621 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2622 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2623 /* DerivedCoreProperties.txt */
2624 PROP ("Math", PROP_MATH)
2625 PROP ("Alphabetic", PROP_ALPHABETIC)
2626 PROP ("Lowercase", PROP_LOWERCASE)
2627 PROP ("Uppercase", PROP_UPPERCASE)
2628 PROP ("ID_Start", PROP_ID_START)
2629 PROP ("ID_Continue", PROP_ID_CONTINUE)
2630 PROP ("XID_Start", PROP_XID_START)
2631 PROP ("XID_Continue", PROP_XID_CONTINUE)
2632 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2633 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2634 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2635 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the chain: no known property name matched. */
2638 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and lie inside the Unicode code space. */
2642 if (!(i1 <= i2 && i2 < 0x110000))
2645 for (i = i1; i <= i2; i++)
2646 unicode_properties[i] |= 1ULL << propvalue;
2649 if (ferror (stream) || fclose (stream))
2651 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2656 /* Stores in array the given property from the Unicode 3.0 PropList.txt file. */
/* Fills ARRAY, indexed by code point, from the section of
   PROPLIST_FILENAME that follows the first line containing PROPERTY_NAME.
   Lines of that section are either "XXXX..YYYY" (a range) or "XXXX" (one
   code point), with 4-digit hexadecimal code points.
   NOTE(review): the values stored into ARRAY, the end-of-section test and
   the error exits are on lines not visible in this excerpt; presumably
   ARRAY is cleared first and listed code points are then marked. */
2659 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2665 for (i = 0; i < 0x110000; i++)
2668 stream = fopen (proplist_filename, "r");
2671 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2675 /* Search for the "Property dump for: ..." line. */
/* Lines of at most 100 characters are read until one contains
   PROPERTY_NAME as a substring. */
2678 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2680 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2684 while (strstr (buf, property_name) == NULL);
2688 unsigned int i1, i2;
2690 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* Range form: ".." directly after the first four characters. */
2694 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2696 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2698 fprintf (stderr, "parse error in property in '%s'\n",
/* Single code point form. */
2703 else if (strlen (buf) >= 4)
2705 if (sscanf (buf, "%4X", &i1) < 1)
2707 fprintf (stderr, "parse error in property in '%s'\n",
/* Lines shorter than four characters are rejected. */
2715 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and lie inside the Unicode code space. */
2719 if (!(i1 <= i2 && i2 < 0x110000))
2721 for (i = i1; i <= i2; i++)
2724 if (ferror (stream) || fclose (stream))
2726 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2731 /* Properties from Unicode 3.0 PropList.txt file. */
2733 /* The paired punctuation property from the PropList.txt file. */
/* One char per code point (0 .. 0x10FFFF).  fill_properties30 passes this
   array to fill_property30 with the heading "(Paired Punctuation)".  */
2734 char unicode_pairedpunctuation[0x110000];
2736 /* The left of pair property from the PropList.txt file. */
/* One char per code point (0 .. 0x10FFFF).  fill_properties30 passes this
   array to fill_property30 with the heading "(Left of Pair)".  */
2737 char unicode_leftofpair[0x110000];
2740 fill_properties30 (const char *proplist30_filename)
2742 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2743 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2746 /* ------------------------------------------------------------------------- */
2748 /* See PropList.txt, UCD.html. */
2750 is_property_white_space (unsigned int ch)
2752 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
/* is_property_alphabetic: cross-check of the Alphabetic property for CH.
   The visible code builds one value from the PROP_OTHER_ALPHABETIC bit plus
   a hard-coded list of code point ranges, reads a second value from the
   PROP_ALPHABETIC bit, and compares the two (result1 and result2).
   NOTE(review): the beginning of the || chain, the variable declarations,
   the action taken on a mismatch and the return statement are not visible
   in this listing.  */
2755 /* See Unicode 3.0 book, section 4.10,
2756 PropList.txt, UCD.html,
2757 DerivedCoreProperties.txt, UCD.html. */
2759 is_property_alphabetic (unsigned int ch)
2763 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2764 /* For some reason, the following are listed as having property
2765 Alphabetic but not as having property Other_Alphabetic. */
2766 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2767 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2768 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2769 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2770 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2771 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2772 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2773 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2774 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2775 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2776 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* Second value: the PROP_ALPHABETIC bit recorded for CH.  */
2778 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* The two derivations are expected to agree.  */
2780 if (result1 != result2)
2785 /* See PropList.txt, UCD.html. */
2787 is_property_other_alphabetic (unsigned int ch)
2789 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2792 /* See PropList.txt, UCD.html. */
2794 is_property_not_a_character (unsigned int ch)
2796 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
/* is_property_default_ignorable_code_point: cross-check of the
   Default_Ignorable_Code_Point property for CH.  One value is computed from
   general categories and PropList bits (the || chain below), the other is
   the PROP_DEFAULT_IGNORABLE_CODE_POINT bit; the two are then compared.
   NOTE(review): the declarations, the action taken on a mismatch and the
   return statement are not visible in this listing.  */
2799 /* See PropList.txt, UCD.html,
2800 DerivedCoreProperties.txt, UCD.html. */
2802 is_property_default_ignorable_code_point (unsigned int ch)
/* Category Cf, except U+FFF9..U+FFFB.  */
2805 (is_category_Cf (ch)
2806 && !(ch >= 0xFFF9 && ch <= 0xFFFB)) /* Annotations */
/* Categories Cc and Cs, unless the code point is white space.  */
2807 || ((is_category_Cc (ch) || is_category_Cs (ch))
2808 && !is_property_white_space (ch))
2809 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2810 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0)
2811 || is_property_not_a_character (ch);
/* Second value: the PROP_DEFAULT_IGNORABLE_CODE_POINT bit recorded for CH.  */
2813 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2815 if (result1 != result2)
2820 /* See PropList.txt, UCD.html. */
2822 is_property_other_default_ignorable_code_point (unsigned int ch)
2824 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2827 /* See PropList.txt, UCD.html. */
2829 is_property_deprecated (unsigned int ch)
2831 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2834 /* See PropList.txt, UCD.html. */
2836 is_property_logical_order_exception (unsigned int ch)
2838 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2841 /* See PropList.txt, UCD.html. */
2843 is_property_variation_selector (unsigned int ch)
2845 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Return true if CH lies in one of the three private use ranges
   U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Return true if is_category_Cn accepts CH and CH is not flagged by
   is_property_not_a_character.  See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
/* is_property_uppercase: cross-check of the Uppercase property for CH.
   Visible here: one value is an || chain ending in the
   PROP_OTHER_UPPERCASE bit, the other is the PROP_UPPERCASE bit, and the
   two (result1 and result2) are compared.
   NOTE(review): the beginning of the || chain, the declarations, the action
   taken on a mismatch and the return statement are not visible in this
   listing.  */
2865 /* See PropList.txt, UCD.html,
2866 DerivedCoreProperties.txt, UCD.html. */
2868 is_property_uppercase (unsigned int ch)
2872 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2874 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2876 if (result1 != result2)
2881 /* See PropList.txt, UCD.html. */
2883 is_property_other_uppercase (unsigned int ch)
2885 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* is_property_lowercase: cross-check of the Lowercase property for CH.
   Visible here: one value is an || chain ending in the
   PROP_OTHER_LOWERCASE bit, the other is the PROP_LOWERCASE bit, and the
   two (result1 and result2) are compared.
   NOTE(review): the beginning of the || chain, the declarations, the action
   taken on a mismatch and the return statement are not visible in this
   listing.  */
2888 /* See PropList.txt, UCD.html,
2889 DerivedCoreProperties.txt, UCD.html. */
2891 is_property_lowercase (unsigned int ch)
2895 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2897 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2899 if (result1 != result2)
2904 /* See PropList.txt, UCD.html. */
2906 is_property_other_lowercase (unsigned int ch)
2908 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Return the answer of is_category_Lt for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  bool in_category = is_category_Lt (ch);

  return in_category;
}
2918 /* See PropList.txt, UCD.html. */
2920 is_property_soft_dotted (unsigned int ch)
2922 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2925 /* See DerivedCoreProperties.txt, UCD.html. */
2927 is_property_id_start (unsigned int ch)
2929 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2932 /* See PropList.txt, UCD.html. */
2934 is_property_other_id_start (unsigned int ch)
2936 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2939 /* See DerivedCoreProperties.txt, UCD.html. */
2941 is_property_id_continue (unsigned int ch)
2943 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2946 /* See PropList.txt, UCD.html. */
2948 is_property_other_id_continue (unsigned int ch)
2950 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2953 /* See DerivedCoreProperties.txt, UCD.html. */
2955 is_property_xid_start (unsigned int ch)
2957 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2960 /* See DerivedCoreProperties.txt, UCD.html. */
2962 is_property_xid_continue (unsigned int ch)
2964 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2967 /* See PropList.txt, UCD.html. */
2969 is_property_pattern_white_space (unsigned int ch)
2971 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2974 /* See PropList.txt, UCD.html. */
2976 is_property_pattern_syntax (unsigned int ch)
2978 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2981 /* See PropList.txt, UCD.html. */
2983 is_property_join_control (unsigned int ch)
2985 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2988 /* See DerivedCoreProperties.txt, UCD.html. */
2990 is_property_grapheme_base (unsigned int ch)
2992 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2995 /* See DerivedCoreProperties.txt, UCD.html. */
2997 is_property_grapheme_extend (unsigned int ch)
2999 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3002 /* See PropList.txt, UCD.html. */
3004 is_property_other_grapheme_extend (unsigned int ch)
3006 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3009 /* See DerivedCoreProperties.txt, UCD.html. */
3011 is_property_grapheme_link (unsigned int ch)
3013 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3016 /* See PropList.txt, UCD.html. */
3018 is_property_bidi_control (unsigned int ch)
3020 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3023 /* See PropList-3.0.1.txt. */
3025 is_property_bidi_left_to_right (unsigned int ch)
3027 return (get_bidi_category (ch) == UC_BIDI_L);
3030 /* See PropList-3.0.1.txt. */
3032 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3034 return (get_bidi_category (ch) == UC_BIDI_R);
3037 /* See PropList-3.0.1.txt. */
3039 is_property_bidi_arabic_right_to_left (unsigned int ch)
3041 return (get_bidi_category (ch) == UC_BIDI_AL);
3044 /* See PropList-3.0.1.txt. */
3046 is_property_bidi_european_digit (unsigned int ch)
3048 return (get_bidi_category (ch) == UC_BIDI_EN);
3051 /* See PropList-3.0.1.txt. */
3053 is_property_bidi_eur_num_separator (unsigned int ch)
3055 return (get_bidi_category (ch) == UC_BIDI_ES);
3058 /* See PropList-3.0.1.txt. */
3060 is_property_bidi_eur_num_terminator (unsigned int ch)
3062 return (get_bidi_category (ch) == UC_BIDI_ET);
3065 /* See PropList-3.0.1.txt. */
3067 is_property_bidi_arabic_digit (unsigned int ch)
3069 return (get_bidi_category (ch) == UC_BIDI_AN);
3072 /* See PropList-3.0.1.txt. */
3074 is_property_bidi_common_separator (unsigned int ch)
3076 return (get_bidi_category (ch) == UC_BIDI_CS);
3079 /* See PropList-3.0.1.txt. */
3081 is_property_bidi_block_separator (unsigned int ch)
3083 return (get_bidi_category (ch) == UC_BIDI_B);
3086 /* See PropList-3.0.1.txt. */
3088 is_property_bidi_segment_separator (unsigned int ch)
3090 return (get_bidi_category (ch) == UC_BIDI_S);
3093 /* See PropList-3.0.1.txt. */
3095 is_property_bidi_whitespace (unsigned int ch)
3097 return (get_bidi_category (ch) == UC_BIDI_WS);
3100 /* See PropList-3.0.1.txt. */
3102 is_property_bidi_non_spacing_mark (unsigned int ch)
3104 return (get_bidi_category (ch) == UC_BIDI_NSM);
3107 /* See PropList-3.0.1.txt. */
3109 is_property_bidi_boundary_neutral (unsigned int ch)
3111 return (get_bidi_category (ch) == UC_BIDI_BN);
3114 /* See PropList-3.0.1.txt. */
3116 is_property_bidi_pdf (unsigned int ch)
3118 return (get_bidi_category (ch) == UC_BIDI_PDF);
3121 /* See PropList-3.0.1.txt. */
3123 is_property_bidi_embedding_or_override (unsigned int ch)
3125 int category = get_bidi_category (ch);
3126 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3127 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3130 /* See PropList-3.0.1.txt. */
3132 is_property_bidi_other_neutral (unsigned int ch)
3134 return (get_bidi_category (ch) == UC_BIDI_ON);
3137 /* See PropList.txt, UCD.html. */
3139 is_property_hex_digit (unsigned int ch)
3141 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3144 /* See PropList.txt, UCD.html. */
3146 is_property_ascii_hex_digit (unsigned int ch)
3148 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3151 /* See Unicode 3.0 book, section 4.10,
3152 PropList.txt, UCD.html. */
3154 is_property_ideographic (unsigned int ch)
3156 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3159 /* See PropList.txt, UCD.html. */
3161 is_property_unified_ideograph (unsigned int ch)
3163 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3166 /* See PropList.txt, UCD.html. */
3168 is_property_radical (unsigned int ch)
3170 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3173 /* See PropList.txt, UCD.html. */
3175 is_property_ids_binary_operator (unsigned int ch)
3177 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3180 /* See PropList.txt, UCD.html. */
3182 is_property_ids_trinary_operator (unsigned int ch)
3184 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3187 /* See PropList-3.0.1.txt. */
3189 is_property_zero_width (unsigned int ch)
3191 return is_category_Cf (ch)
3192 || (unicode_attributes[ch].name != NULL
3193 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Return the answer of is_category_Zs for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
/* Return true if CH is one of the sixteen code points enumerated below,
   false otherwise.  See PropList-3.0.1.txt.
   NOTE(review): the original inline comment ("This is exactly the set of
   characters having line breaking ...") is cut off in the listing this was
   restored from; presumably it names a line breaking class - confirm.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
/* is_property_iso_control: cross-check for CH.  Visible here: one value
   tests whether the character name in unicode_attributes is exactly
   "<control>", the other is the answer of is_category_Cc, and the two
   (result1 and result2) are compared.
   NOTE(review): the declarations, the action taken on a mismatch and the
   return statement are not visible in this listing.  */
3227 /* See PropList-3.0.1.txt. */
3229 is_property_iso_control (unsigned int ch)
3232 (unicode_attributes[ch].name != NULL
3233 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3235 is_category_Cc (ch);
3237 if (result1 != result2)
/* is_property_format_control: conjunction of conditions on CH.  Visible
   here: category Cf, bidi category UC_BIDI_BN, and not a join control.
   NOTE(review): the expression is cut off in this listing - its closing
   part (and any further condition) is not visible.  */
3242 /* See PropList-3.0.1.txt. */
3244 is_property_format_control (unsigned int ch)
3246 return (is_category_Cf (ch)
3247 && get_bidi_category (ch) == UC_BIDI_BN
3248 && !is_property_join_control (ch)
3252 /* See PropList.txt, UCD.html. */
3254 is_property_dash (unsigned int ch)
3256 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3259 /* See PropList.txt, UCD.html. */
3261 is_property_hyphen (unsigned int ch)
3263 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Return the answer of is_category_P for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
/* Return the answer of is_category_Zl for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
/* Return the answer of is_category_Zp for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3287 /* See PropList.txt, UCD.html. */
3289 is_property_quotation_mark (unsigned int ch)
3291 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3294 /* See PropList.txt, UCD.html. */
3296 is_property_sentence_terminal (unsigned int ch)
3298 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3301 /* See PropList.txt, UCD.html. */
3303 is_property_terminal_punctuation (unsigned int ch)
3305 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Return the answer of is_category_Sc for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
/* is_property_math: cross-check of the Math property for CH.  Visible
   here: one value is an || chain ending in the PROP_OTHER_MATH bit, the
   other is the PROP_MATH bit, and the two (result1 and result2) are
   compared.
   NOTE(review): the beginning of the || chain, the declarations, the action
   taken on a mismatch and the return statement are not visible in this
   listing.  */
3315 /* See Unicode 3.0 book, section 4.9,
3316 PropList.txt, UCD.html,
3317 DerivedCoreProperties.txt, UCD.html. */
3319 is_property_math (unsigned int ch)
3323 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3325 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3327 if (result1 != result2)
3332 /* See PropList.txt, UCD.html. */
3334 is_property_other_math (unsigned int ch)
3336 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3339 /* See PropList-3.0.1.txt. */
3341 is_property_paired_punctuation (unsigned int ch)
3343 return unicode_pairedpunctuation[ch];
3346 /* See PropList-3.0.1.txt. */
3348 is_property_left_of_pair (unsigned int ch)
3350 return unicode_leftofpair[ch];
3353 /* See PropList-3.0.1.txt. */
3355 is_property_combining (unsigned int ch)
3357 return (unicode_attributes[ch].name != NULL
3358 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3359 || is_category_Mc (ch)
3360 || is_category_Me (ch)
3361 || is_category_Mn (ch)));
/* Disabled code: the directive below starts an "#if 0" region, annotated as
   duplicating is_property_bidi_non_spacing_mark.  The visible test requires
   a named entry in unicode_attributes and bidi category UC_BIDI_NSM.
   NOTE(review): the end of the function and of the "#if 0" region are not
   visible in this listing.  */
3364 #if 0 /* same as is_property_bidi_non_spacing_mark */
3365 /* See PropList-3.0.1.txt. */
3367 is_property_non_spacing (unsigned int ch)
3369 return (unicode_attributes[ch].name != NULL
3370 && get_bidi_category (ch) == UC_BIDI_NSM);
3374 /* See PropList-3.0.1.txt. */
3376 is_property_composite (unsigned int ch)
3378 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3379 logical in some sense. */
3380 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3382 if (unicode_attributes[ch].name != NULL
3383 && unicode_attributes[ch].decomposition != NULL)
3385 /* Test whether the decomposition contains more than one character,
3386 and the first is not a space. */
3387 const char *decomp = unicode_attributes[ch].decomposition;
3388 if (decomp[0] == '<')
3390 decomp = strchr (decomp, '>') + 1;
3391 if (decomp[0] == ' ')
3394 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Return the answer of is_category_Nd for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3406 /* See PropList-3.0.1.txt. */
3408 is_property_numeric (unsigned int ch)
3410 return ((get_numeric_value (ch)).denominator > 0)
3411 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3412 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3415 /* See PropList.txt, UCD.html. */
3417 is_property_diacritic (unsigned int ch)
3419 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3422 /* See PropList.txt, UCD.html. */
3424 is_property_extender (unsigned int ch)
3426 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
/* is_property_ignorable_control: visible here is a test that accepts CH
   when it is in category Cc with bidi category UC_BIDI_BN, or in category
   Cf.
   NOTE(review): the expression is cut off in this listing - its closing
   part (and any further condition) is not visible.  */
3429 /* See PropList-3.0.1.txt. */
3431 is_property_ignorable_control (unsigned int ch)
3433 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3434 || is_category_Cf (ch))
3438 /* ------------------------------------------------------------------------- */
/* output_properties: emit the output files for every property predicate.
   VERSION is passed through to output_predicate.  For each property P the
   PROPERTY macro makes three calls, all using the predicate is_property_P:
     - debug_output_predicate with file name "pr_P.txt",
     - output_predicate_test with file name "test-pr_P.c" and the expression
       string "uc_is_property_P (c)",
     - output_predicate with file name "pr_P.h", the name "u_property_P" and
       the comment string "Properties".
   NOTE(review): some PROPERTY(...) lines and the end of the function are
   not visible in this listing.  */
3440 /* Output all properties. */
3442 output_properties (const char *version)
3444 #define PROPERTY(P) \
3445 debug_output_predicate ("pr_" #P ".txt", is_property_ ## P); \
3446 output_predicate_test ("test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3447 output_predicate ("pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
/* One macro invocation per property; no semicolon is needed because the
   macro expansion already ends with one.  */
3448 PROPERTY(white_space)
3449 PROPERTY(alphabetic)
3450 PROPERTY(other_alphabetic)
3451 PROPERTY(not_a_character)
3452 PROPERTY(default_ignorable_code_point)
3453 PROPERTY(other_default_ignorable_code_point)
3454 PROPERTY(deprecated)
3455 PROPERTY(logical_order_exception)
3456 PROPERTY(variation_selector)
3457 PROPERTY(private_use)
3458 PROPERTY(unassigned_code_value)
3460 PROPERTY(other_uppercase)
3462 PROPERTY(other_lowercase)
3464 PROPERTY(soft_dotted)
3466 PROPERTY(other_id_start)
3467 PROPERTY(id_continue)
3468 PROPERTY(other_id_continue)
3470 PROPERTY(xid_continue)
3471 PROPERTY(pattern_white_space)
3472 PROPERTY(pattern_syntax)
3473 PROPERTY(join_control)
3474 PROPERTY(grapheme_base)
3475 PROPERTY(grapheme_extend)
3476 PROPERTY(other_grapheme_extend)
3477 PROPERTY(grapheme_link)
3478 PROPERTY(bidi_control)
3479 PROPERTY(bidi_left_to_right)
3480 PROPERTY(bidi_hebrew_right_to_left)
3481 PROPERTY(bidi_arabic_right_to_left)
3482 PROPERTY(bidi_european_digit)
3483 PROPERTY(bidi_eur_num_separator)
3484 PROPERTY(bidi_eur_num_terminator)
3485 PROPERTY(bidi_arabic_digit)
3486 PROPERTY(bidi_common_separator)
3487 PROPERTY(bidi_block_separator)
3488 PROPERTY(bidi_segment_separator)
3489 PROPERTY(bidi_whitespace)
3490 PROPERTY(bidi_non_spacing_mark)
3491 PROPERTY(bidi_boundary_neutral)
3493 PROPERTY(bidi_embedding_or_override)
3494 PROPERTY(bidi_other_neutral)
3496 PROPERTY(ascii_hex_digit)
3497 PROPERTY(ideographic)
3498 PROPERTY(unified_ideograph)
3500 PROPERTY(ids_binary_operator)
3501 PROPERTY(ids_trinary_operator)
3502 PROPERTY(zero_width)
3505 PROPERTY(iso_control)
3506 PROPERTY(format_control)
3509 PROPERTY(punctuation)
3510 PROPERTY(line_separator)
3511 PROPERTY(paragraph_separator)
3512 PROPERTY(quotation_mark)
3513 PROPERTY(sentence_terminal)
3514 PROPERTY(terminal_punctuation)
3515 PROPERTY(currency_symbol)
3517 PROPERTY(other_math)
3518 PROPERTY(paired_punctuation)
3519 PROPERTY(left_of_pair)
3522 PROPERTY(decimal_digit)
3526 PROPERTY(ignorable_control)
3530 /* ========================================================================= */
/* Names of the scripts seen so far; fill_scripts stores strdup'ed copies
   here.  At most 256 entries.  */
3534 static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
3535 static unsigned int numscripts;
/* Script index for every code point (0 .. 0x10FFFF).  fill_scripts first
   sets every entry to (uint8_t)~(uint8_t)0, which marks "no script".  */
3537 static uint8_t unicode_scripts[0x110000];
/* fill_scripts: read SCRIPTS_FILENAME and record a script index for each
   code point in unicode_scripts, collecting the distinct script names in
   scripts[] / numscripts.  Problems are reported on stderr.
   NOTE(review): this listing omits many short lines of the function (return
   type, declarations, braces, loop control, the statements following each
   diagnostic); the notes below cover only the visible statements.  */
3540 fill_scripts (const char *scripts_filename)
3545 stream = fopen (scripts_filename, "r");
3548 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Start with every code point marked as having no script.  */
3554 for (i = 0; i < 0x110000; i++)
3555 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3560 unsigned int i1, i2;
3561 char padding[200+1];
3562 char scriptname[200+1];
/* Lines are read with a field width of 200; empty lines and lines starting
   with '#' are recognized by the test that follows.  */
3565 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3568 if (buf[0] == '\0' || buf[0] == '#')
/* First try the range form "XXXX..YYYY ; Name", then the single code point
   form "XXXX ; Name".  The name ends at the first space.  */
3571 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3573 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3575 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the scripts already known, newest first.  */
3585 for (script = numscripts - 1; script >= 0; script--)
3586 if (strcmp (scripts[script], scriptname) == 0)
/* Register a new script name.
   NOTE(review): the result of strdup is not checked in the visible code.
   NOTE(review): the limit test compares numscripts with 256, while the
   value 255 is also the "no script" marker of unicode_scripts - confirm
   that a script index of 255 can never be stored.  */
3590 scripts[numscripts] = strdup (scriptname);
3591 script = numscripts;
3593 if (numscripts == 256)
/* Assign the script to the whole range; a code point that already has a
   script produces a diagnostic and is then overwritten.  */
3597 for (i = i1; i <= i2; i++)
3599 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3600 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3601 unicode_scripts[i] = script;
3605 if (ferror (stream) || fclose (stream))
3607 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3612 /* Construction of sparse 3-level tables. */
/* Configuration macros: the table type is named script_table, its elements
   are uint8_t, and the default element is (uint8_t)~(uint8_t)0.  xmalloc
   and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): presumably these are consumed by a table template included
   right after them (output_scripts uses struct script_table and
   script_table_init/_add/_finalize) - the include is not visible here.  */
3613 #define TABLE script_table
3614 #define ELEMENT uint8_t
3615 #define DEFAULT (uint8_t)~(uint8_t)0
3616 #define xmalloc malloc
3617 #define xrealloc realloc
/* output_scripts: write the file "scripts.h".  VERSION is printed in the
   header comment.  The visible code emits, in order: one uc_interval_t
   array per script, the array of uc_script_t descriptors, five
   script_header_N defines, and the u_script structure holding the three
   levels of a script_table built from unicode_scripts.
   NOTE(review): this listing omits many short lines of the function (return
   type, declarations, braces, else branches, some loop bodies); the notes
   below cover only the visible statements.  */
3621 output_scripts (const char *version)
3623 const char *filename = "scripts.h";
3625 unsigned int ch, s, i;
3626 struct script_table t;
3627 unsigned int level1_offset, level2_offset, level3_offset;
3631 const char *lowercase_name;
3634 scriptinfo_t scriptinfo[256];
3636 stream = fopen (filename, "w");
3639 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3643 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3644 fprintf (stream, "/* Unicode scripts. */\n");
3645 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Derive, for every script, a copy of its name in which the characters
   'A'..'Z' are handled specially (the conversion statement itself is not
   visible here); the copy is stored as lowercase_name.  */
3648 for (s = 0; s < numscripts; s++)
3650 char *lcp = strdup (scripts[s]);
3653 for (cp = lcp; *cp != '\0'; cp++)
3654 if (*cp >= 'A' && *cp <= 'Z')
3657 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script, scanning all code points and
   extending each run while the next code point has the same script.  */
3660 for (s = 0; s < numscripts; s++)
3662 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3663 scriptinfo[s].lowercase_name);
3664 fprintf (stream, "{\n");
3666 for (ch = 0; ch < 0x110000; ch++)
3667 if (unicode_scripts[ch] == s)
3673 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3678 fprintf (stream, ",\n");
3680 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3682 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3686 fprintf (stream, "\n");
3687 fprintf (stream, "};\n");
/* Emit the descriptor array: interval count, interval array and the
   original script name for each script.  */
3690 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3691 fprintf (stream, "{\n");
3692 for (s = 0; s < numscripts; s++)
3694 fprintf (stream, " {\n");
3695 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3696 scriptinfo[s].lowercase_name);
3697 fprintf (stream, " script_%s_intervals,\n",
3698 scriptinfo[s].lowercase_name);
3699 fprintf (stream, " \"%s\"\n", scripts[s]);
3700 fprintf (stream, " }");
3701 if (s+1 < numscripts)
3702 fprintf (stream, ",");
3703 fprintf (stream, "\n");
3705 fprintf (stream, "};\n");
/* Build the table: only code points that have a script are added.  */
3709 script_table_init (&t);
3711 for (ch = 0; ch < 0x110000; ch++)
3713 unsigned int s = unicode_scripts[ch];
3714 if (s != (uint8_t)~(uint8_t)0)
3715 script_table_add (&t, ch, s);
3718 script_table_finalize (&t);
3720 /* Offsets in t.result, in memory of this process. */
/* The result starts with five uint32_t header words, followed by the
   level 1, level 2 and level 3 data.  */
3722 5 * sizeof (uint32_t);
3724 5 * sizeof (uint32_t)
3725 + t.level1_size * sizeof (uint32_t);
3727 5 * sizeof (uint32_t)
3728 + t.level1_size * sizeof (uint32_t)
3729 + (t.level2_size << t.q) * sizeof (uint32_t);
3731 for (i = 0; i < 5; i++)
3732 fprintf (stream, "#define script_header_%d %d\n", i,
3733 ((uint32_t *) t.result)[i]);
3734 fprintf (stream, "static const\n");
3735 fprintf (stream, "struct\n");
3736 fprintf (stream, " {\n");
3737 fprintf (stream, " int level1[%d];\n", t.level1_size);
3738 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
3739 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p);
3740 fprintf (stream, " }\n");
3741 fprintf (stream, "u_script =\n");
3742 fprintf (stream, "{\n");
/* Level 1: eight values per output line.  */
3743 fprintf (stream, " {");
3744 if (t.level1_size > 8)
3745 fprintf (stream, "\n ");
3746 for (i = 0; i < t.level1_size; i++)
3749 if (i > 0 && (i % 8) == 0)
3750 fprintf (stream, "\n ");
3751 offset = ((uint32_t *) (t.result + level1_offset))[i];
3753 fprintf (stream, " %5d", -1);
/* NOTE(review): the argument below is divided by a sizeof expression, so it
   has an unsigned size type, but it is printed with %5d - confirm the
   format/argument pairing, in particular on 64-bit targets.  */
3755 fprintf (stream, " %5d",
3756 (offset - level2_offset) / sizeof (uint32_t));
3757 if (i+1 < t.level1_size)
3758 fprintf (stream, ",");
3760 if (t.level1_size > 8)
3761 fprintf (stream, "\n ");
3762 fprintf (stream, " },\n");
/* Level 2: same layout as level 1.  */
3763 fprintf (stream, " {");
3764 if (t.level2_size << t.q > 8)
3765 fprintf (stream, "\n ");
3766 for (i = 0; i < t.level2_size << t.q; i++)
3769 if (i > 0 && (i % 8) == 0)
3770 fprintf (stream, "\n ");
3771 offset = ((uint32_t *) (t.result + level2_offset))[i];
3773 fprintf (stream, " %5d", -1);
/* NOTE(review): same %5d versus sizeof-typed argument question as for
   level 1.  */
3775 fprintf (stream, " %5d",
3776 (offset - level3_offset) / sizeof (uint8_t));
3777 if (i+1 < t.level2_size << t.q)
3778 fprintf (stream, ",");
3780 if (t.level2_size << t.q > 8)
3781 fprintf (stream, "\n ");
3782 fprintf (stream, " },\n");
/* Level 3: the uint8_t script indices themselves.  */
3783 fprintf (stream, " {");
3784 if (t.level3_size << t.p > 8)
3785 fprintf (stream, "\n ");
3786 for (i = 0; i < t.level3_size << t.p; i++)
3788 if (i > 0 && (i % 8) == 0)
3789 fprintf (stream, "\n ");
3790 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3791 if (i+1 < t.level3_size << t.p)
3792 fprintf (stream, ",");
3794 if (t.level3_size << t.p > 8)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " }\n");
3797 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
3799 if (ferror (stream) || fclose (stream))
3801 fprintf (stderr, "error writing to '%s'\n", filename);
/* output_scripts_byname: write the file "scripts_byname.gperf", a gperf
   input that maps each script name to its index in scripts[].  VERSION is
   printed in the header comment.
   NOTE(review): the return type, declarations, braces and the statements
   following each diagnostic are not visible in this listing.  */
3807 output_scripts_byname (const char *version)
3809 const char *filename = "scripts_byname.gperf";
3813 stream = fopen (filename, "w");
3816 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3820 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3821 fprintf (stream, "/* Unicode scripts. */\n");
3822 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Declarations section of the gperf input ("%%" in the format strings
   prints a single '%').  */
3824 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3825 fprintf (stream, "%%struct-type\n");
3826 fprintf (stream, "%%language=ANSI-C\n");
3827 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3828 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3829 fprintf (stream, "%%readonly-tables\n");
3830 fprintf (stream, "%%global-table\n");
3831 fprintf (stream, "%%define word-array-name script_names\n");
3832 fprintf (stream, "%%%%\n");
/* Keyword section: one "name, index" line per script.  */
3833 for (s = 0; s < numscripts; s++)
3834 fprintf (stream, "%s, %u\n", scripts[s], s);
3836 if (ferror (stream) || fclose (stream))
3838 fprintf (stderr, "error writing to '%s'\n", filename);
3843 /* ========================================================================= */
/* A block: first code point, last code point, and block name.
   NOTE(review): the line that names this typedef is not visible in this
   listing; the declaration of blocks[] below uses the name block_t.  */
3847 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* The blocks read by fill_blocks, at most 256, in ascending order.  */
3849 static block_t blocks[256];
/* Number of entries of blocks[] in use.  */
3850 static unsigned int numblocks;
/* fill_blocks: read BLOCKS_FILENAME and store each block's range and name
   in blocks[] / numblocks.  Problems are reported on stderr.
   NOTE(review): this listing omits many short lines of the function (return
   type, declarations, braces, loop control, the statements following each
   test or diagnostic); the notes below cover only the visible
   statements.  */
3853 fill_blocks (const char *blocks_filename)
3857 stream = fopen (blocks_filename, "r");
3860 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3867 unsigned int i1, i2;
3868 char padding[200+1];
3869 char blockname[200+1];
/* Lines are read with a field width of 200; empty lines and lines starting
   with '#' are recognized by the test that follows.  */
3871 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3874 if (buf[0] == '\0' || buf[0] == '#')
/* Expected form: "XXXX..YYYY; Name"; the name extends up to a carriage
   return or the end of the line.  */
3877 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3879 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* NOTE(review): the result of strdup is not checked in the visible code.  */
3882 blocks[numblocks].start = i1;
3883 blocks[numblocks].end = i2;
3884 blocks[numblocks].name = strdup (blockname);
3885 /* It must be sorted. */
/* Each block must start after the end of the previous one.  */
3886 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3889 if (numblocks == 256)
3893 if (ferror (stream) || fclose (stream))
3895 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
/* block_first_index: binary search over blocks[0 .. numblocks-1], comparing
   each block's end with CH.
   NOTE(review): the return type, the loop statement, both branch bodies and
   the return statement are not visible in this listing.  The two lines that
   end in a comment terminator are the tail of an invariant comment whose
   opening line is also missing.  */
3900 /* Return the smallest block index among the blocks for characters >= ch. */
3902 block_first_index (unsigned int ch)
3904 /* Binary search. */
3905 unsigned int lo = 0;
3906 unsigned int hi = numblocks;
3908 All blocks[i], i < lo, have blocks[i].end < ch,
3909 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3912 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3913 if (blocks[mid].end < ch)
/* block_last_index: binary search over blocks[0 .. numblocks-1], comparing
   each block's start with CH.
   NOTE(review): the original header comment below is cut off after its
   first line, and the return type, the loop statement, both branch bodies
   and the return statement are not visible in this listing.  The two lines
   that end in a comment terminator are the tail of an invariant comment
   whose opening line is also missing.  */
3921 /* Return the largest block index among the blocks for characters <= ch,
3924 block_last_index (unsigned int ch)
3926 /* Binary search. */
3927 unsigned int lo = 0;
3928 unsigned int hi = numblocks;
3930 All blocks[i], i < lo, have blocks[i].start <= ch,
3931 all blocks[i], i >= hi, have blocks[i].start > ch. */
3934 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3935 if (blocks[mid].start <= ch)
/* output_blocks: write the file "blocks.h".  VERSION is printed in the
   header comment.  The visible code emits the array of uc_block_t entries,
   a first-level lookup table with one (first index, last index) pair per
   256 code points below the threshold 0x30000, and the index range of the
   blocks at or above the threshold.
   NOTE(review): the return type, declarations, braces and the statements
   following each diagnostic are not visible in this listing.  */
3944 output_blocks (const char *version)
3946 const char *filename = "blocks.h";
3947 const unsigned int shift = 8; /* bits to shift away for array access */
3948 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3953 stream = fopen (filename, "w");
3956 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3960 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3961 fprintf (stream, "/* Unicode blocks. */\n");
3962 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* One initializer per block: start, end, name.  */
3965 fprintf (stream, "static const uc_block_t blocks[] =\n");
3966 fprintf (stream, "{\n");
3967 for (i = 0; i < numblocks; i++)
3969 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3970 blocks[i].end, blocks[i].name);
3971 if (i+1 < numblocks)
3972 fprintf (stream, ",");
3973 fprintf (stream, "\n");
3975 fprintf (stream, "};\n");
3976 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3977 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
/* The level 1 table is declared with uint8_t elements, two per slot.  */
3978 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3979 threshold >> shift);
3980 fprintf (stream, "{\n");
/* For each slot i1, look up the block index range covering the code points
   i1 << shift up to ((i1 + 1) << shift) - 1.  */
3981 for (i1 = 0; i1 < (threshold >> shift); i1++)
3983 unsigned int first_index = block_first_index (i1 << shift);
3984 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3985 fprintf (stream, " %3d, %3d", first_index, last_index);
3986 if (i1+1 < (threshold >> shift))
3987 fprintf (stream, ",");
3988 fprintf (stream, "\n");
3990 fprintf (stream, "};\n");
/* Index range for all code points from the threshold up to 0x10FFFF.  */
3991 fprintf (stream, "#define blocks_upper_first_index %d\n",
3992 block_first_index (threshold));
3993 fprintf (stream, "#define blocks_upper_last_index %d\n",
3994 block_last_index (0x10FFFF));
3996 if (ferror (stream) || fclose (stream))
3998 fprintf (stderr, "error writing to '%s'\n", filename);
4003 /* ========================================================================= */
4005 /* C and Java syntax. */
/* Classification of a character with respect to identifier syntax.
   c_ident_category returns UC_IDENTIFIER_VALID for '0'..'9' and
   UC_IDENTIFIER_START for ASCII letters and '_'.
   NOTE(review): the lines that open and close the enclosing declaration are
   not visible in this listing.  */
4009 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4010 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4011 UC_IDENTIFIER_INVALID, /* not valid */
4012 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* Return true if CH is one of the white-space characters space, horizontal
   tab, new-line ('\n' or '\r'), vertical tab or form-feed.
   ISO C 99 section 6.4.(3).  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':  /* space */
    case '\t': /* horizontal tab */
    case '\n': /* new-line */
    case '\r': /* new-line */
    case '\v': /* vertical tab */
    case '\f': /* form-feed */
      return true;
    default:
      return false;
    }
}
4026 /* ISO C 99 section 6.4.2.1 and appendix D. */
4028 c_ident_category (unsigned int ch)
4030 /* Section 6.4.2.1. */
4031 if (ch >= '0' && ch <= '9')
4032 return UC_IDENTIFIER_VALID;
4033 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4034 return UC_IDENTIFIER_START;
4040 || (ch >= 0x00C0 && ch <= 0x00D6)
4041 || (ch >= 0x00D8 && ch <= 0x00F6)
4042 || (ch >= 0x00F8 && ch <= 0x01F5)
4043 || (ch >= 0x01FA && ch <= 0x0217)
4044 || (ch >= 0x0250 && ch <= 0x02A8)
4045 || (ch >= 0x1E00 && ch <= 0x1E9B)
4046 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4050 || (ch >= 0x0388 && ch <= 0x038A)
4052 || (ch >= 0x038E && ch <= 0x03A1)
4053 || (ch >= 0x03A3 && ch <= 0x03CE)
4054 || (ch >= 0x03D0 && ch <= 0x03D6)
4059 || (ch >= 0x03E2 && ch <= 0x03F3)
4060 || (ch >= 0x1F00 && ch <= 0x1F15)
4061 || (ch >= 0x1F18 && ch <= 0x1F1D)
4062 || (ch >= 0x1F20 && ch <= 0x1F45)
4063 || (ch >= 0x1F48 && ch <= 0x1F4D)
4064 || (ch >= 0x1F50 && ch <= 0x1F57)
4068 || (ch >= 0x1F5F && ch <= 0x1F7D)
4069 || (ch >= 0x1F80 && ch <= 0x1FB4)
4070 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4071 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4072 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4073 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4074 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4075 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4076 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4077 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4079 || (ch >= 0x0401 && ch <= 0x040C)
4080 || (ch >= 0x040E && ch <= 0x044F)
4081 || (ch >= 0x0451 && ch <= 0x045C)
4082 || (ch >= 0x045E && ch <= 0x0481)
4083 || (ch >= 0x0490 && ch <= 0x04C4)
4084 || (ch >= 0x04C7 && ch <= 0x04C8)
4085 || (ch >= 0x04CB && ch <= 0x04CC)
4086 || (ch >= 0x04D0 && ch <= 0x04EB)
4087 || (ch >= 0x04EE && ch <= 0x04F5)
4088 || (ch >= 0x04F8 && ch <= 0x04F9)
4090 || (ch >= 0x0531 && ch <= 0x0556)
4091 || (ch >= 0x0561 && ch <= 0x0587)
4093 || (ch >= 0x05B0 && ch <= 0x05B9)
4094 || (ch >= 0x05BB && ch <= 0x05BD)
4096 || (ch >= 0x05C1 && ch <= 0x05C2)
4097 || (ch >= 0x05D0 && ch <= 0x05EA)
4098 || (ch >= 0x05F0 && ch <= 0x05F2)
4100 || (ch >= 0x0621 && ch <= 0x063A)
4101 || (ch >= 0x0640 && ch <= 0x0652)
4102 || (ch >= 0x0670 && ch <= 0x06B7)
4103 || (ch >= 0x06BA && ch <= 0x06BE)
4104 || (ch >= 0x06C0 && ch <= 0x06CE)
4105 || (ch >= 0x06D0 && ch <= 0x06DC)
4106 || (ch >= 0x06E5 && ch <= 0x06E8)
4107 || (ch >= 0x06EA && ch <= 0x06ED)
4109 || (ch >= 0x0901 && ch <= 0x0903)
4110 || (ch >= 0x0905 && ch <= 0x0939)
4111 || (ch >= 0x093E && ch <= 0x094D)
4112 || (ch >= 0x0950 && ch <= 0x0952)
4113 || (ch >= 0x0958 && ch <= 0x0963)
4115 || (ch >= 0x0981 && ch <= 0x0983)
4116 || (ch >= 0x0985 && ch <= 0x098C)
4117 || (ch >= 0x098F && ch <= 0x0990)
4118 || (ch >= 0x0993 && ch <= 0x09A8)
4119 || (ch >= 0x09AA && ch <= 0x09B0)
4121 || (ch >= 0x09B6 && ch <= 0x09B9)
4122 || (ch >= 0x09BE && ch <= 0x09C4)
4123 || (ch >= 0x09C7 && ch <= 0x09C8)
4124 || (ch >= 0x09CB && ch <= 0x09CD)
4125 || (ch >= 0x09DC && ch <= 0x09DD)
4126 || (ch >= 0x09DF && ch <= 0x09E3)
4127 || (ch >= 0x09F0 && ch <= 0x09F1)
4130 || (ch >= 0x0A05 && ch <= 0x0A0A)
4131 || (ch >= 0x0A0F && ch <= 0x0A10)
4132 || (ch >= 0x0A13 && ch <= 0x0A28)
4133 || (ch >= 0x0A2A && ch <= 0x0A30)
4134 || (ch >= 0x0A32 && ch <= 0x0A33)
4135 || (ch >= 0x0A35 && ch <= 0x0A36)
4136 || (ch >= 0x0A38 && ch <= 0x0A39)
4137 || (ch >= 0x0A3E && ch <= 0x0A42)
4138 || (ch >= 0x0A47 && ch <= 0x0A48)
4139 || (ch >= 0x0A4B && ch <= 0x0A4D)
4140 || (ch >= 0x0A59 && ch <= 0x0A5C)
4144 || (ch >= 0x0A81 && ch <= 0x0A83)
4145 || (ch >= 0x0A85 && ch <= 0x0A8B)
4147 || (ch >= 0x0A8F && ch <= 0x0A91)
4148 || (ch >= 0x0A93 && ch <= 0x0AA8)
4149 || (ch >= 0x0AAA && ch <= 0x0AB0)
4150 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4151 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4152 || (ch >= 0x0ABD && ch <= 0x0AC5)
4153 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4154 || (ch >= 0x0ACB && ch <= 0x0ACD)
4158 || (ch >= 0x0B01 && ch <= 0x0B03)
4159 || (ch >= 0x0B05 && ch <= 0x0B0C)
4160 || (ch >= 0x0B0F && ch <= 0x0B10)
4161 || (ch >= 0x0B13 && ch <= 0x0B28)
4162 || (ch >= 0x0B2A && ch <= 0x0B30)
4163 || (ch >= 0x0B32 && ch <= 0x0B33)
4164 || (ch >= 0x0B36 && ch <= 0x0B39)
4165 || (ch >= 0x0B3E && ch <= 0x0B43)
4166 || (ch >= 0x0B47 && ch <= 0x0B48)
4167 || (ch >= 0x0B4B && ch <= 0x0B4D)
4168 || (ch >= 0x0B5C && ch <= 0x0B5D)
4169 || (ch >= 0x0B5F && ch <= 0x0B61)
4171 || (ch >= 0x0B82 && ch <= 0x0B83)
4172 || (ch >= 0x0B85 && ch <= 0x0B8A)
4173 || (ch >= 0x0B8E && ch <= 0x0B90)
4174 || (ch >= 0x0B92 && ch <= 0x0B95)
4175 || (ch >= 0x0B99 && ch <= 0x0B9A)
4177 || (ch >= 0x0B9E && ch <= 0x0B9F)
4178 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4179 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4180 || (ch >= 0x0BAE && ch <= 0x0BB5)
4181 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4182 || (ch >= 0x0BBE && ch <= 0x0BC2)
4183 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4184 || (ch >= 0x0BCA && ch <= 0x0BCD)
4186 || (ch >= 0x0C01 && ch <= 0x0C03)
4187 || (ch >= 0x0C05 && ch <= 0x0C0C)
4188 || (ch >= 0x0C0E && ch <= 0x0C10)
4189 || (ch >= 0x0C12 && ch <= 0x0C28)
4190 || (ch >= 0x0C2A && ch <= 0x0C33)
4191 || (ch >= 0x0C35 && ch <= 0x0C39)
4192 || (ch >= 0x0C3E && ch <= 0x0C44)
4193 || (ch >= 0x0C46 && ch <= 0x0C48)
4194 || (ch >= 0x0C4A && ch <= 0x0C4D)
4195 || (ch >= 0x0C60 && ch <= 0x0C61)
4197 || (ch >= 0x0C82 && ch <= 0x0C83)
4198 || (ch >= 0x0C85 && ch <= 0x0C8C)
4199 || (ch >= 0x0C8E && ch <= 0x0C90)
4200 || (ch >= 0x0C92 && ch <= 0x0CA8)
4201 || (ch >= 0x0CAA && ch <= 0x0CB3)
4202 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4203 || (ch >= 0x0CBE && ch <= 0x0CC4)
4204 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4205 || (ch >= 0x0CCA && ch <= 0x0CCD)
4207 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4209 || (ch >= 0x0D02 && ch <= 0x0D03)
4210 || (ch >= 0x0D05 && ch <= 0x0D0C)
4211 || (ch >= 0x0D0E && ch <= 0x0D10)
4212 || (ch >= 0x0D12 && ch <= 0x0D28)
4213 || (ch >= 0x0D2A && ch <= 0x0D39)
4214 || (ch >= 0x0D3E && ch <= 0x0D43)
4215 || (ch >= 0x0D46 && ch <= 0x0D48)
4216 || (ch >= 0x0D4A && ch <= 0x0D4D)
4217 || (ch >= 0x0D60 && ch <= 0x0D61)
4219 || (ch >= 0x0E01 && ch <= 0x0E3A)
4220 || (ch >= 0x0E40 && ch <= 0x0E5B)
4222 || (ch >= 0x0E81 && ch <= 0x0E82)
4224 || (ch >= 0x0E87 && ch <= 0x0E88)
4227 || (ch >= 0x0E94 && ch <= 0x0E97)
4228 || (ch >= 0x0E99 && ch <= 0x0E9F)
4229 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4232 || (ch >= 0x0EAA && ch <= 0x0EAB)
4233 || (ch >= 0x0EAD && ch <= 0x0EAE)
4234 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4235 || (ch >= 0x0EBB && ch <= 0x0EBD)
4236 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4238 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4239 || (ch >= 0x0EDC && ch <= 0x0EDD)
4242 || (ch >= 0x0F18 && ch <= 0x0F19)
4246 || (ch >= 0x0F3E && ch <= 0x0F47)
4247 || (ch >= 0x0F49 && ch <= 0x0F69)
4248 || (ch >= 0x0F71 && ch <= 0x0F84)
4249 || (ch >= 0x0F86 && ch <= 0x0F8B)
4250 || (ch >= 0x0F90 && ch <= 0x0F95)
4252 || (ch >= 0x0F99 && ch <= 0x0FAD)
4253 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4256 || (ch >= 0x10A0 && ch <= 0x10C5)
4257 || (ch >= 0x10D0 && ch <= 0x10F6)
4259 || (ch >= 0x3041 && ch <= 0x3093)
4260 || (ch >= 0x309B && ch <= 0x309C)
4262 || (ch >= 0x30A1 && ch <= 0x30F6)
4263 || (ch >= 0x30FB && ch <= 0x30FC)
4265 || (ch >= 0x3105 && ch <= 0x312C)
4266 /* CJK Unified Ideographs */
4267 || (ch >= 0x4E00 && ch <= 0x9FA5)
4269 || (ch >= 0xAC00 && ch <= 0xD7A3)
4271 || (ch >= 0x0660 && ch <= 0x0669)
4272 || (ch >= 0x06F0 && ch <= 0x06F9)
4273 || (ch >= 0x0966 && ch <= 0x096F)
4274 || (ch >= 0x09E6 && ch <= 0x09EF)
4275 || (ch >= 0x0A66 && ch <= 0x0A6F)
4276 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4277 || (ch >= 0x0B66 && ch <= 0x0B6F)
4278 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4279 || (ch >= 0x0C66 && ch <= 0x0C6F)
4280 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4281 || (ch >= 0x0D66 && ch <= 0x0D6F)
4282 || (ch >= 0x0E50 && ch <= 0x0E59)
4283 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4284 || (ch >= 0x0F20 && ch <= 0x0F33)
4285 /* Special characters */
4288 || (ch >= 0x02B0 && ch <= 0x02B8)
4290 || (ch >= 0x02BD && ch <= 0x02C1)
4291 || (ch >= 0x02D0 && ch <= 0x02D1)
4292 || (ch >= 0x02E0 && ch <= 0x02E4)
4298 || (ch >= 0x203F && ch <= 0x2040)
4301 || (ch >= 0x210A && ch <= 0x2113)
4303 || (ch >= 0x2118 && ch <= 0x211D)
4307 || (ch >= 0x212A && ch <= 0x2131)
4308 || (ch >= 0x2133 && ch <= 0x2138)
4309 || (ch >= 0x2160 && ch <= 0x2182)
4310 || (ch >= 0x3005 && ch <= 0x3007)
4311 || (ch >= 0x3021 && ch <= 0x3029)
4313 return UC_IDENTIFIER_START;
4314 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is space, horizontal tab, form-feed, LF or CR.
   Unlike is_c_whitespace, vertical tab is not accepted.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
4326 /* The Java Language Specification, 3rd edition, §3.8.
4327 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4328 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4330 java_ident_category (unsigned int ch)
4332 /* FIXME: Check this against Sun's JDK implementation. */
4333 if (is_category_L (ch) /* = Character.isLetter(ch) */
4334 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4335 || is_category_Sc (ch) /* currency symbol */
4336 || is_category_Pc (ch) /* connector punctuation */
4338 return UC_IDENTIFIER_START;
4339 if (is_category_Nd (ch) /* digit */
4340 || is_category_Mc (ch) /* combining mark */
4341 || is_category_Mn (ch) /* non-spacing mark */
4343 return UC_IDENTIFIER_VALID;
4344 if ((ch >= 0x0000 && ch <= 0x0008)
4345 || (ch >= 0x000E && ch <= 0x001B)
4346 || (ch >= 0x007F && ch <= 0x009F)
4347 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4349 return UC_IDENTIFIER_IGNORABLE;
4350 return UC_IDENTIFIER_INVALID;
4353 /* Construction of sparse 3-level tables. */
4354 #define TABLE identsyntax_table
4355 #define ELEMENT uint8_t
4356 #define DEFAULT UC_IDENTIFIER_INVALID
4357 #define xmalloc malloc
4358 #define xrealloc realloc
4361 /* Output an identifier syntax categorization in a three-level bitmap. */
4363 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4367 struct identsyntax_table t;
4368 unsigned int level1_offset, level2_offset, level3_offset;
4370 stream = fopen (filename, "w");
4373 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4377 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4378 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4379 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4384 identsyntax_table_init (&t);
4386 for (ch = 0; ch < 0x110000; ch++)
4388 int syntaxcode = predicate (ch);
4389 if (syntaxcode != UC_IDENTIFIER_INVALID)
4390 identsyntax_table_add (&t, ch, syntaxcode);
4393 identsyntax_table_finalize (&t);
4395 /* Offsets in t.result, in memory of this process. */
4397 5 * sizeof (uint32_t);
4399 5 * sizeof (uint32_t)
4400 + t.level1_size * sizeof (uint32_t);
4402 5 * sizeof (uint32_t)
4403 + t.level1_size * sizeof (uint32_t)
4404 + (t.level2_size << t.q) * sizeof (uint32_t);
4406 for (i = 0; i < 5; i++)
4407 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4408 ((uint32_t *) t.result)[i]);
4409 fprintf (stream, "static const\n");
4410 fprintf (stream, "struct\n");
4411 fprintf (stream, " {\n");
4412 fprintf (stream, " int level1[%d];\n", t.level1_size);
4413 fprintf (stream, " short level2[%d << %d];\n", t.level2_size, t.q);
4414 fprintf (stream, " unsigned short level3[%d * %d];\n", t.level3_size,
4415 (1 << t.p) * 2 / 16);
4416 fprintf (stream, " }\n");
4417 fprintf (stream, "%s =\n", name);
4418 fprintf (stream, "{\n");
4419 fprintf (stream, " {");
4420 if (t.level1_size > 8)
4421 fprintf (stream, "\n ");
4422 for (i = 0; i < t.level1_size; i++)
4425 if (i > 0 && (i % 8) == 0)
4426 fprintf (stream, "\n ");
4427 offset = ((uint32_t *) (t.result + level1_offset))[i];
4429 fprintf (stream, " %5d", -1);
4431 fprintf (stream, " %5d",
4432 (offset - level2_offset) / sizeof (uint32_t));
4433 if (i+1 < t.level1_size)
4434 fprintf (stream, ",");
4436 if (t.level1_size > 8)
4437 fprintf (stream, "\n ");
4438 fprintf (stream, " },\n");
4439 fprintf (stream, " {");
4440 if (t.level2_size << t.q > 8)
4441 fprintf (stream, "\n ");
4442 for (i = 0; i < t.level2_size << t.q; i++)
4445 if (i > 0 && (i % 8) == 0)
4446 fprintf (stream, "\n ");
4447 offset = ((uint32_t *) (t.result + level2_offset))[i];
4449 fprintf (stream, " %5d", -1);
4451 fprintf (stream, " %5d",
4452 (offset - level3_offset) / sizeof (uint8_t));
4453 if (i+1 < t.level2_size << t.q)
4454 fprintf (stream, ",");
4456 if (t.level2_size << t.q > 8)
4457 fprintf (stream, "\n ");
4458 fprintf (stream, " },\n");
4459 /* Pack the level3 array. Each entry needs 2 bits only. */
4460 fprintf (stream, " {");
4461 if ((t.level3_size << t.p) * 2 / 16 > 8)
4462 fprintf (stream, "\n ");
4463 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4465 if (i > 0 && (i % 8) == 0)
4466 fprintf (stream, "\n ");
4467 fprintf (stream, " 0x%04x",
4468 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4469 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4470 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4476 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4477 fprintf (stream, ",");
4479 if ((t.level3_size << t.p) * 2 / 16 > 8)
4480 fprintf (stream, "\n ");
4481 fprintf (stream, " }\n");
4482 fprintf (stream, "};\n");
4484 if (ferror (stream) || fclose (stream))
4486 fprintf (stderr, "error writing to '%s'\n", filename);
4492 output_ident_properties (const char *version)
4494 #define PROPERTY(P) \
4495 debug_output_predicate ("sy_" #P ".txt", is_ ## P); \
4496 output_predicate_test ("test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4497 output_predicate ("sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4498 PROPERTY(c_whitespace)
4499 PROPERTY(java_whitespace)
4502 output_ident_category ("sy_c_ident.h", c_ident_category, "u_c_ident", version);
4503 output_ident_category ("sy_java_ident.h", java_ident_category, "u_java_ident", version);
4506 /* ========================================================================= */
/* Like ISO C <ctype.h> and <wctype.h>.  Compatible with glibc's
   glibc/localedata/locales/i18n file, generated by
   glibc/localedata/gen-unicode-ctype.c.  */
4512 /* Character mappings. */
4515 to_upper (unsigned int ch)
4517 if (unicode_attributes[ch].name != NULL
4518 && unicode_attributes[ch].upper != NONE)
4519 return unicode_attributes[ch].upper;
4525 to_lower (unsigned int ch)
4527 if (unicode_attributes[ch].name != NULL
4528 && unicode_attributes[ch].lower != NONE)
4529 return unicode_attributes[ch].lower;
4535 to_title (unsigned int ch)
4537 if (unicode_attributes[ch].name != NULL
4538 && unicode_attributes[ch].title != NONE)
4539 return unicode_attributes[ch].title;
4544 /* Character class properties. */
/* A character is "upper" iff it has a lowercase mapping different from
   itself.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* A character is "lower" iff it has an uppercase mapping different from
   itself, or is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4561 is_alpha (unsigned int ch)
4563 return (unicode_attributes[ch].name != NULL
4564 && ((unicode_attributes[ch].category[0] == 'L'
4565 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4566 <U0E2F>, <U0E46> should belong to is_punct. */
4567 && (ch != 0x0E2F) && (ch != 0x0E46))
4568 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4569 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4571 || (ch >= 0x0E34 && ch <= 0x0E3A)
4572 || (ch >= 0x0E47 && ch <= 0x0E4E)
4573 /* Avoid warning for <U0345>. */
4575 /* Avoid warnings for <U2160>..<U217F>. */
4576 || (unicode_attributes[ch].category[0] == 'N'
4577 && unicode_attributes[ch].category[1] == 'l')
4578 /* Avoid warnings for <U24B6>..<U24E9>. */
4579 || (unicode_attributes[ch].category[0] == 'S'
4580 && unicode_attributes[ch].category[1] == 'o'
4581 && strstr (unicode_attributes[ch].name, " LETTER ")
4583 /* Consider all the non-ASCII digits as alphabetic.
4584 ISO C 99 forbids us to have them in category "digit",
4585 but we want iswalnum to return true on them. */
4586 || (unicode_attributes[ch].category[0] == 'N'
4587 && unicode_attributes[ch].category[1] == 'd'
4588 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is one of the ten ASCII decimal digits.

   SUSV2 gives some freedom for the "digit" category, but ISO C 99 takes
   it away: the iswdigit function tests for any wide character that
   corresponds to a decimal-digit character, i.e. one of
   0 1 2 3 4 5 6 7 8 9.

   An alternative definition based on general category Nd was disabled
   in the original; among other things it would require adding <0> by
   hand in front of digit systems without a zero (U+0BE7..U+0BEF,
   U+1369..U+1371).  */
static bool
is_digit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH is one of the ASCII digits U+0030..U+0039, which
   form the "outdigit" class.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* "alnum" is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4626 is_blank (unsigned int ch)
4628 return (ch == 0x0009 /* '\t' */
4629 /* Category Zs without mention of "<noBreak>" */
4630 || (unicode_attributes[ch].name != NULL
4631 && unicode_attributes[ch].category[0] == 'Z'
4632 && unicode_attributes[ch].category[1] == 's'
4633 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4637 is_space (unsigned int ch)
4639 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4640 should treat it like a punctuation character, not like a space. */
4641 return (ch == 0x0020 /* ' ' */
4642 || ch == 0x000C /* '\f' */
4643 || ch == 0x000A /* '\n' */
4644 || ch == 0x000D /* '\r' */
4645 || ch == 0x0009 /* '\t' */
4646 || ch == 0x000B /* '\v' */
4647 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4648 || (unicode_attributes[ch].name != NULL
4649 && unicode_attributes[ch].category[0] == 'Z'
4650 && (unicode_attributes[ch].category[1] == 'l'
4651 || unicode_attributes[ch].category[1] == 'p'
4652 || (unicode_attributes[ch].category[1] == 's'
4653 && !strstr (unicode_attributes[ch].decomposition,
4658 is_cntrl (unsigned int ch)
4660 return (unicode_attributes[ch].name != NULL
4661 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4662 /* Categories Zl and Zp */
4663 || (unicode_attributes[ch].category[0] == 'Z'
4664 && (unicode_attributes[ch].category[1] == 'l'
4665 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is an ASCII hexadecimal digit.

   SUSV2 gives some freedom for the "xdigit" category, but ISO C 99
   takes it away: the iswxdigit function tests for any wide character
   that corresponds to a hexadecimal-digit character, i.e. one of
   0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F.

   An alternative definition built on is_digit was disabled in the
   original; it is equivalent as long as is_digit accepts only ASCII
   digits.  */
static bool
is_xdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
}
4691 is_graph (unsigned int ch)
4693 return (unicode_attributes[ch].name != NULL
4694 && strcmp (unicode_attributes[ch].name, "<control>")
4699 is_print (unsigned int ch)
4701 return (unicode_attributes[ch].name != NULL
4702 && strcmp (unicode_attributes[ch].name, "<control>")
4703 /* Categories Zl and Zp */
4704 && !(unicode_attributes[ch].name != NULL
4705 && unicode_attributes[ch].category[0] == 'Z'
4706 && (unicode_attributes[ch].category[1] == 'l'
4707 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH is a punctuation character.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  An alternative definition based on
   general category P* was disabled in the original.  */
static bool
is_punct (unsigned int ch)
{
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
}
4723 /* Output all properties. */
4725 output_old_ctype (const char *version)
4727 #define PROPERTY(P) \
4728 debug_output_predicate ("ctype_" #P ".txt", is_ ## P); \
4729 output_predicate_test ("test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4730 output_predicate ("ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4749 is_combining (unsigned int ch)
4751 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4752 file. In 3.0.1 it was identical to the union of the general categories
4753 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4754 PropList.txt file, so we take the latter definition. */
4755 return (unicode_attributes[ch].name != NULL
4756 && unicode_attributes[ch].category[0] == 'M'
4757 && (unicode_attributes[ch].category[1] == 'n'
4758 || unicode_attributes[ch].category[1] == 'c'
4759 || unicode_attributes[ch].category[1] == 'e'));
4763 is_combining_level3 (unsigned int ch)
4765 return is_combining (ch)
4766 && !(unicode_attributes[ch].combining[0] != '\0'
4767 && unicode_attributes[ch].combining[0] != '0'
4768 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   Code points below 0x10000 are rendered as "<UXXXX>", all others as
   "<UXXXXXXXX>".  The result points to a static buffer that is
   overwritten by the next call.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded write: the longest form, "<U%08X>", is 11 characters for
     values that fit in 32 bits.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval.
   The result is the symbol of LOW, "..", and the symbol of HIGH, stored
   in a static buffer that is overwritten by the next call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];

  /* ucs_symbol returns a pointer to its own static buffer, so each
     result must be copied out before the next call is made.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
4793 /* Output a character class (= property) table. */
4796 output_charclass (FILE *stream, const char *classname,
4797 bool (*func) (unsigned int))
4799 char table[0x110000];
4801 bool need_semicolon;
4802 const int max_column = 75;
4805 for (i = 0; i < 0x110000; i++)
4806 table[i] = (int) func (i);
4808 fprintf (stream, "%s ", classname);
4809 need_semicolon = false;
4811 for (i = 0; i < 0x110000; )
4817 unsigned int low, high;
4823 while (i < 0x110000 && table[i]);
4827 strcpy (buf, ucs_symbol (low));
4829 strcpy (buf, ucs_symbol_range (low, high));
4833 fprintf (stream, ";");
4837 if (column + strlen (buf) > max_column)
4839 fprintf (stream, "/\n ");
4843 fprintf (stream, "%s", buf);
4844 column += strlen (buf);
4845 need_semicolon = true;
4848 fprintf (stream, "\n");
4851 /* Output a character mapping table. */
4854 output_charmap (FILE *stream, const char *mapname,
4855 unsigned int (*func) (unsigned int))
4857 char table[0x110000];
4859 bool need_semicolon;
4860 const int max_column = 75;
4863 for (i = 0; i < 0x110000; i++)
4864 table[i] = (func (i) != i);
4866 fprintf (stream, "%s ", mapname);
4867 need_semicolon = false;
4869 for (i = 0; i < 0x110000; i++)
4875 strcat (buf, ucs_symbol (i));
4877 strcat (buf, ucs_symbol (func (i)));
4882 fprintf (stream, ";");
4886 if (column + strlen (buf) > max_column)
4888 fprintf (stream, "/\n ");
4892 fprintf (stream, "%s", buf);
4893 column += strlen (buf);
4894 need_semicolon = true;
4896 fprintf (stream, "\n");
/* Output the width table.
   Currently emits nothing to STREAM.  */
static void
output_widthmap (FILE *stream)
{
}
4906 /* Output the tables to the given file. */
4909 output_tables (const char *filename, const char *version)
4914 stream = fopen (filename, "w");
4917 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4921 fprintf (stream, "escape_char /\n");
4922 fprintf (stream, "comment_char %%\n");
4923 fprintf (stream, "\n");
4924 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4926 fprintf (stream, "\n");
4928 fprintf (stream, "LC_IDENTIFICATION\n");
4929 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4930 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4931 fprintf (stream, "address \"\"\n");
4932 fprintf (stream, "contact \"\"\n");
4933 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4934 fprintf (stream, "tel \"\"\n");
4935 fprintf (stream, "fax \"\"\n");
4936 fprintf (stream, "language \"\"\n");
4937 fprintf (stream, "territory \"Earth\"\n");
4938 fprintf (stream, "revision \"%s\"\n", version);
4943 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4944 fprintf (stream, "date \"%s\"\n", date);
4946 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4947 fprintf (stream, "END LC_IDENTIFICATION\n");
4948 fprintf (stream, "\n");
4950 /* Verifications. */
4951 for (ch = 0; ch < 0x110000; ch++)
4953 /* toupper restriction: "Only characters specified for the keywords
4954 lower and upper shall be specified. */
4955 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4957 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4958 ucs_symbol (ch), ch, to_upper (ch));
4960 /* tolower restriction: "Only characters specified for the keywords
4961 lower and upper shall be specified. */
4962 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4964 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4965 ucs_symbol (ch), ch, to_lower (ch));
4967 /* alpha restriction: "Characters classified as either upper or lower
4968 shall automatically belong to this class. */
4969 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4970 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4972 /* alpha restriction: "No character specified for the keywords cntrl,
4973 digit, punct or space shall be specified." */
4974 if (is_alpha (ch) && is_cntrl (ch))
4975 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4976 if (is_alpha (ch) && is_digit (ch))
4977 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4978 if (is_alpha (ch) && is_punct (ch))
4979 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4980 if (is_alpha (ch) && is_space (ch))
4981 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4983 /* space restriction: "No character specified for the keywords upper,
4984 lower, alpha, digit, graph or xdigit shall be specified."
4985 upper, lower, alpha already checked above. */
4986 if (is_space (ch) && is_digit (ch))
4987 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4988 if (is_space (ch) && is_graph (ch))
4989 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4990 if (is_space (ch) && is_xdigit (ch))
4991 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4993 /* cntrl restriction: "No character specified for the keywords upper,
4994 lower, alpha, digit, punct, graph, print or xdigit shall be
4995 specified." upper, lower, alpha already checked above. */
4996 if (is_cntrl (ch) && is_digit (ch))
4997 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
4998 if (is_cntrl (ch) && is_punct (ch))
4999 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5000 if (is_cntrl (ch) && is_graph (ch))
5001 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5002 if (is_cntrl (ch) && is_print (ch))
5003 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_xdigit (ch))
5005 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5007 /* punct restriction: "No character specified for the keywords upper,
5008 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5009 be specified." upper, lower, alpha, cntrl already checked above. */
5010 if (is_punct (ch) && is_digit (ch))
5011 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5012 if (is_punct (ch) && is_xdigit (ch))
5013 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5014 if (is_punct (ch) && (ch == 0x0020))
5015 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5017 /* graph restriction: "No character specified for the keyword cntrl
5018 shall be specified." Already checked above. */
5020 /* print restriction: "No character specified for the keyword cntrl
5021 shall be specified." Already checked above. */
5023 /* graph - print relation: differ only in the <space> character.
5024 How is this possible if there are more than one space character?!
5025 I think susv2/xbd/locale.html should speak of "space characters",
5026 not "space character". */
5027 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5029 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5030 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5032 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5035 fprintf (stream, "LC_CTYPE\n");
5036 output_charclass (stream, "upper", is_upper);
5037 output_charclass (stream, "lower", is_lower);
5038 output_charclass (stream, "alpha", is_alpha);
5039 output_charclass (stream, "digit", is_digit);
5040 output_charclass (stream, "outdigit", is_outdigit);
5041 output_charclass (stream, "blank", is_blank);
5042 output_charclass (stream, "space", is_space);
5043 output_charclass (stream, "cntrl", is_cntrl);
5044 output_charclass (stream, "punct", is_punct);
5045 output_charclass (stream, "xdigit", is_xdigit);
5046 output_charclass (stream, "graph", is_graph);
5047 output_charclass (stream, "print", is_print);
5048 output_charclass (stream, "class \"combining\";", is_combining);
5049 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5050 output_charmap (stream, "toupper", to_upper);
5051 output_charmap (stream, "tolower", to_lower);
5052 output_charmap (stream, "map \"totitle\";", to_title);
5053 output_widthmap (stream);
5054 fprintf (stream, "END LC_CTYPE\n");
5056 if (ferror (stream) || fclose (stream))
5058 fprintf (stderr, "error writing to '%s'\n", filename);
5066 main (int argc, char * argv[])
5068 const char *unicodedata_filename;
5069 const char *proplist_filename;
5070 const char *derivedproplist_filename;
5071 const char *scripts_filename;
5072 const char *blocks_filename;
5073 const char *proplist30_filename;
5074 const char *version;
5078 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt version\n",
5083 unicodedata_filename = argv[1];
5084 proplist_filename = argv[2];
5085 derivedproplist_filename = argv[3];
5086 scripts_filename = argv[4];
5087 blocks_filename = argv[5];
5088 proplist30_filename = argv[6];
5091 fill_attributes (unicodedata_filename);
5092 clear_properties ();
5093 fill_properties (proplist_filename);
5094 fill_properties (derivedproplist_filename);
5095 fill_properties30 (proplist30_filename);
5096 fill_scripts (scripts_filename);
5097 fill_blocks (blocks_filename);
5099 output_categories (version);
5100 output_category ("categ_of.h", version);
5101 output_combclass ("combining.h", version);
5102 output_bidi_category ("bidi_of.h", version);
5103 output_decimal_digit_test ("test-decdigit.h", version);
5104 output_decimal_digit ("decdigit.h", version);
5105 output_digit_test ("test-digit.h", version);
5106 output_digit ("digit.h", version);
5107 output_numeric_test ("test-numeric.h", version);
5108 output_numeric ("numeric.h", version);
5109 output_mirror ("mirror.h", version);
5110 output_properties (version);
5111 output_scripts (version);
5112 output_scripts_byname (version);
5113 output_blocks (version);
5114 output_ident_properties (version);
5115 output_old_ctype (version);
5121 * For Emacs M-x compile
5123 * compile-command: "
5124 gcc -O -Wall gen-ctype.c -o gen-ctype && \
5126 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \
5127 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/PropList.txt \
5128 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/DerivedCoreProperties.txt \
5129 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Scripts.txt \
5130 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Blocks.txt \
5131 /gfs/ibook/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \