X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=23462386e6b7306a93a23adfd17517636a3029c0;hb=04606dd55fd8f1a122a21c4b4e09ab2a36498385;hp=8cc21ee276947b8f170139e2fdf6aeb202ad68ed;hpb=a749f6be4198e009a48e59e35509d1c8c6deac8d;p=gnulib.git diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 8cc21ee27..23462386e 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -1,5 +1,6 @@ /* Generate Unicode conforming character classification tables and - Line Break Properties tables from a UnicodeData file. + line break properties tables and word break property tables and + decomposition/composition and case mapping tables from a UnicodeData file. Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. Written by Bruno Haible , 2000-2002. @@ -25,6 +26,8 @@ /usr/local/share/Unidata/PropList-3.0.1.txt \ /usr/local/share/Unidata/EastAsianWidth.txt \ /usr/local/share/Unidata/LineBreak.txt \ + /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/CompositionExclusions.txt \ 5.1.0 */ @@ -736,7 +739,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd", + fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu", 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -756,7 +759,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd", + fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1054,7 +1057,7 @@ output_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1074,7 +1077,7 @@ output_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1205,7 +1208,7 @@ output_combclass (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1225,7 +1228,7 @@ output_combclass (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1544,7 +1547,7 @@ output_bidi_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1564,7 +1567,7 @@ output_bidi_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1751,7 +1754,7 @@ output_decimal_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1771,7 +1774,7 @@ output_decimal_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1938,7 +1941,7 @@ output_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1958,7 +1961,7 @@ output_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -2194,7 +2197,7 @@ output_numeric (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -2214,7 +2217,7 @@ output_numeric (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -2432,7 +2435,7 @@ output_mirror (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -2452,7 +2455,7 @@ output_mirror (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (int32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -3752,7 +3755,7 @@ output_scripts (const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -3772,7 +3775,7 @@ output_scripts (const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -4428,7 +4431,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -4448,7 +4451,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -5298,16 +5301,8 @@ get_lbp (unsigned int ch) || ch == 0x0FBE /* TIBETAN KU RU KHA */ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ -#if !REVISION_22 - || ch == 0x1802 /* MONGOLIAN COMMA */ - || ch == 0x1803 /* MONGOLIAN FULL STOP */ -#endif || ch == 0x1804 /* MONGOLIAN COLON */ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ -#if !REVISION_22 - || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ - || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ -#endif || ch == 0x1B5A /* BALINESE PANTI */ || ch == 0x1B5B /* BALINESE PAMADA */ || ch == 0x1B5C /* BALINESE WINDU */ @@ -5320,15 +5315,9 @@ get_lbp (unsigned int ch) || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */ -#if !REVISION_22 - || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ -#endif || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ -#if !REVISION_22 - || ch == 0x2CFE /* COPTIC FULL STOP */ -#endif || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ @@ -5343,9 +5332,6 @@ get_lbp (unsigned int ch) || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ -#if !REVISION_22 - || ch == 0x1A1E /* BUGINESE PALLAWA */ -#endif || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) @@ -5353,10 +5339,8 @@ get_lbp (unsigned int ch) /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ -#if REVISION_22 || ch == 0x1FFD /* GREEK OXIA */ || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ -#endif || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ @@ -5402,15 +5386,9 @@ get_lbp (unsigned int ch) if (ch == 0x0021 /* EXCLAMATION MARK */ || ch == 0x003F /* QUESTION MARK */ || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ -#if !REVISION_22 - || ch == 0x060C /* ARABIC COMMA */ -#endif || ch == 0x061B /* ARABIC SEMICOLON */ || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ || ch == 0x061F /* ARABIC QUESTION MARK */ -#if !REVISION_22 - || ch == 0x066A /* ARABIC PERCENT SIGN */ -#endif || ch == 0x06D4 /* ARABIC FULL STOP */ || ch == 0x07F9 /* NKO EXCLAMATION MARK */ || ch == 0x0F0D /* TIBETAN MARK SHAD */ @@ -5419,22 +5397,20 @@ get_lbp (unsigned int ch) || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ -#if REVISION_22 || ch == 0x1802 /* MONGOLIAN COMMA */ || ch == 0x1803 /* MONGOLIAN FULL STOP */ || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ -#endif || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ || ch == 0x1945 /* LIMBU QUESTION MARK */ || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */ || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ -#if REVISION_22 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ || ch == 0x2CFE /* COPTIC FULL STOP */ -#endif || ch == 0x2E2E /* REVERSED QUESTION MARK */ +#if REVISION_22 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ +#endif || ch == 0xA60E /* VAI FULL STOP */ || ch == 0xA876 /* PHAGS-PA MARK SHAD */ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ @@ -5489,10 +5465,8 @@ get_lbp (unsigned int ch) /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's') -#if REVISION_22 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00BF /* INVERTED QUESTION MARK */ -#endif || ch == 0x2E18 /* INVERTED INTERROBANG */) attr |= 1 << LBP_OP; @@ -5521,9 +5495,7 @@ get_lbp (unsigned int ch) || ch == 0x003B /* SEMICOLON */ || ch == 0x037E /* GREEK QUESTION MARK */ || ch == 0x0589 /* ARMENIAN FULL STOP */ -#if REVISION_22 || ch == 0x060C /* ARABIC COMMA */ -#endif || ch == 0x060D /* ARABIC DATE SEPARATOR */ || ch == 0x07F8 /* NKO COMMA */ || ch == 0x2044 /* FRACTION SLASH */ @@ -5545,9 +5517,7 @@ get_lbp (unsigned int ch) || ch == 0x00A2 /* CENT SIGN */ || ch == 0x00B0 /* DEGREE SIGN */ || ch == 0x060B /* AFGHANI SIGN */ -#if REVISION_22 || ch == 0x066A /* ARABIC PERCENT SIGN */ -#endif || ch == 0x2030 /* PER MILLE SIGN */ || ch == 0x2031 /* PER TEN THOUSAND SIGN */ || ch == 0x2032 /* PRIME */ @@ -5562,7 +5532,11 @@ get_lbp (unsigned int ch) || ch == 0xFDFC /* RIAL SIGN */ || ch == 0xFE6A /* SMALL PERCENT SIGN */ || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ - || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) + || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ + || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */) attr |= 1 << LBP_PO; /* prefix (numeric) */ @@ -5606,6 +5580,8 @@ get_lbp (unsigned int ch) && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'n')) /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ + || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) && ((ch >= 0x0E00 && ch <= 0x0EFF) @@ -5631,7 +5607,7 @@ get_lbp (unsigned int ch) || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */ + || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ @@ -6135,9 +6111,13 @@ output_lbp (FILE *stream1, FILE *stream2) if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; - fprintf (stream2, " %5zd%s", - offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t), - (i+1 < t.level1_size ? "," : "")); + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); @@ -6151,9 +6131,13 @@ output_lbp (FILE *stream1, FILE *stream2) if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; - fprintf (stream2, " %5zd%s", - offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t), - (i+1 < t.level2_size << t.q ? "," : "")); + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); @@ -6279,67 +6263,1278 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve /* ========================================================================= */ -int -main (int argc, char * argv[]) +/* Word break property. */ + +/* Possible values of the Word_Break property. */ +enum { - const char *unicodedata_filename; - const char *proplist_filename; - const char *derivedproplist_filename; - const char *scripts_filename; - const char *blocks_filename; - const char *proplist30_filename; - const char *eastasianwidth_filename; - const char *linebreak_filename; - const char *version; + WBP_OTHER = 0, + WBP_CR = 11, + WBP_LF = 12, + WBP_NEWLINE = 10, + WBP_EXTEND = 8, + WBP_FORMAT = 9, + WBP_KATAKANA = 1, + WBP_ALETTER = 2, + WBP_MIDNUMLET = 3, + WBP_MIDLETTER = 4, + WBP_MIDNUM = 5, + WBP_NUMERIC = 6, + WBP_EXTENDNUMLET = 7 +}; + +/* Returns the word breaking property for ch, as a bit mask. */ +static int +get_wbp (unsigned int ch) +{ + int attr = 0; - if (argc != 10) + if (unicode_attributes[ch].name != NULL) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt version\n", - argv[0]); + if (ch == 0x000D) + attr |= 1 << WBP_CR; + + if (ch == 0x000A) + attr |= 1 << WBP_LF; + + if (ch == 0x000B || ch == 0x000C + || ch == 0x0085 + || ch == 0x2028 || ch == 0x2029) + attr |= 1 << WBP_NEWLINE; + + if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0 + || (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Mc") == 0)) + attr |= 1 << WBP_EXTEND; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Cf") == 0 + && ch != 0x200C && ch != 0x200D) + attr |= 1 << WBP_FORMAT; + + if ((unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) + || (ch >= 0x3031 && ch <= 0x3035) + || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC + || ch == 0xFF70) + attr |= 1 << WBP_KATAKANA; + + if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 + || ch == 0x05F3) + && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 + && (attr & (1 << WBP_KATAKANA)) == 0 + && ((get_lbp (ch) >> LBP_SA) & 1) == 0 + && !(unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) + && (attr & (1 << WBP_EXTEND)) == 0) + attr |= 1 << WBP_ALETTER; + + if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E) + attr |= 1 << WBP_MIDNUMLET; + + if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) + attr |= 1 << WBP_MIDLETTER; + + if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 + || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C + || ch == 0xFF1B) + && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) + attr |= 1 << WBP_MIDNUM; + + if (((get_lbp (ch) >> LBP_NU) & 1) != 0 + && ch != 0x066C) + attr |= 1 << WBP_NUMERIC; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Pc") == 0) + attr |= 1 << WBP_EXTENDNUMLET; + } + + if (attr == 0) + /* other */ + attr |= 1 << WBP_OTHER; + + return attr; +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + if (attr != 1 << WBP_OTHER) + { + fprintf (stream, "0x%04X", i); + if (attr & (1 << WBP_CR)) + fprintf (stream, " CR"); + if (attr & (1 << WBP_LF)) + fprintf (stream, " LF"); + if (attr & (1 << WBP_NEWLINE)) + fprintf (stream, " Newline"); + if (attr & (1 << WBP_EXTEND)) + fprintf (stream, " Extend"); + if (attr & (1 << WBP_FORMAT)) + fprintf (stream, " Format"); + if (attr & (1 << WBP_KATAKANA)) + fprintf (stream, " Katakana"); + if (attr & (1 << WBP_ALETTER)) + fprintf (stream, " ALetter"); + if (attr & (1 << WBP_MIDNUMLET)) + fprintf (stream, " MidNumLet"); + if (attr & (1 << WBP_MIDLETTER)) + fprintf (stream, " MidLetter"); + if (attr & (1 << WBP_MIDNUM)) + fprintf (stream, " MidNum"); + if (attr & (1 << WBP_NUMERIC)) + fprintf (stream, " Numeric"); + if (attr & (1 << WBP_EXTENDNUMLET)) + fprintf (stream, " ExtendNumLet"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } - unicodedata_filename = argv[1]; - proplist_filename = argv[2]; - derivedproplist_filename = argv[3]; - scripts_filename = argv[4]; - blocks_filename = argv[5]; - proplist30_filename = argv[6]; - eastasianwidth_filename = argv[7]; - linebreak_filename = argv[8]; - version = argv[9]; + debug_output_wbp (stream); - fill_attributes (unicodedata_filename); - clear_properties (); - fill_properties (proplist_filename); - fill_properties (derivedproplist_filename); - fill_properties30 (proplist30_filename); - fill_scripts (scripts_filename); - fill_blocks (blocks_filename); - fill_width (eastasianwidth_filename); - fill_org_lbp (linebreak_filename); + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} - output_categories (version); - output_category ("unictype/categ_of.h", version); - output_combclass ("unictype/combining.h", version); - output_bidi_category ("unictype/bidi_of.h", version); - output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); - output_decimal_digit ("unictype/decdigit.h", version); - output_digit_test ("../tests/unictype/test-digit.h", version); - output_digit ("unictype/digit.h", version); - output_numeric_test ("../tests/unictype/test-numeric.h", version); - output_numeric ("unictype/numeric.h", version); - output_mirror ("unictype/mirror.h", version); - output_properties (version); - output_scripts (version); - output_scripts_byname (version); - output_blocks (version); - output_ident_properties (version); - output_old_ctype (version); +/* The word break property from the WordBreakProperty.txt file. */ +int unicode_org_wbp[0x110000]; - debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); - debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); - output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); +/* Stores in unicode_org_wbp[] the word break property from the + WordBreakProperty.txt file. */ +static void +fill_org_wbp (const char *wordbreakproperty_filename) +{ + unsigned int i; + FILE *stream; + + for (i = 0; i < 0x110000; i++) + unicode_org_wbp[i] = WBP_OTHER; + + stream = fopen (wordbreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + wordbreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, + wordbreakproperty_filename); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_wbp[i] = propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename); + exit (1); + } +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_org_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int propvalue = unicode_org_wbp[i]; + if (propvalue != WBP_OTHER) + { + fprintf (stream, "0x%04X", i); +#define PROP(name,value) \ + if (propvalue == value) fprintf (stream, " " name); else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + fprintf (stream, " ??"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_org_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_org_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE wbp_table +#define ELEMENT unsigned char +#define DEFAULT WBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_wbp (FILE *stream) +{ + unsigned int i; + struct wbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + t.p = 7; + t.q = 9; + wbp_table_init (&t); + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + + /* Now attr should contain exactly one bit. */ + if (attr == 0 || ((attr & (attr - 1)) != 0)) + abort (); + + if (attr != 1 << WBP_OTHER) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + + wbp_table_add (&t, i, log2_attr); + } + } + + wbp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define wbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "\n"); + fprintf (stream, "typedef struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "wbrkprop_t;\n"); + fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; + const char *value_string; + switch (value) + { +#define CASE(x) case x: value_string = #x; break; + CASE(WBP_OTHER); + CASE(WBP_CR); + CASE(WBP_LF); + CASE(WBP_NEWLINE); + CASE(WBP_EXTEND); + CASE(WBP_FORMAT); + CASE(WBP_KATAKANA); + CASE(WBP_ALETTER); + CASE(WBP_MIDNUMLET); + CASE(WBP_MIDLETTER); + CASE(WBP_MIDNUM); + CASE(WBP_NUMERIC); + CASE(WBP_EXTENDNUMLET); +#undef CASE + default: + abort (); + } + if (i > 0 && (i % 4) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %s%s", value_string, + (i+1 < t.level3_size << t.p ? "," : "")); + } + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); +} + +static void +output_wbrk_tables (const char *filename, const char *version) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + output_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Maximum number of characters into which a single Unicode character can be + decomposed. */ +#define MAX_DECOMP_LENGTH 18 + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* An encircled form. */ + UC_DECOMP_SUPER, /* A superscript form. */ + UC_DECOMP_SUB, /* A subscript form. */ + UC_DECOMP_VERTICAL,/* A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* A vulgar fraction form. */ + UC_DECOMP_COMPAT /* Otherwise unspecified compatibility character. */ +}; + +/* Return the decomposition for a Unicode character (ignoring Hangul Jamo + decompositions). Return the type, or -1 for none. */ +static int +get_decomposition (unsigned int ch, + unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) +{ + const char *decomposition = unicode_attributes[ch].decomposition; + + if (decomposition != NULL && decomposition[0] != '\0') + { + int type = UC_DECOMP_CANONICAL; + unsigned int length; + char *endptr; + + if (decomposition[0] == '<') + { + const char *rangle; + size_t typelen; + + rangle = strchr (decomposition + 1, '>'); + if (rangle == NULL) + abort (); + typelen = rangle + 1 - decomposition; +#define TYPE(t1,t2) \ + if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ + type = t2; \ + else + TYPE ("", UC_DECOMP_FONT) + TYPE ("", UC_DECOMP_NOBREAK) + TYPE ("", UC_DECOMP_INITIAL) + TYPE ("", UC_DECOMP_MEDIAL) + TYPE ("", UC_DECOMP_FINAL) + TYPE ("", UC_DECOMP_ISOLATED) + TYPE ("", UC_DECOMP_CIRCLE) + TYPE ("", UC_DECOMP_SUPER) + TYPE ("", UC_DECOMP_SUB) + TYPE ("", UC_DECOMP_VERTICAL) + TYPE ("", UC_DECOMP_WIDE) + TYPE ("", UC_DECOMP_NARROW) + TYPE ("", UC_DECOMP_SMALL) + TYPE ("", UC_DECOMP_SQUARE) + TYPE ("", UC_DECOMP_FRACTION) + TYPE ("", UC_DECOMP_COMPAT) + { + fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); + exit (1); + } +#undef TYPE + decomposition = rangle + 1; + if (decomposition[0] == ' ') + decomposition++; + } + for (length = 0; length < MAX_DECOMP_LENGTH; length++) + { + decomposed[length] = strtoul (decomposition, &endptr, 16); + if (endptr == decomposition) + break; + decomposition = endptr; + if (decomposition[0] == ' ') + decomposition++; + } + if (*decomposition != '\0') + /* MAX_DECOMP_LENGTH is too small. */ + abort (); + + *lengthp = length; + return type; + } + else + return -1; +} + +/* Construction of sparse 3-level tables. */ +#define TABLE decomp_table +#define ELEMENT uint16_t +#define DEFAULT (uint16_t)(-1) +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_decomposition (FILE *stream1, FILE *stream2) +{ + struct decomp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + unsigned int offset; + unsigned int ch; + unsigned int i; + + t.p = 5; + t.q = 5; + decomp_table_init (&t); + + fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n"); + fprintf (stream1, "\n"); + fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{"); + offset = 0; + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type >= 0) + { + if (!(offset < (1 << 15))) + abort (); + decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); + + /* Produce length 3-bytes entries. */ + if (length == 0) + /* We would need a special representation of zero-length entries. */ + abort (); + for (i = 0; i < length; i++) + { + if (offset > 0) + fprintf (stream2, ","); + if ((offset % 4) == 0) + fprintf (stream2, "\n "); + if (!(decomposed[i] < (1 << 18))) + abort (); + fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", + (((i+1 < length ? (1 << 23) : 0) + | (i == 0 ? (type << 18) : 0) + | decomposed[i]) >> 16) & 0xff, + (decomposed[i] >> 8) & 0xff, + decomposed[i] & 0xff); + offset++; + } + } + } + + fprintf (stream2, "\n};\n"); + fprintf (stream2, "\n"); + + decomp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream1, "#define decomp_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream1, "\n"); + fprintf (stream1, "typedef struct\n"); + fprintf (stream1, " {\n"); + fprintf (stream1, " int level1[%zu];\n", t.level1_size); + fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream1, " }\n"); + fprintf (stream1, "decomp_index_table_t;\n"); + fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n"); + fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n"); + fprintf (stream2, "{\n"); + fprintf (stream2, " {"); + if (t.level1_size > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); + } + if (t.level1_size > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (uint16_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + uint16_t value = ((uint16_t *) (t.result + level3_offset))[i]; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value); + if (i+1 < t.level3_size << t.p) + fprintf (stream2, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " }\n"); + fprintf (stream2, "};\n"); +} + +static void +output_decomposition_tables (const char *filename1, const char *filename2, const char *version) +{ + const char *filenames[2]; + FILE *streams[2]; + size_t i; + + filenames[0] = filename1; + filenames[1] = filename2; + + for (i = 0; i < 2; i++) + { + streams[i] = fopen (filenames[i], "w"); + if (streams[i] == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } + } + + for (i = 0; i < 2; i++) + { + FILE *stream = streams[i]; + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Decomposition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + } + + output_decomposition (streams[0], streams[1]); + + for (i = 0; i < 2; i++) + { + if (ferror (streams[i]) || fclose (streams[i])) + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } + } +} + +/* The "excluded from composition" property from the CompositionExclusions.txt file. */ +char unicode_composition_exclusions[0x110000]; + +static void +fill_composition_exclusions (const char *compositionexclusions_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (compositionexclusions_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename); + exit (1); + } + + for (i = 0; i < 0x110000; i++) + unicode_composition_exclusions[i] = 0; + + for (;;) + { + char buf[200+1]; + unsigned int i; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X", &i) != 1) + { + fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); + exit (1); + } + if (!(i < 0x110000)) + abort (); + + unicode_composition_exclusions[i] = 1; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename); + exit (1); + } +} + +static void +debug_output_composition_tables (const char *filename) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", + code1, + code2, + combined, + unicode_attributes[code2].combining); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_composition_tables (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + /* The composition table is a set of mappings (code1, code2) -> combined, + with 928 entries, + 367 values for code1 (from 0x003C to 0x30FD), + 54 values for code2 (from 0x0300 to 0x309A). + For a fixed code1, there are from 1 to 19 possible values for code2. + For a fixed code2, there are from 1 to 117 possible values for code1. + This is a very sparse matrix. + + We want an O(1) hash lookup. + + We could implement the hash lookup by mapping (code1, code2) to a linear + combination mul1*code1 + mul2*code2, which is then used as an index into + a 3-level table. But this leads to a table of size 37 KB. + + We use gperf to implement the hash lookup, giving it the 928 sets of + 4 bytes (code1, code2) as input. gperf generates a hash table of size + 1527, which is quite good (60% filled). It requires an auxiliary table + lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ + + fprintf (stream, "struct composition_rule { char codes[4]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name codes\n"); + fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + if (!(code1 < 0x10000)) + abort (); + if (!(code2 < 0x10000)) + abort (); + if (!(combined < 0x10000)) + abort (); + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 8) & 0xff, code2 & 0xff, + combined); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Output the test for a simple character mapping table to the given file. */ + +static void +output_simple_mapping_test (const char *filename, + const char *function_name, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode character mapping functions.\n"); + fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + fprintf (stream, "#include \"test-mapping-part1.h\"\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int value = func (ch); + + if (value != ch) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + fprintf (stream, "\n"); + fprintf (stream, "#define MAP(c) %s (c)\n", function_name); + fprintf (stream, "#include \"test-mapping-part2.h\"\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE mapping_table +#define ELEMENT int32_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output a simple character mapping table to the given file. */ + +static void +output_simple_mapping (const char *filename, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct mapping_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + mapping_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = (int) func (ch) - (int) ch; + + mapping_table_add (&t, ch, value); + } + + mapping_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define mapping_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_mapping =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +int +main (int argc, char * argv[]) +{ + const char *unicodedata_filename; + const char *proplist_filename; + const char *derivedproplist_filename; + const char *scripts_filename; + const char *blocks_filename; + const char *proplist30_filename; + const char *eastasianwidth_filename; + const char *linebreak_filename; + const char *wordbreakproperty_filename; + const char *compositionexclusions_filename; + const char *version; + + if (argc != 12) + { + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt version\n", + argv[0]); + exit (1); + } + + unicodedata_filename = argv[1]; + proplist_filename = argv[2]; + derivedproplist_filename = argv[3]; + scripts_filename = argv[4]; + blocks_filename = argv[5]; + proplist30_filename = argv[6]; + eastasianwidth_filename = argv[7]; + linebreak_filename = argv[8]; + wordbreakproperty_filename = argv[9]; + compositionexclusions_filename = argv[10]; + version = argv[11]; + + fill_attributes (unicodedata_filename); + clear_properties (); + fill_properties (proplist_filename); + fill_properties (derivedproplist_filename); + fill_properties30 (proplist30_filename); + fill_scripts (scripts_filename); + fill_blocks (blocks_filename); + fill_width (eastasianwidth_filename); + fill_org_lbp (linebreak_filename); + fill_org_wbp (wordbreakproperty_filename); + fill_composition_exclusions (compositionexclusions_filename); + + output_categories (version); + output_category ("unictype/categ_of.h", version); + output_combclass ("unictype/combining.h", version); + output_bidi_category ("unictype/bidi_of.h", version); + output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); + output_decimal_digit ("unictype/decdigit.h", version); + output_digit_test ("../tests/unictype/test-digit.h", version); + output_digit ("unictype/digit.h", version); + output_numeric_test ("../tests/unictype/test-numeric.h", version); + output_numeric ("unictype/numeric.h", version); + output_mirror ("unictype/mirror.h", version); + output_properties (version); + output_scripts (version); + output_scripts_byname (version); + output_blocks (version); + output_ident_properties (version); + output_old_ctype (version); + + debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); + debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); + output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); + + debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt"); + debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); + output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); + debug_output_composition_tables ("uninorm/composition.txt"); + output_composition_tables ("uninorm/composition-table.gperf", version); + + output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version); + output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version); + output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version); + output_simple_mapping ("unicase/toupper.h", to_upper, version); + output_simple_mapping ("unicase/tolower.h", to_lower, version); + output_simple_mapping ("unicase/totitle.h", to_title, version); return 0; } @@ -6358,6 +7553,8 @@ main (int argc, char * argv[]) /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ 5.1.0 " * End: