X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=a50751758ca0942c2591304c579fe6ebc1473b24;hb=6cd8877397a867ec952ddd4a487d52ceaf3212f5;hp=b1149fd19095e3aaa1b35b27dba117e55289c2ae;hpb=5ccf18f30a760b348cd07ebac61946e694d11462;p=gnulib.git diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index b1149fd19..a50751758 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -1,5 +1,6 @@ /* Generate Unicode conforming character classification tables and - Line Break Properties tables from a UnicodeData file. + line break properties tables and word break property tables and + decomposition/composition and case mapping tables from a UnicodeData file. Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. Written by Bruno Haible , 2000-2002. @@ -25,7 +26,11 @@ /usr/local/share/Unidata/PropList-3.0.1.txt \ /usr/local/share/Unidata/EastAsianWidth.txt \ /usr/local/share/Unidata/LineBreak.txt \ - 5.0.0 + /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/CompositionExclusions.txt \ + /usr/local/share/Unidata/SpecialCasing.txt \ + /usr/local/share/Unidata/CaseFolding.txt \ + 5.1.0 */ #include @@ -736,7 +741,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd", + fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu", 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -756,7 +761,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd", + fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1054,7 +1059,7 @@ output_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1074,7 +1079,7 @@ output_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1205,7 +1210,7 @@ output_combclass (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1225,7 +1230,7 @@ output_combclass (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1544,7 +1549,7 @@ output_bidi_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1564,7 +1569,7 @@ output_bidi_category (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1751,7 +1756,7 @@ output_decimal_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1771,7 +1776,7 @@ output_decimal_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -1938,7 +1943,7 @@ output_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -1958,7 +1963,7 @@ output_digit (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -2194,7 +2199,7 @@ output_numeric (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -2214,7 +2219,7 @@ output_numeric (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -2432,7 +2437,7 @@ output_mirror (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -2452,7 +2457,7 @@ output_mirror (const char *filename, const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (int32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -2766,6 +2771,7 @@ is_property_alphabetic (unsigned int ch) Alphabetic but not as having property Other_Alphabetic. */ || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */ || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */ + || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */ || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */ || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ @@ -2804,12 +2810,10 @@ is_property_default_ignorable_code_point (unsigned int ch) { bool result1 = (is_category_Cf (ch) - && !(ch >= 0xFFF9 && ch <= 0xFFFB)) /* Annotations */ - || ((is_category_Cc (ch) || is_category_Cs (ch)) - && !is_property_white_space (ch)) + && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ + && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) - || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0) - || is_property_not_a_character (ch); + || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); bool result2 = ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0); @@ -3753,7 +3757,7 @@ output_scripts (const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -3773,7 +3777,7 @@ output_scripts (const char *version) if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -4429,7 +4433,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); @@ -4449,7 +4453,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co if (offset == 0) fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zd", + fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); @@ -5299,16 +5303,8 @@ get_lbp (unsigned int ch) || ch == 0x0FBE /* TIBETAN KU RU KHA */ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ -#if !REVISION_22 - || ch == 0x1802 /* MONGOLIAN COMMA */ - || ch == 0x1803 /* MONGOLIAN FULL STOP */ -#endif || ch == 0x1804 /* MONGOLIAN COLON */ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ -#if !REVISION_22 - || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ - || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ -#endif || ch == 0x1B5A /* BALINESE PANTI */ || ch == 0x1B5B /* BALINESE PAMADA */ || ch == 0x1B5C /* BALINESE WINDU */ @@ -5321,15 +5317,9 @@ get_lbp (unsigned int ch) || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */ -#if !REVISION_22 - || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ -#endif || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ -#if !REVISION_22 - || ch == 0x2CFE /* COPTIC FULL STOP */ -#endif || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ @@ -5344,9 +5334,6 @@ get_lbp (unsigned int ch) || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ -#if !REVISION_22 - || ch == 0x1A1E /* BUGINESE PALLAWA */ -#endif || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) @@ -5354,10 +5341,8 @@ get_lbp (unsigned int ch) /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ -#if REVISION_22 || ch == 0x1FFD /* GREEK OXIA */ || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ -#endif || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ @@ -5403,15 +5388,9 @@ get_lbp (unsigned int ch) if (ch == 0x0021 /* EXCLAMATION MARK */ || ch == 0x003F /* QUESTION MARK */ || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ -#if !REVISION_22 - || ch == 0x060C /* ARABIC COMMA */ -#endif || ch == 0x061B /* ARABIC SEMICOLON */ || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ || ch == 0x061F /* ARABIC QUESTION MARK */ -#if !REVISION_22 - || ch == 0x066A /* ARABIC PERCENT SIGN */ -#endif || ch == 0x06D4 /* ARABIC FULL STOP */ || ch == 0x07F9 /* NKO EXCLAMATION MARK */ || ch == 0x0F0D /* TIBETAN MARK SHAD */ @@ -5420,22 +5399,20 @@ get_lbp (unsigned int ch) || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ -#if REVISION_22 || ch == 0x1802 /* MONGOLIAN COMMA */ || ch == 0x1803 /* MONGOLIAN FULL STOP */ || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ -#endif || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ || ch == 0x1945 /* LIMBU QUESTION MARK */ || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */ || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ -#if REVISION_22 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ || ch == 0x2CFE /* COPTIC FULL STOP */ -#endif || ch == 0x2E2E /* REVERSED QUESTION MARK */ +#if REVISION_22 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ +#endif || ch == 0xA60E /* VAI FULL STOP */ || ch == 0xA876 /* PHAGS-PA MARK SHAD */ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ @@ -5490,10 +5467,8 @@ get_lbp (unsigned int ch) /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's') -#if REVISION_22 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00BF /* INVERTED QUESTION MARK */ -#endif || ch == 0x2E18 /* INVERTED INTERROBANG */) attr |= 1 << LBP_OP; @@ -5522,9 +5497,7 @@ get_lbp (unsigned int ch) || ch == 0x003B /* SEMICOLON */ || ch == 0x037E /* GREEK QUESTION MARK */ || ch == 0x0589 /* ARMENIAN FULL STOP */ -#if REVISION_22 || ch == 0x060C /* ARABIC COMMA */ -#endif || ch == 0x060D /* ARABIC DATE SEPARATOR */ || ch == 0x07F8 /* NKO COMMA */ || ch == 0x2044 /* FRACTION SLASH */ @@ -5546,9 +5519,7 @@ get_lbp (unsigned int ch) || ch == 0x00A2 /* CENT SIGN */ || ch == 0x00B0 /* DEGREE SIGN */ || ch == 0x060B /* AFGHANI SIGN */ -#if REVISION_22 || ch == 0x066A /* ARABIC PERCENT SIGN */ -#endif || ch == 0x2030 /* PER MILLE SIGN */ || ch == 0x2031 /* PER TEN THOUSAND SIGN */ || ch == 0x2032 /* PRIME */ @@ -5563,7 +5534,11 @@ get_lbp (unsigned int ch) || ch == 0xFDFC /* RIAL SIGN */ || ch == 0xFE6A /* SMALL PERCENT SIGN */ || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ - || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) + || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ + || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */) attr |= 1 << LBP_PO; /* prefix (numeric) */ @@ -5607,6 +5582,8 @@ get_lbp (unsigned int ch) && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'n')) /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ + || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) && ((ch >= 0x0E00 && ch <= 0x0EFF) @@ -5632,7 +5609,7 @@ get_lbp (unsigned int ch) || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */ + || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ @@ -6136,9 +6113,13 @@ output_lbp (FILE *stream1, FILE *stream2) if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; - fprintf (stream2, " %5zd%s", - offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t), - (i+1 < t.level1_size ? "," : "")); + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); @@ -6152,9 +6133,13 @@ output_lbp (FILE *stream1, FILE *stream2) if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; - fprintf (stream2, " %5zd%s", - offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t), - (i+1 < t.level2_size << t.q ? "," : "")); + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); @@ -6280,67 +6265,2043 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve /* ========================================================================= */ -int -main (int argc, char * argv[]) +/* Word break property. */ + +/* Possible values of the Word_Break property. */ +enum { - const char *unicodedata_filename; - const char *proplist_filename; - const char *derivedproplist_filename; - const char *scripts_filename; - const char *blocks_filename; - const char *proplist30_filename; - const char *eastasianwidth_filename; - const char *linebreak_filename; - const char *version; + WBP_OTHER = 0, + WBP_CR = 11, + WBP_LF = 12, + WBP_NEWLINE = 10, + WBP_EXTEND = 8, + WBP_FORMAT = 9, + WBP_KATAKANA = 1, + WBP_ALETTER = 2, + WBP_MIDNUMLET = 3, + WBP_MIDLETTER = 4, + WBP_MIDNUM = 5, + WBP_NUMERIC = 6, + WBP_EXTENDNUMLET = 7 +}; + +/* Returns the word breaking property for ch, as a bit mask. */ +static int +get_wbp (unsigned int ch) +{ + int attr = 0; - if (argc != 10) + if (unicode_attributes[ch].name != NULL) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt version\n", - argv[0]); + if (ch == 0x000D) + attr |= 1 << WBP_CR; + + if (ch == 0x000A) + attr |= 1 << WBP_LF; + + if (ch == 0x000B || ch == 0x000C + || ch == 0x0085 + || ch == 0x2028 || ch == 0x2029) + attr |= 1 << WBP_NEWLINE; + + if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0 + || (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Mc") == 0)) + attr |= 1 << WBP_EXTEND; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Cf") == 0 + && ch != 0x200C && ch != 0x200D) + attr |= 1 << WBP_FORMAT; + + if ((unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) + || (ch >= 0x3031 && ch <= 0x3035) + || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC + || ch == 0xFF70) + attr |= 1 << WBP_KATAKANA; + + if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 + || ch == 0x05F3) + && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 + && (attr & (1 << WBP_KATAKANA)) == 0 + && ((get_lbp (ch) >> LBP_SA) & 1) == 0 + && !(unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) + && (attr & (1 << WBP_EXTEND)) == 0) + attr |= 1 << WBP_ALETTER; + + if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E) + attr |= 1 << WBP_MIDNUMLET; + + if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) + attr |= 1 << WBP_MIDLETTER; + + if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 + || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C + || ch == 0xFF1B) + && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) + attr |= 1 << WBP_MIDNUM; + + if (((get_lbp (ch) >> LBP_NU) & 1) != 0 + && ch != 0x066C) + attr |= 1 << WBP_NUMERIC; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Pc") == 0) + attr |= 1 << WBP_EXTENDNUMLET; + } + + if (attr == 0) + /* other */ + attr |= 1 << WBP_OTHER; + + return attr; +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + if (attr != 1 << WBP_OTHER) + { + fprintf (stream, "0x%04X", i); + if (attr & (1 << WBP_CR)) + fprintf (stream, " CR"); + if (attr & (1 << WBP_LF)) + fprintf (stream, " LF"); + if (attr & (1 << WBP_NEWLINE)) + fprintf (stream, " Newline"); + if (attr & (1 << WBP_EXTEND)) + fprintf (stream, " Extend"); + if (attr & (1 << WBP_FORMAT)) + fprintf (stream, " Format"); + if (attr & (1 << WBP_KATAKANA)) + fprintf (stream, " Katakana"); + if (attr & (1 << WBP_ALETTER)) + fprintf (stream, " ALetter"); + if (attr & (1 << WBP_MIDNUMLET)) + fprintf (stream, " MidNumLet"); + if (attr & (1 << WBP_MIDLETTER)) + fprintf (stream, " MidLetter"); + if (attr & (1 << WBP_MIDNUM)) + fprintf (stream, " MidNum"); + if (attr & (1 << WBP_NUMERIC)) + fprintf (stream, " Numeric"); + if (attr & (1 << WBP_EXTENDNUMLET)) + fprintf (stream, " ExtendNumLet"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } - unicodedata_filename = argv[1]; - proplist_filename = argv[2]; - derivedproplist_filename = argv[3]; - scripts_filename = argv[4]; - blocks_filename = argv[5]; - proplist30_filename = argv[6]; - eastasianwidth_filename = argv[7]; - linebreak_filename = argv[8]; - version = argv[9]; + debug_output_wbp (stream); - fill_attributes (unicodedata_filename); - clear_properties (); - fill_properties (proplist_filename); - fill_properties (derivedproplist_filename); - fill_properties30 (proplist30_filename); - fill_scripts (scripts_filename); - fill_blocks (blocks_filename); - fill_width (eastasianwidth_filename); - fill_org_lbp (linebreak_filename); + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} - output_categories (version); - output_category ("unictype/categ_of.h", version); - output_combclass ("unictype/combining.h", version); - output_bidi_category ("unictype/bidi_of.h", version); - output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); - output_decimal_digit ("unictype/decdigit.h", version); - output_digit_test ("../tests/unictype/test-digit.h", version); - output_digit ("unictype/digit.h", version); - output_numeric_test ("../tests/unictype/test-numeric.h", version); - output_numeric ("unictype/numeric.h", version); - output_mirror ("unictype/mirror.h", version); - output_properties (version); - output_scripts (version); - output_scripts_byname (version); - output_blocks (version); - output_ident_properties (version); - output_old_ctype (version); +/* The word break property from the WordBreakProperty.txt file. */ +int unicode_org_wbp[0x110000]; - debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); - debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); - output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); +/* Stores in unicode_org_wbp[] the word break property from the + WordBreakProperty.txt file. */ +static void +fill_org_wbp (const char *wordbreakproperty_filename) +{ + unsigned int i; + FILE *stream; + + for (i = 0; i < 0x110000; i++) + unicode_org_wbp[i] = WBP_OTHER; + + stream = fopen (wordbreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + wordbreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, + wordbreakproperty_filename); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_wbp[i] = propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename); + exit (1); + } +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_org_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int propvalue = unicode_org_wbp[i]; + if (propvalue != WBP_OTHER) + { + fprintf (stream, "0x%04X", i); +#define PROP(name,value) \ + if (propvalue == value) fprintf (stream, " " name); else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + fprintf (stream, " ??"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_org_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_org_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE wbp_table +#define ELEMENT unsigned char +#define DEFAULT WBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_wbp (FILE *stream) +{ + unsigned int i; + struct wbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + t.p = 7; + t.q = 9; + wbp_table_init (&t); + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + + /* Now attr should contain exactly one bit. */ + if (attr == 0 || ((attr & (attr - 1)) != 0)) + abort (); + + if (attr != 1 << WBP_OTHER) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + + wbp_table_add (&t, i, log2_attr); + } + } + + wbp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define wbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "\n"); + fprintf (stream, "typedef struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "wbrkprop_t;\n"); + fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; + const char *value_string; + switch (value) + { +#define CASE(x) case x: value_string = #x; break; + CASE(WBP_OTHER); + CASE(WBP_CR); + CASE(WBP_LF); + CASE(WBP_NEWLINE); + CASE(WBP_EXTEND); + CASE(WBP_FORMAT); + CASE(WBP_KATAKANA); + CASE(WBP_ALETTER); + CASE(WBP_MIDNUMLET); + CASE(WBP_MIDLETTER); + CASE(WBP_MIDNUM); + CASE(WBP_NUMERIC); + CASE(WBP_EXTENDNUMLET); +#undef CASE + default: + abort (); + } + if (i > 0 && (i % 4) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %s%s", value_string, + (i+1 < t.level3_size << t.p ? "," : "")); + } + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); +} + +static void +output_wbrk_tables (const char *filename, const char *version) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + output_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Maximum number of characters into which a single Unicode character can be + decomposed. */ +#define MAX_DECOMP_LENGTH 18 + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* An encircled form. */ + UC_DECOMP_SUPER, /* A superscript form. */ + UC_DECOMP_SUB, /* A subscript form. */ + UC_DECOMP_VERTICAL,/* A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* A vulgar fraction form. */ + UC_DECOMP_COMPAT /* Otherwise unspecified compatibility character. */ +}; + +/* Return the decomposition for a Unicode character (ignoring Hangul Jamo + decompositions). Return the type, or -1 for none. */ +static int +get_decomposition (unsigned int ch, + unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) +{ + const char *decomposition = unicode_attributes[ch].decomposition; + + if (decomposition != NULL && decomposition[0] != '\0') + { + int type = UC_DECOMP_CANONICAL; + unsigned int length; + char *endptr; + + if (decomposition[0] == '<') + { + const char *rangle; + size_t typelen; + + rangle = strchr (decomposition + 1, '>'); + if (rangle == NULL) + abort (); + typelen = rangle + 1 - decomposition; +#define TYPE(t1,t2) \ + if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ + type = t2; \ + else + TYPE ("", UC_DECOMP_FONT) + TYPE ("", UC_DECOMP_NOBREAK) + TYPE ("", UC_DECOMP_INITIAL) + TYPE ("", UC_DECOMP_MEDIAL) + TYPE ("", UC_DECOMP_FINAL) + TYPE ("", UC_DECOMP_ISOLATED) + TYPE ("", UC_DECOMP_CIRCLE) + TYPE ("", UC_DECOMP_SUPER) + TYPE ("", UC_DECOMP_SUB) + TYPE ("", UC_DECOMP_VERTICAL) + TYPE ("", UC_DECOMP_WIDE) + TYPE ("", UC_DECOMP_NARROW) + TYPE ("", UC_DECOMP_SMALL) + TYPE ("", UC_DECOMP_SQUARE) + TYPE ("", UC_DECOMP_FRACTION) + TYPE ("", UC_DECOMP_COMPAT) + { + fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); + exit (1); + } +#undef TYPE + decomposition = rangle + 1; + if (decomposition[0] == ' ') + decomposition++; + } + for (length = 0; length < MAX_DECOMP_LENGTH; length++) + { + decomposed[length] = strtoul (decomposition, &endptr, 16); + if (endptr == decomposition) + break; + decomposition = endptr; + if (decomposition[0] == ' ') + decomposition++; + } + if (*decomposition != '\0') + /* MAX_DECOMP_LENGTH is too small. */ + abort (); + + *lengthp = length; + return type; + } + else + return -1; +} + +/* Construction of sparse 3-level tables. */ +#define TABLE decomp_table +#define ELEMENT uint16_t +#define DEFAULT (uint16_t)(-1) +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_decomposition (FILE *stream1, FILE *stream2) +{ + struct decomp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + unsigned int offset; + unsigned int ch; + unsigned int i; + + t.p = 5; + t.q = 5; + decomp_table_init (&t); + + fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n"); + fprintf (stream1, "\n"); + fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{"); + offset = 0; + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type >= 0) + { + if (!(offset < (1 << 15))) + abort (); + decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); + + /* Produce length 3-bytes entries. */ + if (length == 0) + /* We would need a special representation of zero-length entries. */ + abort (); + for (i = 0; i < length; i++) + { + if (offset > 0) + fprintf (stream2, ","); + if ((offset % 4) == 0) + fprintf (stream2, "\n "); + if (!(decomposed[i] < (1 << 18))) + abort (); + fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", + (((i+1 < length ? (1 << 23) : 0) + | (i == 0 ? (type << 18) : 0) + | decomposed[i]) >> 16) & 0xff, + (decomposed[i] >> 8) & 0xff, + decomposed[i] & 0xff); + offset++; + } + } + } + + fprintf (stream2, "\n};\n"); + fprintf (stream2, "\n"); + + decomp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream1, "#define decomp_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream1, "\n"); + fprintf (stream1, "typedef struct\n"); + fprintf (stream1, " {\n"); + fprintf (stream1, " int level1[%zu];\n", t.level1_size); + fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream1, " }\n"); + fprintf (stream1, "decomp_index_table_t;\n"); + fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n"); + fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n"); + fprintf (stream2, "{\n"); + fprintf (stream2, " {"); + if (t.level1_size > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); + } + if (t.level1_size > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (uint16_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + uint16_t value = ((uint16_t *) (t.result + level3_offset))[i]; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value); + if (i+1 < t.level3_size << t.p) + fprintf (stream2, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " }\n"); + fprintf (stream2, "};\n"); +} + +static void +output_decomposition_tables (const char *filename1, const char *filename2, const char *version) +{ + const char *filenames[2]; + FILE *streams[2]; + size_t i; + + filenames[0] = filename1; + filenames[1] = filename2; + + for (i = 0; i < 2; i++) + { + streams[i] = fopen (filenames[i], "w"); + if (streams[i] == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } + } + + for (i = 0; i < 2; i++) + { + FILE *stream = streams[i]; + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Decomposition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + } + + output_decomposition (streams[0], streams[1]); + + for (i = 0; i < 2; i++) + { + if (ferror (streams[i]) || fclose (streams[i])) + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } + } +} + +/* The "excluded from composition" property from the CompositionExclusions.txt file. */ +char unicode_composition_exclusions[0x110000]; + +static void +fill_composition_exclusions (const char *compositionexclusions_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (compositionexclusions_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename); + exit (1); + } + + for (i = 0; i < 0x110000; i++) + unicode_composition_exclusions[i] = 0; + + for (;;) + { + char buf[200+1]; + unsigned int i; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X", &i) != 1) + { + fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); + exit (1); + } + if (!(i < 0x110000)) + abort (); + + unicode_composition_exclusions[i] = 1; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename); + exit (1); + } +} + +static void +debug_output_composition_tables (const char *filename) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", + code1, + code2, + combined, + unicode_attributes[code2].combining); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_composition_tables (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + /* The composition table is a set of mappings (code1, code2) -> combined, + with 928 entries, + 367 values for code1 (from 0x003C to 0x30FD), + 54 values for code2 (from 0x0300 to 0x309A). + For a fixed code1, there are from 1 to 19 possible values for code2. + For a fixed code2, there are from 1 to 117 possible values for code1. + This is a very sparse matrix. + + We want an O(1) hash lookup. + + We could implement the hash lookup by mapping (code1, code2) to a linear + combination mul1*code1 + mul2*code2, which is then used as an index into + a 3-level table. But this leads to a table of size 37 KB. + + We use gperf to implement the hash lookup, giving it the 928 sets of + 4 bytes (code1, code2) as input. gperf generates a hash table of size + 1527, which is quite good (60% filled). It requires an auxiliary table + lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ + + fprintf (stream, "struct composition_rule { char codes[4]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name codes\n"); + fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + if (!(code1 < 0x10000)) + abort (); + if (!(code2 < 0x10000)) + abort (); + if (!(combined < 0x10000)) + abort (); + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 8) & 0xff, code2 & 0xff, + combined); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Output the test for a simple character mapping table to the given file. */ + +static void +output_simple_mapping_test (const char *filename, + const char *function_name, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode character mapping functions.\n"); + fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + fprintf (stream, "#include \"test-mapping-part1.h\"\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int value = func (ch); + + if (value != ch) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + fprintf (stream, "\n"); + fprintf (stream, "#define MAP(c) %s (c)\n", function_name); + fprintf (stream, "#include \"test-mapping-part2.h\"\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE mapping_table +#define ELEMENT int32_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output a simple character mapping table to the given file. */ + +static void +output_simple_mapping (const char *filename, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct mapping_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + mapping_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = (int) func (ch) - (int) ch; + + mapping_table_add (&t, ch, value); + } + + mapping_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define mapping_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_mapping =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* A special casing context. + A context is negated through x -> -x. */ +enum +{ + SCC_ALWAYS = 0, + SCC_FINAL_SIGMA, + SCC_AFTER_SOFT_DOTTED, + SCC_MORE_ABOVE, + SCC_BEFORE_DOT, + SCC_AFTER_I +}; + +/* A special casing rule. */ +struct special_casing_rule +{ + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + unsigned int casefold_mapping[3]; + const char *language; + int context; +}; + +/* The special casing rules. */ +struct special_casing_rule **casing_rules; +unsigned int num_casing_rules; +unsigned int allocated_casing_rules; + +static void +add_casing_rule (struct special_casing_rule *new_rule) +{ + if (num_casing_rules == allocated_casing_rules) + { + allocated_casing_rules = 2 * allocated_casing_rules; + if (allocated_casing_rules < 16) + allocated_casing_rules = 16; + casing_rules = + (struct special_casing_rule **) + realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *)); + } + casing_rules[num_casing_rules++] = new_rule; +} + +/* Stores in casing_rules the special casing rules found in + specialcasing_filename. */ +static void +fill_casing_rules (const char *specialcasing_filename) +{ + FILE *stream; + + stream = fopen (specialcasing_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename); + exit (1); + } + + casing_rules = NULL; + num_casing_rules = 0; + allocated_casing_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + char *language; + int context; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan lower mapping. */ + for (i = 0; i < 3; i++) + lower_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + lower_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan title mapping. */ + for (i = 0; i < 3; i++) + title_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + title_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan upper mapping. */ + for (i = 0; i < 3; i++) + upper_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + upper_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan language and context. */ + language = NULL; + context = SCC_ALWAYS; + while (*scanptr == ' ') + scanptr++; + if (*scanptr != '\0' && *scanptr != '#') + { + const char *word_begin = scanptr; + const char *word_end; + + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + + while (*scanptr == ' ') + scanptr++; + + if (word_end - word_begin == 2) + { + language = (char *) malloc ((word_end - word_begin) + 1); + memcpy (language, word_begin, 2); + language[word_end - word_begin] = '\0'; + word_begin = word_end = NULL; + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + word_begin = scanptr; + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + } + } + + if (word_end > word_begin) + { + bool negate = false; + + if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0) + { + word_begin += 4; + negate = true; + } + if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0) + context = SCC_FINAL_SIGMA; + else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0) + context = SCC_AFTER_SOFT_DOTTED; + else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0) + context = SCC_MORE_ABOVE; + else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0) + context = SCC_BEFORE_DOT; + else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0) + context = SCC_AFTER_I; + else + { + fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename); + exit (1); + } + if (negate) + context = - context; + } + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + } + + /* Store the rule. */ + { + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + new_rule->code = code; + new_rule->language = language; + new_rule->context = context; + memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping)); + + add_casing_rule (new_rule); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", specialcasing_filename); + exit (1); + } +} + +/* A casefolding rule. */ +struct casefold_rule +{ + unsigned int code; + unsigned int mapping[3]; + const char *language; +}; + +/* The casefolding rules. */ +struct casefold_rule **casefolding_rules; +unsigned int num_casefolding_rules; +unsigned int allocated_casefolding_rules; + +/* Stores in casefolding_rules the case folding rules found in + casefolding_filename. */ +static void +fill_casefolding_rules (const char *casefolding_filename) +{ + FILE *stream; + + stream = fopen (casefolding_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename); + exit (1); + } + + casefolding_rules = NULL; + num_casefolding_rules = 0; + allocated_casefolding_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + char type; + unsigned int mapping[3]; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan type. */ + while (*scanptr == ' ') + scanptr++; + + switch (*scanptr) + { + case 'C': case 'F': case 'S': case 'T': + type = *scanptr; + break; + default: + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan casefold mapping. */ + for (i = 0; i < 3; i++) + mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */ + if (type != 'S') + { + const char * const *languages; + unsigned int languages_count; + + /* Type 'T' indicates that the rule is applicable to Turkish + languages only. */ + if (type == 'T') + { + static const char * const turkish_languages[] = { "tr", "az" }; + languages = turkish_languages; + languages_count = 2; + } + else + { + static const char * const all_languages[] = { NULL }; + languages = all_languages; + languages_count = 1; + } + + for (i = 0; i < languages_count; i++) + { + /* Store a new rule. */ + struct casefold_rule *new_rule = + (struct casefold_rule *) malloc (sizeof (struct casefold_rule)); + new_rule->code = code; + memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping)); + new_rule->language = languages[i]; + + if (num_casefolding_rules == allocated_casefolding_rules) + { + allocated_casefolding_rules = 2 * allocated_casefolding_rules; + if (allocated_casefolding_rules < 16) + allocated_casefolding_rules = 16; + casefolding_rules = + (struct casefold_rule **) + realloc (casefolding_rules, + allocated_casefolding_rules * sizeof (struct casefold_rule *)); + } + casefolding_rules[num_casefolding_rules++] = new_rule; + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", casefolding_filename); + exit (1); + } +} + +/* Casefold mapping, when it maps to a single character. */ +unsigned int unicode_casefold[0x110000]; + +static unsigned int +to_casefold (unsigned int ch) +{ + return unicode_casefold[ch]; +} + +/* Redistribute the casefolding_rules: + - Rules that map to a single character, language independently, are stored + in unicode_casefold. + - Other rules are merged into casing_rules. */ +static void +redistribute_casefolding_rules (void) +{ + unsigned int ch, i, j; + + /* Fill unicode_casefold[]. */ + for (ch = 0; ch < 0x110000; ch++) + unicode_casefold[ch] = ch; + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (cfrule->language == NULL && cfrule->mapping[1] == 0) + { + ch = cfrule->code; + if (!(ch < 0x110000)) + abort (); + unicode_casefold[ch] = cfrule->mapping[0]; + } + } + + /* Extend the special casing rules by filling in their casefold_mapping[] + field. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + unsigned int k; + + rule->casefold_mapping[0] = to_casefold (rule->code); + for (k = 1; k < 3; k++) + rule->casefold_mapping[k] = 0; + } + + /* Now merge the other casefolding rules into casing_rules. */ + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (!(cfrule->language == NULL && cfrule->mapping[1] == 0)) + { + /* Find a rule that applies to the same code, same language, and it + has context SCC_ALWAYS. At the same time, update all rules that + have the same code and same or more specific language. */ + struct special_casing_rule *found_rule = NULL; + + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && (cfrule->language == NULL + || (rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0))) + { + memcpy (rule->casefold_mapping, cfrule->mapping, + sizeof (rule->casefold_mapping)); + + if ((cfrule->language == NULL + ? rule->language == NULL + : rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0) + && rule->context == SCC_ALWAYS) + { + /* Found it. */ + found_rule = rule; + } + } + } + + if (found_rule == NULL) + { + /* Create a new rule. */ + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + + /* Try to find a rule that applies to the same code, no language + restriction, and with context SCC_ALWAYS. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && rule->context == SCC_ALWAYS + && rule->language == NULL) + { + /* Found it. */ + found_rule = rule; + break; + } + } + + new_rule->code = cfrule->code; + new_rule->language = cfrule->language; + new_rule->context = SCC_ALWAYS; + if (found_rule != NULL) + { + memcpy (new_rule->lower_mapping, found_rule->lower_mapping, + sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, found_rule->title_mapping, + sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, found_rule->upper_mapping, + sizeof (new_rule->upper_mapping)); + } + else + { + unsigned int k; + + new_rule->lower_mapping[0] = to_lower (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->lower_mapping[k] = 0; + new_rule->title_mapping[0] = to_title (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->title_mapping[k] = 0; + new_rule->upper_mapping[0] = to_upper (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->upper_mapping[k] = 0; + } + memcpy (new_rule->casefold_mapping, cfrule->mapping, + sizeof (new_rule->casefold_mapping)); + + add_casing_rule (new_rule); + } + } + } +} + +static int +compare_casing_rules (const void *a, const void *b) +{ + struct special_casing_rule *a_rule = *(struct special_casing_rule **) a; + struct special_casing_rule *b_rule = *(struct special_casing_rule **) b; + unsigned int a_code = a_rule->code; + unsigned int b_code = b_rule->code; + + if (a_code < b_code) + return -1; + if (a_code > b_code) + return 1; + + /* Sort the more specific rules before the more general ones. */ + return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0)) + + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0))); +} + +static void +sort_casing_rules (void) +{ + /* Sort the rules 1. by code, 2. by specificity. */ + if (num_casing_rules > 1) + qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *), + compare_casing_rules); +} + +/* Output the special casing rules. */ +static void +output_casing_rules (const char *filename, const char *version) +{ + FILE *stream; + unsigned int i, j; + unsigned int minor; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Special casing rules of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "struct special_casing_rule { char code[3]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name code\n"); + fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + minor = 0; + for (i = 0; i < num_casing_rules; i++) + { + struct special_casing_rule *rule = casing_rules[i]; + int context; + + if (i > 0 && rule->code == casing_rules[i - 1]->code) + minor += 1; + else + minor = 0; + + if (!(rule->code < 0x10000)) + { + fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code); + exit (1); + } + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ", + (rule->code >> 8) & 0xff, rule->code & 0xff, minor); + + fprintf (stream, "%d, ", + i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0); + + context = rule->context; + if (context < 0) + { + fprintf (stream, "-"); + context = - context; + } + else + fprintf (stream, " "); + switch (context) + { + case SCC_ALWAYS: + fprintf (stream, "SCC_ALWAYS "); + break; + case SCC_FINAL_SIGMA: + fprintf (stream, "SCC_FINAL_SIGMA "); + break; + case SCC_AFTER_SOFT_DOTTED: + fprintf (stream, "SCC_AFTER_SOFT_DOTTED"); + break; + case SCC_MORE_ABOVE: + fprintf (stream, "SCC_MORE_ABOVE "); + break; + case SCC_BEFORE_DOT: + fprintf (stream, "SCC_BEFORE_DOT "); + break; + case SCC_AFTER_I: + fprintf (stream, "SCC_AFTER_I "); + break; + default: + abort (); + } + fprintf (stream, ", "); + + if (rule->language != NULL) + { + if (strlen (rule->language) != 2) + abort (); + fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]); + } + else + fprintf (stream, "{ '\\0', '\\0' }, "); + + fprintf (stream, "{ "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->upper_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->upper_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->upper_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->lower_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->lower_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->lower_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->title_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->title_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->title_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->casefold_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->casefold_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->casefold_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }\n"); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +int +main (int argc, char * argv[]) +{ + const char *unicodedata_filename; + const char *proplist_filename; + const char *derivedproplist_filename; + const char *scripts_filename; + const char *blocks_filename; + const char *proplist30_filename; + const char *eastasianwidth_filename; + const char *linebreak_filename; + const char *wordbreakproperty_filename; + const char *compositionexclusions_filename; + const char *specialcasing_filename; + const char *casefolding_filename; + const char *version; + + if (argc != 14) + { + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + argv[0]); + exit (1); + } + + unicodedata_filename = argv[1]; + proplist_filename = argv[2]; + derivedproplist_filename = argv[3]; + scripts_filename = argv[4]; + blocks_filename = argv[5]; + proplist30_filename = argv[6]; + eastasianwidth_filename = argv[7]; + linebreak_filename = argv[8]; + wordbreakproperty_filename = argv[9]; + compositionexclusions_filename = argv[10]; + specialcasing_filename = argv[11]; + casefolding_filename = argv[12]; + version = argv[13]; + + fill_attributes (unicodedata_filename); + clear_properties (); + fill_properties (proplist_filename); + fill_properties (derivedproplist_filename); + fill_properties30 (proplist30_filename); + fill_scripts (scripts_filename); + fill_blocks (blocks_filename); + fill_width (eastasianwidth_filename); + fill_org_lbp (linebreak_filename); + fill_org_wbp (wordbreakproperty_filename); + fill_composition_exclusions (compositionexclusions_filename); + fill_casing_rules (specialcasing_filename); + fill_casefolding_rules (casefolding_filename); + redistribute_casefolding_rules (); + sort_casing_rules (); + + output_categories (version); + output_category ("unictype/categ_of.h", version); + output_combclass ("unictype/combining.h", version); + output_bidi_category ("unictype/bidi_of.h", version); + output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); + output_decimal_digit ("unictype/decdigit.h", version); + output_digit_test ("../tests/unictype/test-digit.h", version); + output_digit ("unictype/digit.h", version); + output_numeric_test ("../tests/unictype/test-numeric.h", version); + output_numeric ("unictype/numeric.h", version); + output_mirror ("unictype/mirror.h", version); + output_properties (version); + output_scripts (version); + output_scripts_byname (version); + output_blocks (version); + output_ident_properties (version); + output_old_ctype (version); + + debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); + debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); + output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); + + debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt"); + debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); + output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); + debug_output_composition_tables ("uninorm/composition.txt"); + output_composition_tables ("uninorm/composition-table.gperf", version); + + output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version); + output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version); + output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version); + output_simple_mapping ("unicase/toupper.h", to_upper, version); + output_simple_mapping ("unicase/tolower.h", to_lower, version); + output_simple_mapping ("unicase/totitle.h", to_title, version); + output_simple_mapping ("unicase/tocasefold.h", to_casefold, version); + output_casing_rules ("unicase/special-casing-table.gperf", version); return 0; } @@ -6351,15 +8312,19 @@ main (int argc, char * argv[]) * compile-command: " gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \ ./gen-uni-tables \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/PropList.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/DerivedCoreProperties.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Scripts.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Blocks.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/EastAsianWidth.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/LineBreak.txt \ - 5.0.0 + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \ + 5.1.0 " * End: */