X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=0eddbb19b3716179d830bcf4e00bba8fd9e6c9df;hb=395f9e6a19b504609aa8b0bd7f8cc55d4547d8dd;hp=170e80754ada0faccbbe8dcdacfe12d2096c1cea;hpb=d60f3b0c6b0f93a601acd1cfd3923f94ca05abb0;p=gnulib.git diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 170e80754..0eddbb19b 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -21,6 +21,7 @@ $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \ /usr/local/share/Unidata/PropList.txt \ /usr/local/share/Unidata/DerivedCoreProperties.txt \ + /usr/local/share/Unidata/ArabicShaping.txt \ /usr/local/share/Unidata/Scripts.txt \ /usr/local/share/Unidata/Blocks.txt \ /usr/local/share/Unidata/PropList-3.0.1.txt \ @@ -31,7 +32,7 @@ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ - 5.1.0 + 6.0.0 */ #include @@ -259,6 +260,7 @@ fill_attributes (const char *unicodedata_filename) field11, field12, field13, field14); } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); @@ -280,6 +282,17 @@ is_category_L (unsigned int ch) } static bool +is_category_LC (unsigned int ch) +{ + /* See PropertyValueAliases.txt. */ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't')); +} + +static bool is_category_Lu (unsigned int ch) { return (unicode_attributes[ch].name != NULL @@ -690,7 +703,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* %s of Unicode characters. */\n", comment); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 4; /* or: 5 */ @@ -803,6 +816,7 @@ output_categories (const char *version) output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \ output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version); CATEGORY (L) + CATEGORY (LC) CATEGORY (Lu) CATEGORY (Ll) CATEGORY (Lt) @@ -845,6 +859,7 @@ output_categories (const char *version) enum { UC_CATEGORY_MASK_L = 0x0000001f, + UC_CATEGORY_MASK_LC = 0x00000007, UC_CATEGORY_MASK_Lu = 0x00000001, UC_CATEGORY_MASK_Ll = 0x00000002, UC_CATEGORY_MASK_Lt = 0x00000004, @@ -894,6 +909,7 @@ general_category_byname (const char *category_name) switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_L; + case 'C': return UC_CATEGORY_MASK_LC; case 'u': return UC_CATEGORY_MASK_Lu; case 'l': return UC_CATEGORY_MASK_Ll; case 't': return UC_CATEGORY_MASK_Lt; @@ -994,7 +1010,7 @@ output_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1158,7 +1174,7 @@ output_combclass (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Combining class of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1498,7 +1514,7 @@ output_bidi_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Bidi categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1655,7 +1671,7 @@ output_decimal_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -1702,7 +1718,7 @@ output_decimal_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1842,7 +1858,7 @@ output_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -1889,7 +1905,7 @@ output_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -2046,7 +2062,7 @@ output_numeric_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -2102,7 +2118,7 @@ output_numeric (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); /* Create table of occurring fractions. */ @@ -2387,7 +2403,7 @@ output_mirror (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Mirrored Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -2491,6 +2507,24 @@ output_mirror (const char *filename, const char *version) /* ========================================================================= */ +/* Particular values of the word break property. */ + +static bool +is_WBP_MIDNUMLET (unsigned int ch) +{ + return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E); +} + +static bool +is_WBP_MIDLETTER (unsigned int ch) +{ + return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A); +} + +/* ========================================================================= */ + /* Properties. */ /* Reading PropList.txt and DerivedCoreProperties.txt. */ @@ -2534,6 +2568,13 @@ enum PROP_ALPHABETIC, PROP_LOWERCASE, PROP_UPPERCASE, + PROP_CASED, + PROP_CASE_IGNORABLE, + PROP_CHANGES_WHEN_LOWERCASED, + PROP_CHANGES_WHEN_UPPERCASED, + PROP_CHANGES_WHEN_TITLECASED, + PROP_CHANGES_WHEN_CASEFOLDED, + PROP_CHANGES_WHEN_CASEMAPPED, PROP_ID_START, PROP_ID_CONTINUE, PROP_XID_START, @@ -2632,6 +2673,13 @@ fill_properties (const char *proplist_filename) PROP ("Alphabetic", PROP_ALPHABETIC) PROP ("Lowercase", PROP_LOWERCASE) PROP ("Uppercase", PROP_UPPERCASE) + PROP ("Cased", PROP_CASED) + PROP ("Case_Ignorable", PROP_CASE_IGNORABLE) + PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED) + PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED) + PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED) + PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED) + PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED) PROP ("ID_Start", PROP_ID_START) PROP ("ID_Continue", PROP_ID_CONTINUE) PROP ("XID_Start", PROP_XID_START) @@ -2728,6 +2776,7 @@ fill_property30 (char array[0x110000], const char *proplist_filename, const char for (i = i1; i <= i2; i++) array[i] = 1; } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", proplist_filename); @@ -2777,6 +2826,7 @@ is_property_alphabetic (unsigned int ch) || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */ + || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */ || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */ || (ch == 0x10341) /* GOTHIC LETTER NINETY */ || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ @@ -2812,7 +2862,10 @@ is_property_default_ignorable_code_point (unsigned int ch) bool result1 = (is_category_Cf (ch) && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ - && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)) + && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F) + /* For some reason, the following are not listed as having property + Default_Ignorable_Code_Point. */ + && !(ch == 0x110BD)) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); bool result2 = @@ -2921,6 +2974,79 @@ is_property_titlecase (unsigned int ch) return is_category_Lt (ch); } +/* See DerivedCoreProperties.txt. */ +static bool +is_property_cased (unsigned int ch) +{ + bool result1 = (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_case_ignorable (unsigned int ch) +{ + bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch) + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_lowercased (unsigned int ch) +{ + bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0); + bool result2 = (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].lower != NONE + && unicode_attributes[ch].lower != ch); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_uppercased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_titlecased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casefolded (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casemapped (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0); +} + /* See PropList.txt, UCD.html. */ static bool is_property_soft_dotted (unsigned int ch) @@ -3425,112 +3551,729 @@ is_property_diacritic (unsigned int ch) return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0); } -/* See PropList.txt, UCD.html. */ -static bool -is_property_extender (unsigned int ch) -{ - return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0); -} +/* See PropList.txt, UCD.html. */ +static bool +is_property_extender (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_ignorable_control (unsigned int ch) +{ + return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN) + || is_category_Cf (ch)) + && ch != 0x0000; +} + +/* ------------------------------------------------------------------------- */ + +/* Output all properties. */ +static void +output_properties (const char *version) +{ +#define PROPERTY(P) \ + debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \ + output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \ + output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version); + PROPERTY(white_space) + PROPERTY(alphabetic) + PROPERTY(other_alphabetic) + PROPERTY(not_a_character) + PROPERTY(default_ignorable_code_point) + PROPERTY(other_default_ignorable_code_point) + PROPERTY(deprecated) + PROPERTY(logical_order_exception) + PROPERTY(variation_selector) + PROPERTY(private_use) + PROPERTY(unassigned_code_value) + PROPERTY(uppercase) + PROPERTY(other_uppercase) + PROPERTY(lowercase) + PROPERTY(other_lowercase) + PROPERTY(titlecase) + PROPERTY(cased) + PROPERTY(case_ignorable) + PROPERTY(changes_when_lowercased) + PROPERTY(changes_when_uppercased) + PROPERTY(changes_when_titlecased) + PROPERTY(changes_when_casefolded) + PROPERTY(changes_when_casemapped) + PROPERTY(soft_dotted) + PROPERTY(id_start) + PROPERTY(other_id_start) + PROPERTY(id_continue) + PROPERTY(other_id_continue) + PROPERTY(xid_start) + PROPERTY(xid_continue) + PROPERTY(pattern_white_space) + PROPERTY(pattern_syntax) + PROPERTY(join_control) + PROPERTY(grapheme_base) + PROPERTY(grapheme_extend) + PROPERTY(other_grapheme_extend) + PROPERTY(grapheme_link) + PROPERTY(bidi_control) + PROPERTY(bidi_left_to_right) + PROPERTY(bidi_hebrew_right_to_left) + PROPERTY(bidi_arabic_right_to_left) + PROPERTY(bidi_european_digit) + PROPERTY(bidi_eur_num_separator) + PROPERTY(bidi_eur_num_terminator) + PROPERTY(bidi_arabic_digit) + PROPERTY(bidi_common_separator) + PROPERTY(bidi_block_separator) + PROPERTY(bidi_segment_separator) + PROPERTY(bidi_whitespace) + PROPERTY(bidi_non_spacing_mark) + PROPERTY(bidi_boundary_neutral) + PROPERTY(bidi_pdf) + PROPERTY(bidi_embedding_or_override) + PROPERTY(bidi_other_neutral) + PROPERTY(hex_digit) + PROPERTY(ascii_hex_digit) + PROPERTY(ideographic) + PROPERTY(unified_ideograph) + PROPERTY(radical) + PROPERTY(ids_binary_operator) + PROPERTY(ids_trinary_operator) + PROPERTY(zero_width) + PROPERTY(space) + PROPERTY(non_break) + PROPERTY(iso_control) + PROPERTY(format_control) + PROPERTY(dash) + PROPERTY(hyphen) + PROPERTY(punctuation) + PROPERTY(line_separator) + PROPERTY(paragraph_separator) + PROPERTY(quotation_mark) + PROPERTY(sentence_terminal) + PROPERTY(terminal_punctuation) + PROPERTY(currency_symbol) + PROPERTY(math) + PROPERTY(other_math) + PROPERTY(paired_punctuation) + PROPERTY(left_of_pair) + PROPERTY(combining) + PROPERTY(composite) + PROPERTY(decimal_digit) + PROPERTY(numeric) + PROPERTY(diacritic) + PROPERTY(extender) + PROPERTY(ignorable_control) +#undef PROPERTY +} + +/* ========================================================================= */ + +/* Arabic Shaping. */ + +enum +{ + UC_JOINING_TYPE_U, /* Non_Joining */ + UC_JOINING_TYPE_T, /* Transparent */ + UC_JOINING_TYPE_C, /* Join_Causing */ + UC_JOINING_TYPE_L, /* Left_Joining */ + UC_JOINING_TYPE_R, /* Right_Joining */ + UC_JOINING_TYPE_D /* Dual_Joining */ +}; + +static uint8_t unicode_joining_type[0x110000]; + +enum +{ + UC_JOINING_GROUP_NONE, /* No_Joining_Group */ + UC_JOINING_GROUP_AIN, /* Ain */ + UC_JOINING_GROUP_ALAPH, /* Alaph */ + UC_JOINING_GROUP_ALEF, /* Alef */ + UC_JOINING_GROUP_BEH, /* Beh */ + UC_JOINING_GROUP_BETH, /* Beth */ + UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */ + UC_JOINING_GROUP_DAL, /* Dal */ + UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */ + UC_JOINING_GROUP_E, /* E */ + UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */ + UC_JOINING_GROUP_FE, /* Fe */ + UC_JOINING_GROUP_FEH, /* Feh */ + UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */ + UC_JOINING_GROUP_GAF, /* Gaf */ + UC_JOINING_GROUP_GAMAL, /* Gamal */ + UC_JOINING_GROUP_HAH, /* Hah */ + UC_JOINING_GROUP_HE, /* He */ + UC_JOINING_GROUP_HEH, /* Heh */ + UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */ + UC_JOINING_GROUP_HETH, /* Heth */ + UC_JOINING_GROUP_KAF, /* Kaf */ + UC_JOINING_GROUP_KAPH, /* Kaph */ + UC_JOINING_GROUP_KHAPH, /* Khaph */ + UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */ + UC_JOINING_GROUP_LAM, /* Lam */ + UC_JOINING_GROUP_LAMADH, /* Lamadh */ + UC_JOINING_GROUP_MEEM, /* Meem */ + UC_JOINING_GROUP_MIM, /* Mim */ + UC_JOINING_GROUP_NOON, /* Noon */ + UC_JOINING_GROUP_NUN, /* Nun */ + UC_JOINING_GROUP_NYA, /* Nya */ + UC_JOINING_GROUP_PE, /* Pe */ + UC_JOINING_GROUP_QAF, /* Qaf */ + UC_JOINING_GROUP_QAPH, /* Qaph */ + UC_JOINING_GROUP_REH, /* Reh */ + UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */ + UC_JOINING_GROUP_SAD, /* Sad */ + UC_JOINING_GROUP_SADHE, /* Sadhe */ + UC_JOINING_GROUP_SEEN, /* Seen */ + UC_JOINING_GROUP_SEMKATH, /* Semkath */ + UC_JOINING_GROUP_SHIN, /* Shin */ + UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */ + UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */ + UC_JOINING_GROUP_TAH, /* Tah */ + UC_JOINING_GROUP_TAW, /* Taw */ + UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */ + UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */ + UC_JOINING_GROUP_TETH, /* Teth */ + UC_JOINING_GROUP_WAW, /* Waw */ + UC_JOINING_GROUP_YEH, /* Yeh */ + UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */ + UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */ + UC_JOINING_GROUP_YUDH, /* Yudh */ + UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */ + UC_JOINING_GROUP_ZAIN, /* Zain */ + UC_JOINING_GROUP_ZHAIN /* Zhain */ +}; + +static uint8_t unicode_joining_group[0x110000]; + +static void +fill_arabicshaping (const char *arabicshaping_filename) +{ + FILE *stream; + unsigned int i; + int lineno; + + stream = fopen (arabicshaping_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename); + exit (1); + } + + for (i = 0; i < 0x110000; i++) + { + unicode_joining_type[i] = (uint8_t)~(uint8_t)0; + unicode_joining_group[i] = UC_JOINING_GROUP_NONE; + } + + lineno = 0; + for (;;) + { + char buf[100+1]; + char separator1[100+1]; + char padding1[100+1]; + char schematic_name[100+1]; + char separator2[100+1]; + char padding2[100+1]; + char joining_type_name[100+1]; + char separator3[100+1]; + char padding3[100+1]; + char joining_group_name[100+1]; + int joining_type; + int joining_group; + + lineno++; + if (fscanf (stream, "%100[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]", + &i, separator1, padding1, schematic_name, separator2, + padding2, joining_type_name, separator3, padding3, + joining_group_name) != 10) + { + fprintf (stderr, "parse error in '%s':%d\n", + arabicshaping_filename, lineno); + exit (1); + } + if (i >= 0x110000) + abort (); + +#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name; + if (false) {} + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + else + { + fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n", + joining_type_name, arabicshaping_filename, lineno); + exit (1); + } + + /* Remove trailing spaces. */ + while (joining_group_name[0] != '\0' + && joining_group_name[strlen (joining_group_name) - 1] == ' ') + joining_group_name[strlen (joining_group_name) - 1] = '\0'; + +#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value; + if (false) {} + TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group") + TRY(UC_JOINING_GROUP_AIN, "AIN") + TRY(UC_JOINING_GROUP_ALAPH, "ALAPH") + TRY(UC_JOINING_GROUP_ALEF, "ALEF") + TRY(UC_JOINING_GROUP_BEH, "BEH") + TRY(UC_JOINING_GROUP_BETH, "BETH") + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE") + TRY(UC_JOINING_GROUP_DAL, "DAL") + TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH") + TRY(UC_JOINING_GROUP_E, "E") + TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH") + TRY(UC_JOINING_GROUP_FE, "FE") + TRY(UC_JOINING_GROUP_FEH, "FEH") + TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH") + TRY(UC_JOINING_GROUP_GAF, "GAF") + TRY(UC_JOINING_GROUP_GAMAL, "GAMAL") + TRY(UC_JOINING_GROUP_HAH, "HAH") + TRY(UC_JOINING_GROUP_HE, "HE") + TRY(UC_JOINING_GROUP_HEH, "HEH") + TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL") + TRY(UC_JOINING_GROUP_HETH, "HETH") + TRY(UC_JOINING_GROUP_KAF, "KAF") + TRY(UC_JOINING_GROUP_KAPH, "KAPH") + TRY(UC_JOINING_GROUP_KHAPH, "KHAPH") + TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH") + TRY(UC_JOINING_GROUP_LAM, "LAM") + TRY(UC_JOINING_GROUP_LAMADH, "LAMADH") + TRY(UC_JOINING_GROUP_MEEM, "MEEM") + TRY(UC_JOINING_GROUP_MIM, "MIM") + TRY(UC_JOINING_GROUP_NOON, "NOON") + TRY(UC_JOINING_GROUP_NUN, "NUN") + TRY(UC_JOINING_GROUP_NYA, "NYA") + TRY(UC_JOINING_GROUP_PE, "PE") + TRY(UC_JOINING_GROUP_QAF, "QAF") + TRY(UC_JOINING_GROUP_QAPH, "QAPH") + TRY(UC_JOINING_GROUP_REH, "REH") + TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE") + TRY(UC_JOINING_GROUP_SAD, "SAD") + TRY(UC_JOINING_GROUP_SADHE, "SADHE") + TRY(UC_JOINING_GROUP_SEEN, "SEEN") + TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH") + TRY(UC_JOINING_GROUP_SHIN, "SHIN") + TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF") + TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW") + TRY(UC_JOINING_GROUP_TAH, "TAH") + TRY(UC_JOINING_GROUP_TAW, "TAW") + TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA") + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL") + TRY(UC_JOINING_GROUP_TETH, "TETH") + TRY(UC_JOINING_GROUP_WAW, "WAW") + TRY(UC_JOINING_GROUP_YEH, "YEH") + TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE") + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL") + TRY(UC_JOINING_GROUP_YUDH, "YUDH") + TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE") + TRY(UC_JOINING_GROUP_ZAIN, "ZAIN") + TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN") +#undef TRY + else + { + fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n", + joining_group_name, arabicshaping_filename, lineno); + exit (1); + } + + unicode_joining_type[i] = joining_type; + unicode_joining_group[i] = joining_group; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename); + exit (1); + } +} + +/* Convert a Joining_Type value to a C identifier. */ +static const char * +joining_type_as_c_identifier (int joining_type) +{ +#define TRY(value) if (joining_type == value) return #value; + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + abort (); +} + +static void +output_joining_type_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_type[ch]; + + if (value != (uint8_t)~(uint8_t)0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE joining_type_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_joining_type (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct joining_type_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint8_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + joining_type_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + uint8_t value = unicode_joining_type[ch]; + + joining_type_table_add (&t, ch, value); + } + + joining_type_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define joining_type_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size, + (1 << t.p) * 4 / 8); + fprintf (stream, " }\n"); + fprintf (stream, "u_joining_type =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 4 bits only. */ + level3_packed = + (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 4) / 8; + unsigned int k = (i * 4) % 8; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f; + level3_packed[j] |= (value << k); + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 4 / 8) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Convert a Joining_Group value to a C identifier. */ +static const char * +joining_group_as_c_identifier (int joining_group) +{ +#define TRY(value) if (joining_group == value) return #value; + TRY(UC_JOINING_GROUP_NONE) + TRY(UC_JOINING_GROUP_AIN) + TRY(UC_JOINING_GROUP_ALAPH) + TRY(UC_JOINING_GROUP_ALEF) + TRY(UC_JOINING_GROUP_BEH) + TRY(UC_JOINING_GROUP_BETH) + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE) + TRY(UC_JOINING_GROUP_DAL) + TRY(UC_JOINING_GROUP_DALATH_RISH) + TRY(UC_JOINING_GROUP_E) + TRY(UC_JOINING_GROUP_FARSI_YEH) + TRY(UC_JOINING_GROUP_FE) + TRY(UC_JOINING_GROUP_FEH) + TRY(UC_JOINING_GROUP_FINAL_SEMKATH) + TRY(UC_JOINING_GROUP_GAF) + TRY(UC_JOINING_GROUP_GAMAL) + TRY(UC_JOINING_GROUP_HAH) + TRY(UC_JOINING_GROUP_HE) + TRY(UC_JOINING_GROUP_HEH) + TRY(UC_JOINING_GROUP_HEH_GOAL) + TRY(UC_JOINING_GROUP_HETH) + TRY(UC_JOINING_GROUP_KAF) + TRY(UC_JOINING_GROUP_KAPH) + TRY(UC_JOINING_GROUP_KHAPH) + TRY(UC_JOINING_GROUP_KNOTTED_HEH) + TRY(UC_JOINING_GROUP_LAM) + TRY(UC_JOINING_GROUP_LAMADH) + TRY(UC_JOINING_GROUP_MEEM) + TRY(UC_JOINING_GROUP_MIM) + TRY(UC_JOINING_GROUP_NOON) + TRY(UC_JOINING_GROUP_NUN) + TRY(UC_JOINING_GROUP_NYA) + TRY(UC_JOINING_GROUP_PE) + TRY(UC_JOINING_GROUP_QAF) + TRY(UC_JOINING_GROUP_QAPH) + TRY(UC_JOINING_GROUP_REH) + TRY(UC_JOINING_GROUP_REVERSED_PE) + TRY(UC_JOINING_GROUP_SAD) + TRY(UC_JOINING_GROUP_SADHE) + TRY(UC_JOINING_GROUP_SEEN) + TRY(UC_JOINING_GROUP_SEMKATH) + TRY(UC_JOINING_GROUP_SHIN) + TRY(UC_JOINING_GROUP_SWASH_KAF) + TRY(UC_JOINING_GROUP_SYRIAC_WAW) + TRY(UC_JOINING_GROUP_TAH) + TRY(UC_JOINING_GROUP_TAW) + TRY(UC_JOINING_GROUP_TEH_MARBUTA) + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL) + TRY(UC_JOINING_GROUP_TETH) + TRY(UC_JOINING_GROUP_WAW) + TRY(UC_JOINING_GROUP_YEH) + TRY(UC_JOINING_GROUP_YEH_BARREE) + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL) + TRY(UC_JOINING_GROUP_YUDH) + TRY(UC_JOINING_GROUP_YUDH_HE) + TRY(UC_JOINING_GROUP_ZAIN) + TRY(UC_JOINING_GROUP_ZHAIN) +#undef TRY + abort (); +} + +static void +output_joining_group_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining group of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_group[ch]; + + if (value != UC_JOINING_GROUP_NONE) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_joining_group (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch_min, ch_max, ch, i; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + ch_min = 0x10FFFF; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_min = ch; + break; + } + + ch_max = 0; + for (ch = 0x10FFFF; ch > 0; ch--) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_max = ch; + break; + } + + if (!(ch_min <= ch_max)) + abort (); -/* See PropList-3.0.1.txt. */ -static bool -is_property_ignorable_control (unsigned int ch) -{ - return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN) - || is_category_Cf (ch)) - && ch != 0x0000; -} + /* If the interval [ch_min, ch_max] is too large, we should better use a + 3-level table. */ + if (!(ch_max - ch_min < 0x200)) + abort (); -/* ------------------------------------------------------------------------- */ + fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min); + fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n", + ch_max + 1, ch_min); + fprintf (stream, "{"); + for (i = 0; i <= ch_max - ch_min; i++) + { + const char *s; -/* Output all properties. */ -static void -output_properties (const char *version) -{ -#define PROPERTY(P) \ - debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \ - output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \ - output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version); - PROPERTY(white_space) - PROPERTY(alphabetic) - PROPERTY(other_alphabetic) - PROPERTY(not_a_character) - PROPERTY(default_ignorable_code_point) - PROPERTY(other_default_ignorable_code_point) - PROPERTY(deprecated) - PROPERTY(logical_order_exception) - PROPERTY(variation_selector) - PROPERTY(private_use) - PROPERTY(unassigned_code_value) - PROPERTY(uppercase) - PROPERTY(other_uppercase) - PROPERTY(lowercase) - PROPERTY(other_lowercase) - PROPERTY(titlecase) - PROPERTY(soft_dotted) - PROPERTY(id_start) - PROPERTY(other_id_start) - PROPERTY(id_continue) - PROPERTY(other_id_continue) - PROPERTY(xid_start) - PROPERTY(xid_continue) - PROPERTY(pattern_white_space) - PROPERTY(pattern_syntax) - PROPERTY(join_control) - PROPERTY(grapheme_base) - PROPERTY(grapheme_extend) - PROPERTY(other_grapheme_extend) - PROPERTY(grapheme_link) - PROPERTY(bidi_control) - PROPERTY(bidi_left_to_right) - PROPERTY(bidi_hebrew_right_to_left) - PROPERTY(bidi_arabic_right_to_left) - PROPERTY(bidi_european_digit) - PROPERTY(bidi_eur_num_separator) - PROPERTY(bidi_eur_num_terminator) - PROPERTY(bidi_arabic_digit) - PROPERTY(bidi_common_separator) - PROPERTY(bidi_block_separator) - PROPERTY(bidi_segment_separator) - PROPERTY(bidi_whitespace) - PROPERTY(bidi_non_spacing_mark) - PROPERTY(bidi_boundary_neutral) - PROPERTY(bidi_pdf) - PROPERTY(bidi_embedding_or_override) - PROPERTY(bidi_other_neutral) - PROPERTY(hex_digit) - PROPERTY(ascii_hex_digit) - PROPERTY(ideographic) - PROPERTY(unified_ideograph) - PROPERTY(radical) - PROPERTY(ids_binary_operator) - PROPERTY(ids_trinary_operator) - PROPERTY(zero_width) - PROPERTY(space) - PROPERTY(non_break) - PROPERTY(iso_control) - PROPERTY(format_control) - PROPERTY(dash) - PROPERTY(hyphen) - PROPERTY(punctuation) - PROPERTY(line_separator) - PROPERTY(paragraph_separator) - PROPERTY(quotation_mark) - PROPERTY(sentence_terminal) - PROPERTY(terminal_punctuation) - PROPERTY(currency_symbol) - PROPERTY(math) - PROPERTY(other_math) - PROPERTY(paired_punctuation) - PROPERTY(left_of_pair) - PROPERTY(combining) - PROPERTY(composite) - PROPERTY(decimal_digit) - PROPERTY(numeric) - PROPERTY(diacritic) - PROPERTY(extender) - PROPERTY(ignorable_control) -#undef PROPERTY + ch = ch_min + i; + if ((i % 2) == 0) + fprintf (stream, "\n "); + s = joining_group_as_c_identifier (unicode_joining_group[ch]); + fprintf (stream, " %s", s); + if (i+1 <= ch_max - ch_min) + { + fprintf (stream, ","); + if (((i+1) % 2) != 0) + fprintf (stream, "%*s", 38 - (int) strlen (s), ""); + } + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } } /* ========================================================================= */ @@ -3648,7 +4391,7 @@ output_scripts (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); for (s = 0; s < numscripts; s++) @@ -3825,9 +4568,9 @@ output_scripts_byname (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); - fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n"); + fprintf (stream, "struct named_script { int name; unsigned int index; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define hash-function-name scripts_hash\n"); @@ -3835,6 +4578,8 @@ output_scripts_byname (const char *version) fprintf (stream, "%%readonly-tables\n"); fprintf (stream, "%%global-table\n"); fprintf (stream, "%%define word-array-name script_names\n"); + fprintf (stream, "%%pic\n"); + fprintf (stream, "%%define string-pool-name script_stringpool\n"); fprintf (stream, "%%%%\n"); for (s = 0; s < numscripts; s++) fprintf (stream, "%s, %u\n", scripts[s], s); @@ -3965,7 +4710,7 @@ output_blocks (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode blocks. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "static const uc_block_t blocks[] =\n"); @@ -4382,7 +5127,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Language syntax properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; /* or 8 */ @@ -4927,7 +5672,7 @@ output_tables (const char *filename, const char *version) fprintf (stream, "escape_char /\n"); fprintf (stream, "comment_char %%\n"); fprintf (stream, "\n"); - fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", + fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n", version); fprintf (stream, "\n"); @@ -5135,6 +5880,7 @@ fill_width (const char *width_filename) unicode_width[i] = strdup (field1); } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", width_filename); @@ -5144,54 +5890,281 @@ fill_width (const char *width_filename) /* ========================================================================= */ -/* Line breaking classification. */ +/* Non-spacing attribute and width. */ + +/* The non-spacing attribute table consists of: + - Non-spacing characters; generated from PropList.txt or + "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" + - Format control characters; generated from + "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" + - Zero width characters; generated from + "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" + */ + +static bool +is_nonspacing (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (get_bidi_category (ch) == UC_BIDI_NSM + || is_category_Cc (ch) || is_category_Cf (ch) + || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0)); +} + +static void +output_nonspacing_property (const char *filename) +{ + FILE *stream; + int ind[0x110000 / 0x200]; + unsigned int i; + unsigned int i_max; + int next_ind; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + next_ind = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = false; + unsigned int ch; + + if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */ + for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++) + if (is_nonspacing (ch)) + { + nontrivial = true; + break; + } + if (nontrivial) + ind[i] = next_ind++; + else + ind[i] = -1; + } + + fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n", + next_ind); + i_max = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = (ind[i] >= 0); + + if (nontrivial) + { + unsigned int j; + + fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1); + for (j = 0; j < 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + unsigned int l; + unsigned char bits = 0; + + for (l = 0; l < 8; l++) + { + unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l; + + if (is_nonspacing (ch)) + bits |= 1 << l; + } + fprintf (stream, " 0x%02x%c", bits, + ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1); + } + i_max = i; + } + } + fprintf (stream, "};\n"); + + i_max = ((i_max + 8 - 1) / 8) * 8; + fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n", + i_max); + { + unsigned int j; + + for (j = 0; j < i_max / 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + i = j * 8 + k; + fprintf (stream, " %2d%c", ind[i], + j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1); + } + } + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */ +static char +symbolic_width (unsigned int ch) +{ + /* Test for unassigned character. */ + if (is_property_unassigned_code_value (ch)) + { + /* Unicode TR#11 section "Unassigned and Private-Use Characters". */ + if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */ + return 'A'; + if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */ + || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */ + || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */) + return '2'; + return 0; + } + else + { + /* Test for non-spacing or control character. */ + if (is_category_Cc (ch) && ch < 0x00A0) + return 0; + if (is_nonspacing (ch)) + return '0'; + /* Test for double-width character. */ + if (unicode_width[ch] != NULL + && (strcmp (unicode_width[ch], "W") == 0 + || strcmp (unicode_width[ch], "F") == 0)) + return '2'; + /* Test for half-width character. */ + if (unicode_width[ch] != NULL + && strcmp (unicode_width[ch], "H") == 0) + return '1'; + } + /* In ancient CJK encodings, Cyrillic and most other characters are + double-width as well. */ + if (ch >= 0x00A1 && ch < 0x10000) + return 'A'; + return '1'; +} + +static void +output_width_property_test (const char *filename) +{ + FILE *stream; + unsigned int interval_start, interval_end, ch; + char interval_value; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + interval_value = 0; + interval_start = interval_end = 0; /* avoid GCC warning */ + for (ch = 0; ch < 0x110000; ch++) + { + char value = symbolic_width (ch); + if (value != 0) /* skip Cc control characters and unassigned characters */ + { + if (value == interval_value) + /* Extend the interval. */ + interval_end = ch; + else + { + /* Terminate the interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + /* Start a new interval. */ + interval_start = interval_end = ch; + interval_value = value; + } + } + } + /* Terminate the last interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Line breaking classification. + Updated for Unicode TR #14 revision 26. */ enum { - /* Values >= 24 are resolved at run time. */ - LBP_BK = 24, /* mandatory break */ + /* Values >= 25 are resolved at run time. */ + LBP_BK = 25, /* mandatory break */ /*LBP_CR, carriage return - not used here because it's a DOSism */ /*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 25, /* attached characters and combining marks */ + LBP_CM = 26, /* attached characters and combining marks */ /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ /*LBP_SG, surrogates - not used here because they are not characters */ LBP_WJ = 0, /* word joiner */ - LBP_ZW = 26, /* zero width space */ + LBP_ZW = 27, /* zero width space */ LBP_GL = 1, /* non-breaking (glue) */ - LBP_SP = 27, /* space */ + LBP_SP = 28, /* space */ LBP_B2 = 2, /* break opportunity before and after */ LBP_BA = 3, /* break opportunity after */ LBP_BB = 4, /* break opportunity before */ LBP_HY = 5, /* hyphen */ - LBP_CB = 28, /* contingent break opportunity */ + LBP_CB = 29, /* contingent break opportunity */ LBP_CL = 6, /* closing punctuation */ - LBP_EX = 7, /* exclamation/interrogation */ - LBP_IN = 8, /* inseparable */ - LBP_NS = 9, /* non starter */ - LBP_OP = 10, /* opening punctuation */ - LBP_QU = 11, /* ambiguous quotation */ - LBP_IS = 12, /* infix separator (numeric) */ - LBP_NU = 13, /* numeric */ - LBP_PO = 14, /* postfix (numeric) */ - LBP_PR = 15, /* prefix (numeric) */ - LBP_SY = 16, /* symbols allowing breaks */ - LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */ - LBP_AL = 17, /* ordinary alphabetic and symbol characters */ - LBP_H2 = 18, /* Hangul LV syllable */ - LBP_H3 = 19, /* Hangul LVT syllable */ - LBP_ID = 20, /* ideographic */ - LBP_JL = 21, /* Hangul L Jamo */ - LBP_JV = 22, /* Hangul V Jamo */ - LBP_JT = 23, /* Hangul T Jamo */ - LBP_SA = 30, /* complex context (South East Asian) */ - LBP_XX = 31 /* unknown */ + LBP_CP = 7, /* closing parenthesis */ + LBP_EX = 8, /* exclamation/interrogation */ + LBP_IN = 9, /* inseparable */ + LBP_NS = 10, /* non starter */ + LBP_OP = 11, /* opening punctuation */ + LBP_QU = 12, /* ambiguous quotation */ + LBP_IS = 13, /* infix separator (numeric) */ + LBP_NU = 14, /* numeric */ + LBP_PO = 15, /* postfix (numeric) */ + LBP_PR = 16, /* prefix (numeric) */ + LBP_SY = 17, /* symbols allowing breaks */ + LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */ + LBP_AL = 18, /* ordinary alphabetic and symbol characters */ + LBP_H2 = 19, /* Hangul LV syllable */ + LBP_H3 = 20, /* Hangul LVT syllable */ + LBP_ID = 21, /* ideographic */ + LBP_JL = 22, /* Hangul L Jamo */ + LBP_JV = 23, /* Hangul V Jamo */ + LBP_JT = 24, /* Hangul T Jamo */ + LBP_SA = 31, /* complex context (South East Asian) */ + LBP_XX = 32 /* unknown */ }; /* Returns the line breaking classification for ch, as a bit mask. */ -static int +static int64_t get_lbp (unsigned int ch) { - int attr = 0; + int64_t attr = 0; if (unicode_attributes[ch].name != NULL) { @@ -5201,15 +6174,15 @@ get_lbp (unsigned int ch) || ch == 0x000B /* line tabulation */ || ch == 0x2028 /* LINE SEPARATOR */ || ch == 0x2029 /* PARAGRAPH SEPARATOR */) - attr |= 1 << LBP_BK; + attr |= (int64_t) 1 << LBP_BK; if (ch == 0x2060 /* WORD JOINER */ || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) - attr |= 1 << LBP_WJ; + attr |= (int64_t) 1 << LBP_WJ; /* zero width space */ if (ch == 0x200B /* ZERO WIDTH SPACE */) - attr |= 1 << LBP_ZW; + attr |= (int64_t) 1 << LBP_ZW; /* non-breaking (glue) */ if (ch == 0x00A0 /* NO-BREAK SPACE */ @@ -5221,19 +6194,23 @@ get_lbp (unsigned int ch) || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ - || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */) - attr |= 1 << LBP_GL; + || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */ + || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */) + attr |= (int64_t) 1 << LBP_GL; /* space */ if (ch == 0x0020 /* SPACE */) - attr |= 1 << LBP_SP; + attr |= (int64_t) 1 << LBP_SP; /* break opportunity before and after */ if (ch == 0x2014 /* EM DASH */) - attr |= 1 << LBP_B2; + attr |= (int64_t) 1 << LBP_B2; /* break opportunity after */ - if (ch == 0x1680 /* OGHAM SPACE MARK */ + if (/* Breaking Spaces */ + ch == 0x1680 /* OGHAM SPACE MARK */ || ch == 0x2000 /* EN QUAD */ || ch == 0x2001 /* EM QUAD */ || ch == 0x2002 /* EN SPACE */ @@ -5245,12 +6222,17 @@ get_lbp (unsigned int ch) || ch == 0x2009 /* THIN SPACE */ || ch == 0x200A /* HAIR SPACE */ || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ + /* Tabs */ || ch == 0x0009 /* tab */ + /* Conditional Hyphens */ || ch == 0x00AD /* SOFT HYPHEN */ + /* Breaking Hyphens */ || ch == 0x058A /* ARMENIAN HYPHEN */ + || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */ || ch == 0x2010 /* HYPHEN */ || ch == 0x2012 /* FIGURE DASH */ || ch == 0x2013 /* EN DASH */ + /* Visible Word Dividers */ || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ || ch == 0x1361 /* ETHIOPIC WORDSPACE */ @@ -5258,6 +6240,7 @@ get_lbp (unsigned int ch) || ch == 0x17DA /* KHMER SIGN KOOMUUT */ || ch == 0x2027 /* HYPHENATION POINT */ || ch == 0x007C /* VERTICAL LINE */ + /* Historic Word Separators */ || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ @@ -5274,6 +6257,7 @@ get_lbp (unsigned int ch) || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ || ch == 0x2E30 /* RING POINT */ + || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */ || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ || ch == 0x10102 /* AEGEAN CHECK MARK */ @@ -5281,6 +6265,7 @@ get_lbp (unsigned int ch) || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ + /* Dandas */ || ch == 0x0964 /* DEVANAGARI DANDA */ || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ @@ -5300,17 +6285,18 @@ get_lbp (unsigned int ch) || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ + /* Tibetan */ || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ || ch == 0x0F85 /* TIBETAN MARK PALUTA */ || ch == 0x0FBE /* TIBETAN KU RU KHA */ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ + /* Other Terminating Punctuation */ || ch == 0x1804 /* MONGOLIAN COLON */ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ || ch == 0x1B5A /* BALINESE PANTI */ || ch == 0x1B5B /* BALINESE PAMADA */ - || ch == 0x1B5C /* BALINESE WINDU */ || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ || ch == 0x1B60 /* BALINESE PAMENENG */ || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ @@ -5337,10 +6323,36 @@ get_lbp (unsigned int ch) || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */ + || ch == 0xA4FE /* LISU PUNCTUATION COMMA */ + || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */ + || ch == 0xA6F3 /* BAMUM FULL STOP */ + || ch == 0xA6F4 /* BAMUM COLON */ + || ch == 0xA6F5 /* BAMUM COMMA */ + || ch == 0xA6F6 /* BAMUM SEMICOLON */ + || ch == 0xA6F7 /* BAMUM QUESTION MARK */ + || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */ + || ch == 0xA9C8 /* JAVANESE PADA LINGSA */ + || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */ + || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */ + || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */ + || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */ + || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */ + || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */ + || ch == 0x11047 /* BRAHMI DANDA */ + || ch == 0x11048 /* BRAHMI DOUBLE DANDA */ + || ch == 0x110BE /* KAITHI SECTION MARK */ + || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */ + || ch == 0x110C0 /* KAITHI DANDA */ + || ch == 0x110C1 /* KAITHI DOUBLE DANDA */ || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) - attr |= 1 << LBP_BA; + attr |= (int64_t) 1 << LBP_BA; /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ @@ -5362,19 +6374,25 @@ get_lbp (unsigned int ch) || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) - attr |= 1 << LBP_BB; + attr |= (int64_t) 1 << LBP_BB; /* hyphen */ if (ch == 0x002D /* HYPHEN-MINUS */) - attr |= 1 << LBP_HY; + attr |= (int64_t) 1 << LBP_HY; /* contingent break opportunity */ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) - attr |= 1 << LBP_CB; + attr |= (int64_t) 1 << LBP_CB; + + /* closing parenthesis */ + if (ch == 0x0029 /* RIGHT PARENTHESIS */ + || ch == 0x005D /* RIGHT SQUARE BRACKET */) + attr |= (int64_t) 1 << LBP_CP; /* closing punctuation */ if ((unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'e') + && unicode_attributes[ch].category[1] == 'e' + && !(attr & ((int64_t) 1 << LBP_CP))) || ch == 0x3001 /* IDEOGRAPHIC COMMA */ || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ @@ -5384,8 +6402,17 @@ get_lbp (unsigned int ch) || ch == 0xFF0C /* FULLWIDTH COMMA */ || ch == 0xFF0E /* FULLWIDTH FULL STOP */ || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ - || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */) - attr |= 1 << LBP_CL; + || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */ + || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */ + || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */ + || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */ + || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */ + || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */ + || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */ + || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */) + attr |= (int64_t) 1 << LBP_CL; /* exclamation/interrogation */ if (ch == 0x0021 /* EXCLAMATION MARK */ @@ -5413,9 +6440,6 @@ get_lbp (unsigned int ch) || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ || ch == 0x2CFE /* COPTIC FULL STOP */ || ch == 0x2E2E /* REVERSED QUESTION MARK */ -#if REVISION_22 - || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ -#endif || ch == 0xA60E /* VAI FULL STOP */ || ch == 0xA876 /* PHAGS-PA MARK SHAD */ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ @@ -5425,14 +6449,14 @@ get_lbp (unsigned int ch) || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) - attr |= 1 << LBP_EX; + attr |= (int64_t) 1 << LBP_EX; /* inseparable */ if (ch == 0x2024 /* ONE DOT LEADER */ || ch == 0x2025 /* TWO DOT LEADER */ || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) - attr |= 1 << LBP_IN; + attr |= (int64_t) 1 << LBP_IN; /* non starter */ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ @@ -5465,15 +6489,22 @@ get_lbp (unsigned int ch) || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) - attr |= 1 << LBP_NS; + attr |= (int64_t) 1 << LBP_NS; /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's') || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00BF /* INVERTED QUESTION MARK */ - || ch == 0x2E18 /* INVERTED INTERROBANG */) - attr |= 1 << LBP_OP; + || ch == 0x2E18 /* INVERTED INTERROBANG */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */ + || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */ + || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */ + || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */ + || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */ + || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */) + attr |= (int64_t) 1 << LBP_OP; /* ambiguous quotation */ if ((unicode_attributes[ch].category[0] == 'P' @@ -5491,7 +6522,7 @@ get_lbp (unsigned int ch) || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ || ch == 0x2E0B /* RAISED SQUARE */) - attr |= 1 << LBP_QU; + attr |= (int64_t) 1 << LBP_QU; /* infix separator (numeric) */ if (ch == 0x002C /* COMMA */ @@ -5507,7 +6538,7 @@ get_lbp (unsigned int ch) || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) - attr |= 1 << LBP_IS; + attr |= (int64_t) 1 << LBP_IS; /* numeric */ if ((unicode_attributes[ch].category[0] == 'N' @@ -5515,7 +6546,7 @@ get_lbp (unsigned int ch) && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) - attr |= 1 << LBP_NU; + attr |= (int64_t) 1 << LBP_NU; /* postfix (numeric) */ if (ch == 0x0025 /* PERCENT SIGN */ @@ -5541,8 +6572,13 @@ get_lbp (unsigned int ch) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ - || ch == 0x0D79 /* MALAYALAM DATE MARK */) - attr |= 1 << LBP_PO; + || ch == 0x09F2 /* BENGALI RUPEE MARK */ + || ch == 0x09F3 /* BENGALI RUPEE SIGN */ + || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */ + || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */ + || ch == 0xA838 /* NORTH INDIC RUPEE MARK */) + attr |= (int64_t) 1 << LBP_PO; /* prefix (numeric) */ if ((unicode_attributes[ch].category[0] == 'S' @@ -5553,27 +6589,27 @@ get_lbp (unsigned int ch) || ch == 0x2116 /* NUMERO SIGN */ || ch == 0x2212 /* MINUS SIGN */ || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) - if (!(attr & (1 << LBP_PO))) - attr |= 1 << LBP_PR; + if (!(attr & ((int64_t) 1 << LBP_PO))) + attr |= (int64_t) 1 << LBP_PR; /* symbols allowing breaks */ if (ch == 0x002F /* SOLIDUS */) - attr |= 1 << LBP_SY; + attr |= (int64_t) 1 << LBP_SY; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0) - attr |= 1 << LBP_H2; + attr |= (int64_t) 1 << LBP_H2; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0) - attr |= 1 << LBP_H3; + attr |= (int64_t) 1 << LBP_H3; - if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F) - attr |= 1 << LBP_JL; + if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C)) + attr |= (int64_t) 1 << LBP_JL; - if (ch >= 0x1160 && ch <= 0x11A2) - attr |= 1 << LBP_JV; + if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6)) + attr |= (int64_t) 1 << LBP_JV; - if (ch >= 0x11A8 && ch <= 0x11F9) - attr |= 1 << LBP_JT; + if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) + attr |= (int64_t) 1 << LBP_JT; /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' @@ -5583,17 +6619,24 @@ get_lbp (unsigned int ch) || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'M' && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'n')) + || unicode_attributes[ch].category[1] == 'n') + && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ + || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ - || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) - && ((ch >= 0x0E00 && ch <= 0x0EFF) - || (ch >= 0x1000 && ch <= 0x109F) - || (ch >= 0x1780 && ch <= 0x17FF) - || (ch >= 0x1950 && ch <= 0x19DF))) - attr |= 1 << LBP_SA; + || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */ + || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */ + || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */ + || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */) + && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */ + || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */ + || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */ + || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */ + || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */ + || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */)) + attr |= (int64_t) 1 << LBP_SA; /* attached characters and combining marks */ if ((unicode_attributes[ch].category[0] == 'M' @@ -5602,17 +6645,18 @@ get_lbp (unsigned int ch) || unicode_attributes[ch].category[1] == 'n')) || (unicode_attributes[ch].category[0] == 'C' && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'f'))) - if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW)))) - attr |= 1 << LBP_CM; + || unicode_attributes[ch].category[1] == 'f') + && ch != 0x110BD /* KAITHI NUMBER SIGN */)) + if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW)))) + attr |= (int64_t) 1 << LBP_CM; /* ideographic */ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ || ch == 0x3000 /* IDEOGRAPHIC SPACE */ || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ - || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ @@ -5626,7 +6670,7 @@ get_lbp (unsigned int ch) || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL || (ch >= 0x3000 && ch <= 0x33FF - && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) + && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP)))) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ @@ -5669,8 +6713,14 @@ get_lbp (unsigned int ch) || ch == 0xFF5E /* FULLWIDTH TILDE */ || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ || ch == 0xFFE3 /* FULLWIDTH MACRON */ - || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */) - if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM)))) + || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */ + || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */ + || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */ + || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */ + || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */) + if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM)))) { /* ambiguous (ideograph) ? */ if ((unicode_width[ch] != NULL @@ -5678,9 +6728,9 @@ get_lbp (unsigned int ch) && ch >= 0x2000) || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) - attr |= 1 << LBP_AI; + attr |= (int64_t) 1 << LBP_AI; else - attr |= 1 << LBP_ID; + attr |= (int64_t) 1 << LBP_ID; } /* ordinary alphabetic and symbol characters */ @@ -5710,8 +6760,10 @@ get_lbp (unsigned int ch) || ch == 0x2061 /* FUNCTION APPLICATION */ || ch == 0x2062 /* INVISIBLE TIMES */ || ch == 0x2063 /* INVISIBLE SEPARATOR */ - || ch == 0x2064 /* INVISIBLE PLUS */) - if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID)))) + || ch == 0x2064 /* INVISIBLE PLUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x110BD /* KAITHI NUMBER SIGN */) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) { /* ambiguous (alphabetic) ? */ if ((unicode_width[ch] != NULL @@ -5729,8 +6781,6 @@ get_lbp (unsigned int ch) && ch != 0x21E7 /* UPWARDS WHITE ARROW */ && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) -#if !REVISION_22 - || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00A7 /* SECTION SIGN */ || ch == 0x00A8 /* DIAERESIS */ || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ @@ -5744,7 +6794,6 @@ get_lbp (unsigned int ch) || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ - || ch == 0x00BF /* INVERTED QUESTION MARK */ || ch == 0x00D7 /* MULTIPLICATION SIGN */ || ch == 0x00F7 /* DIVISION SIGN */ || ch == 0x02C7 /* CARON */ @@ -5758,7 +6807,6 @@ get_lbp (unsigned int ch) || ch == 0x02DA /* RING ABOVE */ || ch == 0x02DB /* OGONEK */ || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ -#endif || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ @@ -5766,16 +6814,30 @@ get_lbp (unsigned int ch) || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ || ch == 0x2616 /* WHITE SHOGI PIECE */ || ch == 0x2617 /* BLACK SHOGI PIECE */) - attr |= 1 << LBP_AI; + attr |= (int64_t) 1 << LBP_AI; else - attr |= 1 << LBP_AL; - attr &= ~(1 << LBP_CM); + attr |= (int64_t) 1 << LBP_AL; + attr &= ~((int64_t) 1 << LBP_CM); } } + else + { + /* Unassigned character. */ + if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ + || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ + || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + attr |= (int64_t) 1 << LBP_ID; + } if (attr == 0) /* unknown */ - attr |= 1 << LBP_XX; + attr |= (int64_t) 1 << LBP_XX; return attr; } @@ -5788,12 +6850,12 @@ debug_output_lbp (FILE *stream) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); - if (attr != 1 << LBP_XX) + int64_t attr = get_lbp (i); + if (attr != (int64_t) 1 << LBP_XX) { fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ - if (attr & (1 << bit)) fprintf (stream, " " #bit); + if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit); PRINT_BIT(attr,LBP_BK); PRINT_BIT(attr,LBP_CM); PRINT_BIT(attr,LBP_WJ); @@ -5806,6 +6868,7 @@ debug_output_lbp (FILE *stream) PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); @@ -5919,6 +6982,7 @@ fill_org_lbp (const char *linebreak_filename) TRY(LBP_HY) TRY(LBP_CB) TRY(LBP_CL) + TRY(LBP_CP) TRY(LBP_EX) TRY(LBP_IN) TRY(LBP_NS) @@ -5964,6 +7028,7 @@ fill_org_lbp (const char *linebreak_filename) unicode_org_lbp[i] = value; } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", linebreak_filename); @@ -5997,6 +7062,7 @@ debug_output_org_lbp (FILE *stream) PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); @@ -6065,13 +7131,13 @@ output_lbp (FILE *stream1, FILE *stream2) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); + int64_t attr = get_lbp (i); /* Now attr should contain exactly one bit. */ if (attr == 0 || ((attr & (attr - 1)) != 0)) abort (); - if (attr != 1 << LBP_XX) + if (attr != (int64_t) 1 << LBP_XX) { unsigned int log2_attr; for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); @@ -6169,6 +7235,7 @@ output_lbp (FILE *stream1, FILE *stream2) CASE(LBP_HY); CASE(LBP_CB); CASE(LBP_CL); + CASE(LBP_CP); CASE(LBP_EX); CASE(LBP_IN); CASE(LBP_NS); @@ -6230,7 +7297,7 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -6268,7 +7335,8 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve /* ========================================================================= */ -/* Word break property. */ +/* Word break property. + Updated for Unicode TR #29 revision 17. */ /* Possible values of the Word_Break property. */ enum @@ -6314,7 +7382,7 @@ get_wbp (unsigned int ch) if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Cf") == 0 - && ch != 0x200C && ch != 0x200D) + && ch != 0x200B && ch != 0x200C && ch != 0x200D) attr |= 1 << WBP_FORMAT; if ((unicode_scripts[ch] < numscripts @@ -6334,12 +7402,10 @@ get_wbp (unsigned int ch) && (attr & (1 << WBP_EXTEND)) == 0) attr |= 1 << WBP_ALETTER; - if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 - || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E) + if (is_WBP_MIDNUMLET (ch)) attr |= 1 << WBP_MIDNUMLET; - if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A - || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) + if (is_WBP_MIDLETTER (ch)) attr |= 1 << WBP_MIDLETTER; if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 @@ -6712,7 +7778,7 @@ output_wbrk_tables (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -6746,7 +7812,8 @@ output_wbrk_tables (const char *filename, const char *version) /* ========================================================================= */ -/* Grapheme break property. */ +/* Grapheme break property. + Updated for Unicode TR #29 revision 17. */ /* Possible values of the Grapheme_Cluster_Break property. */ enum @@ -7042,6 +8109,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename) for (i = i1; i <= i2; i++) unicode_org_gbp[i] = propvalue; } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); @@ -7051,6 +8119,9 @@ fill_org_gbp (const char *graphemebreakproperty_filename) /* ========================================================================= */ +/* Composition and decomposition. + Updated for Unicode TR #15 revision 33. */ + /* Maximum number of characters into which a single Unicode character can be decomposed. */ #define MAX_DECOMP_LENGTH 18 @@ -7457,7 +8528,7 @@ output_composition_tables (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -7499,7 +8570,7 @@ output_composition_tables (const char *filename, const char *version) 1527, which is quite good (60% filled). It requires an auxiliary table lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ - fprintf (stream, "struct composition_rule { char codes[4]; };\n"); + fprintf (stream, "struct composition_rule { char codes[6]; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define slot-name codes\n"); @@ -7537,16 +8608,9 @@ output_composition_tables (const char *filename, const char *version) if (strcmp (unicode_attributes[combined].combining, "0") != 0) abort (); - if (!(code1 < 0x10000)) - abort (); - if (!(code2 < 0x10000)) - abort (); - if (!(combined < 0x10000)) - abort (); - - fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", - (code1 >> 8) & 0xff, code1 & 0xff, - (code2 >> 8) & 0xff, code2 & 0xff, + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, combined); } } @@ -7597,7 +8661,7 @@ output_simple_mapping_test (const char *filename, fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); - fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); fprintf (stream, "#include \"test-mapping-part1.h\"\n"); @@ -7659,7 +8723,7 @@ output_simple_mapping (const char *filename, fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -8581,6 +9645,7 @@ main (int argc, char * argv[]) const char *unicodedata_filename; const char *proplist_filename; const char *derivedproplist_filename; + const char *arabicshaping_filename; const char *scripts_filename; const char *blocks_filename; const char *proplist30_filename; @@ -8593,9 +9658,9 @@ main (int argc, char * argv[]) const char *casefolding_filename; const char *version; - if (argc != 15) + if (argc != 16) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", argv[0]); exit (1); } @@ -8603,23 +9668,25 @@ main (int argc, char * argv[]) unicodedata_filename = argv[1]; proplist_filename = argv[2]; derivedproplist_filename = argv[3]; - scripts_filename = argv[4]; - blocks_filename = argv[5]; - proplist30_filename = argv[6]; - eastasianwidth_filename = argv[7]; - linebreak_filename = argv[8]; - wordbreakproperty_filename = argv[9]; - graphemebreakproperty_filename = argv[10]; - compositionexclusions_filename = argv[11]; - specialcasing_filename = argv[12]; - casefolding_filename = argv[13]; - version = argv[14]; + arabicshaping_filename = argv[4]; + scripts_filename = argv[5]; + blocks_filename = argv[6]; + proplist30_filename = argv[7]; + eastasianwidth_filename = argv[8]; + linebreak_filename = argv[9]; + wordbreakproperty_filename = argv[10]; + graphemebreakproperty_filename = argv[11]; + compositionexclusions_filename = argv[12]; + specialcasing_filename = argv[13]; + casefolding_filename = argv[14]; + version = argv[15]; fill_attributes (unicodedata_filename); clear_properties (); fill_properties (proplist_filename); fill_properties (derivedproplist_filename); fill_properties30 (proplist30_filename); + fill_arabicshaping (arabicshaping_filename); fill_scripts (scripts_filename); fill_blocks (blocks_filename); fill_width (eastasianwidth_filename); @@ -8634,7 +9701,7 @@ main (int argc, char * argv[]) output_categories (version); output_category ("unictype/categ_of.h", version); - output_combclass ("unictype/combining.h", version); + output_combclass ("unictype/combiningclass.h", version); output_bidi_category ("unictype/bidi_of.h", version); output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); output_decimal_digit ("unictype/decdigit.h", version); @@ -8644,10 +9711,17 @@ main (int argc, char * argv[]) output_numeric ("unictype/numeric.h", version); output_mirror ("unictype/mirror.h", version); output_properties (version); + output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version); + output_joining_type ("unictype/joiningtype_of.h", version); + output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version); + output_joining_group ("unictype/joininggroup_of.h", version); + output_scripts (version); output_scripts_byname (version); output_blocks (version); output_ident_properties (version); + output_nonspacing_property ("uniwidth/width.c.part"); + output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part"); output_old_ctype (version); debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); @@ -8684,20 +9758,23 @@ main (int argc, char * argv[]) * compile-command: " gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \ ./gen-uni-tables \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \ - 5.1.0 + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \ + 6.0.0 \ + && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \ + && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt " * End: */