X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=d00013ebed92043d9162b67ae965e6da1c6e9a35;hb=7ef6c64e210ac0979d7e8ac69bc5b5208c2405ab;hp=467fa5506228c0f8e6a1e01acdc8e387b10ae95d;hpb=441aa3044f43e5572f58c354f01e6bc070acd5c7;p=gnulib.git diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 467fa5506..d00013ebe 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -1,7 +1,7 @@ /* Generate Unicode conforming character classification tables and line break properties tables and word break property tables and decomposition/composition and case mapping tables from a UnicodeData file. - Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. + Copyright (C) 2000-2002, 2004, 2007-2014 Free Software Foundation, Inc. Written by Bruno Haible , 2000-2002. This program is free software: you can redistribute it and/or modify @@ -21,16 +21,18 @@ $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \ /usr/local/share/Unidata/PropList.txt \ /usr/local/share/Unidata/DerivedCoreProperties.txt \ + /usr/local/share/Unidata/ArabicShaping.txt \ /usr/local/share/Unidata/Scripts.txt \ /usr/local/share/Unidata/Blocks.txt \ /usr/local/share/Unidata/PropList-3.0.1.txt \ /usr/local/share/Unidata/EastAsianWidth.txt \ /usr/local/share/Unidata/LineBreak.txt \ /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/GraphemeBreakProperty.txt \ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ - 5.1.0 + 6.0.0 */ #include @@ -90,7 +92,7 @@ fill_attribute (unsigned int i, exit (1); } if (strcmp (field2, "Cs") == 0) - /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */ + /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */ return; uni = &unicode_attributes[i]; /* Copy the strings. */ @@ -210,7 +212,7 @@ fill_attributes (const char *unicodedata_filename) i = strtoul (field0, NULL, 16); if (field1[0] == '<' && strlen (field1) >= 9 - && strcmp (field1 + strlen(field1) - 8, ", First>") == 0) + && strcmp (field1 + strlen (field1) - 8, ", First>") == 0) { /* Deal with a range. */ lineno++; @@ -258,6 +260,7 @@ fill_attributes (const char *unicodedata_filename) field11, field12, field13, field14); } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); @@ -279,6 +282,17 @@ is_category_L (unsigned int ch) } static bool +is_category_LC (unsigned int ch) +{ + /* See PropertyValueAliases.txt. */ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't')); +} + +static bool is_category_Lu (unsigned int ch) { return (unicode_attributes[ch].name != NULL @@ -689,7 +703,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* %s of Unicode characters. */\n", comment); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 4; /* or: 5 */ @@ -802,6 +816,7 @@ output_categories (const char *version) output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \ output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version); CATEGORY (L) + CATEGORY (LC) CATEGORY (Lu) CATEGORY (Ll) CATEGORY (Lt) @@ -844,6 +859,7 @@ output_categories (const char *version) enum { UC_CATEGORY_MASK_L = 0x0000001f, + UC_CATEGORY_MASK_LC = 0x00000007, UC_CATEGORY_MASK_Lu = 0x00000001, UC_CATEGORY_MASK_Ll = 0x00000002, UC_CATEGORY_MASK_Lt = 0x00000004, @@ -893,6 +909,7 @@ general_category_byname (const char *category_name) switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_L; + case 'C': return UC_CATEGORY_MASK_LC; case 'u': return UC_CATEGORY_MASK_Lu; case 'l': return UC_CATEGORY_MASK_Ll; case 't': return UC_CATEGORY_MASK_Lt; @@ -993,7 +1010,7 @@ output_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1157,7 +1174,7 @@ output_combclass (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Combining class of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1497,7 +1514,7 @@ output_bidi_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Bidi categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1654,7 +1671,7 @@ output_decimal_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -1701,7 +1718,7 @@ output_decimal_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -1841,7 +1858,7 @@ output_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -1888,7 +1905,7 @@ output_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -2045,7 +2062,7 @@ output_numeric_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); need_comma = false; @@ -2101,7 +2118,7 @@ output_numeric (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); /* Create table of occurring fractions. */ @@ -2386,7 +2403,7 @@ output_mirror (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Mirrored Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -2490,6 +2507,24 @@ output_mirror (const char *filename, const char *version) /* ========================================================================= */ +/* Particular values of the word break property. */ + +static bool +is_WBP_MIDNUMLET (unsigned int ch) +{ + return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E); +} + +static bool +is_WBP_MIDLETTER (unsigned int ch) +{ + return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A); +} + +/* ========================================================================= */ + /* Properties. */ /* Reading PropList.txt and DerivedCoreProperties.txt. */ @@ -2533,6 +2568,13 @@ enum PROP_ALPHABETIC, PROP_LOWERCASE, PROP_UPPERCASE, + PROP_CASED, + PROP_CASE_IGNORABLE, + PROP_CHANGES_WHEN_LOWERCASED, + PROP_CHANGES_WHEN_UPPERCASED, + PROP_CHANGES_WHEN_TITLECASED, + PROP_CHANGES_WHEN_CASEFOLDED, + PROP_CHANGES_WHEN_CASEMAPPED, PROP_ID_START, PROP_ID_CONTINUE, PROP_XID_START, @@ -2631,6 +2673,13 @@ fill_properties (const char *proplist_filename) PROP ("Alphabetic", PROP_ALPHABETIC) PROP ("Lowercase", PROP_LOWERCASE) PROP ("Uppercase", PROP_UPPERCASE) + PROP ("Cased", PROP_CASED) + PROP ("Case_Ignorable", PROP_CASE_IGNORABLE) + PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED) + PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED) + PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED) + PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED) + PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED) PROP ("ID_Start", PROP_ID_START) PROP ("ID_Continue", PROP_ID_CONTINUE) PROP ("XID_Start", PROP_XID_START) @@ -2727,6 +2776,7 @@ fill_property30 (char array[0x110000], const char *proplist_filename, const char for (i = i1; i <= i2; i++) array[i] = 1; } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", proplist_filename); @@ -2776,6 +2826,7 @@ is_property_alphabetic (unsigned int ch) || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */ + || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */ || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */ || (ch == 0x10341) /* GOTHIC LETTER NINETY */ || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ @@ -2811,7 +2862,10 @@ is_property_default_ignorable_code_point (unsigned int ch) bool result1 = (is_category_Cf (ch) && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ - && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)) + && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F) + /* For some reason, the following are not listed as having property + Default_Ignorable_Code_Point. */ + && !(ch == 0x110BD)) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); bool result2 = @@ -2920,6 +2974,79 @@ is_property_titlecase (unsigned int ch) return is_category_Lt (ch); } +/* See DerivedCoreProperties.txt. */ +static bool +is_property_cased (unsigned int ch) +{ + bool result1 = (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_case_ignorable (unsigned int ch) +{ + bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch) + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_lowercased (unsigned int ch) +{ + bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0); + bool result2 = (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].lower != NONE + && unicode_attributes[ch].lower != ch); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_uppercased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_titlecased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casefolded (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casemapped (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0); +} + /* See PropList.txt, UCD.html. */ static bool is_property_soft_dotted (unsigned int ch) @@ -3466,6 +3593,13 @@ output_properties (const char *version) PROPERTY(lowercase) PROPERTY(other_lowercase) PROPERTY(titlecase) + PROPERTY(cased) + PROPERTY(case_ignorable) + PROPERTY(changes_when_lowercased) + PROPERTY(changes_when_uppercased) + PROPERTY(changes_when_titlecased) + PROPERTY(changes_when_casefolded) + PROPERTY(changes_when_casemapped) PROPERTY(soft_dotted) PROPERTY(id_start) PROPERTY(other_id_start) @@ -3534,109 +3668,258 @@ output_properties (const char *version) /* ========================================================================= */ -/* Scripts. */ +/* Arabic Shaping. */ -static const char *scripts[256]; -static unsigned int numscripts; +enum +{ + UC_JOINING_TYPE_U, /* Non_Joining */ + UC_JOINING_TYPE_T, /* Transparent */ + UC_JOINING_TYPE_C, /* Join_Causing */ + UC_JOINING_TYPE_L, /* Left_Joining */ + UC_JOINING_TYPE_R, /* Right_Joining */ + UC_JOINING_TYPE_D /* Dual_Joining */ +}; -static uint8_t unicode_scripts[0x110000]; +static uint8_t unicode_joining_type[0x110000]; + +enum +{ + UC_JOINING_GROUP_NONE, /* No_Joining_Group */ + UC_JOINING_GROUP_AIN, /* Ain */ + UC_JOINING_GROUP_ALAPH, /* Alaph */ + UC_JOINING_GROUP_ALEF, /* Alef */ + UC_JOINING_GROUP_BEH, /* Beh */ + UC_JOINING_GROUP_BETH, /* Beth */ + UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */ + UC_JOINING_GROUP_DAL, /* Dal */ + UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */ + UC_JOINING_GROUP_E, /* E */ + UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */ + UC_JOINING_GROUP_FE, /* Fe */ + UC_JOINING_GROUP_FEH, /* Feh */ + UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */ + UC_JOINING_GROUP_GAF, /* Gaf */ + UC_JOINING_GROUP_GAMAL, /* Gamal */ + UC_JOINING_GROUP_HAH, /* Hah */ + UC_JOINING_GROUP_HE, /* He */ + UC_JOINING_GROUP_HEH, /* Heh */ + UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */ + UC_JOINING_GROUP_HETH, /* Heth */ + UC_JOINING_GROUP_KAF, /* Kaf */ + UC_JOINING_GROUP_KAPH, /* Kaph */ + UC_JOINING_GROUP_KHAPH, /* Khaph */ + UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */ + UC_JOINING_GROUP_LAM, /* Lam */ + UC_JOINING_GROUP_LAMADH, /* Lamadh */ + UC_JOINING_GROUP_MEEM, /* Meem */ + UC_JOINING_GROUP_MIM, /* Mim */ + UC_JOINING_GROUP_NOON, /* Noon */ + UC_JOINING_GROUP_NUN, /* Nun */ + UC_JOINING_GROUP_NYA, /* Nya */ + UC_JOINING_GROUP_PE, /* Pe */ + UC_JOINING_GROUP_QAF, /* Qaf */ + UC_JOINING_GROUP_QAPH, /* Qaph */ + UC_JOINING_GROUP_REH, /* Reh */ + UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */ + UC_JOINING_GROUP_SAD, /* Sad */ + UC_JOINING_GROUP_SADHE, /* Sadhe */ + UC_JOINING_GROUP_SEEN, /* Seen */ + UC_JOINING_GROUP_SEMKATH, /* Semkath */ + UC_JOINING_GROUP_SHIN, /* Shin */ + UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */ + UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */ + UC_JOINING_GROUP_TAH, /* Tah */ + UC_JOINING_GROUP_TAW, /* Taw */ + UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */ + UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */ + UC_JOINING_GROUP_TETH, /* Teth */ + UC_JOINING_GROUP_WAW, /* Waw */ + UC_JOINING_GROUP_YEH, /* Yeh */ + UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */ + UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */ + UC_JOINING_GROUP_YUDH, /* Yudh */ + UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */ + UC_JOINING_GROUP_ZAIN, /* Zain */ + UC_JOINING_GROUP_ZHAIN /* Zhain */ +}; + +static uint8_t unicode_joining_group[0x110000]; static void -fill_scripts (const char *scripts_filename) +fill_arabicshaping (const char *arabicshaping_filename) { FILE *stream; unsigned int i; + int lineno; - stream = fopen (scripts_filename, "r"); + stream = fopen (arabicshaping_filename, "r"); if (stream == NULL) { - fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); + fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename); exit (1); } - numscripts = 0; - for (i = 0; i < 0x110000; i++) - unicode_scripts[i] = (uint8_t)~(uint8_t)0; + { + unicode_joining_type[i] = (uint8_t)~(uint8_t)0; + unicode_joining_group[i] = UC_JOINING_GROUP_NONE; + } + lineno = 0; for (;;) { - char buf[200+1]; - unsigned int i1, i2; - char padding[200+1]; - char scriptname[200+1]; - int script; + char buf[100+1]; + char separator1[100+1]; + char padding1[100+1]; + char schematic_name[100+1]; + char separator2[100+1]; + char padding2[100+1]; + char joining_type_name[100+1]; + char separator3[100+1]; + char padding3[100+1]; + char joining_group_name[100+1]; + int joining_type; + int joining_group; - if (fscanf (stream, "%200[^\n]\n", buf) < 1) + lineno++; + if (fscanf (stream, "%100[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; - if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) + if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]", + &i, separator1, padding1, schematic_name, separator2, + padding2, joining_type_name, separator3, padding3, + joining_group_name) != 10) { - if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) - { - fprintf (stderr, "parse error in '%s'\n", scripts_filename); - exit (1); - } - i2 = i1; + fprintf (stderr, "parse error in '%s':%d\n", + arabicshaping_filename, lineno); + exit (1); } - if (i2 < i1) - abort (); - if (i2 >= 0x110000) + if (i >= 0x110000) abort (); - for (script = numscripts - 1; script >= 0; script--) - if (strcmp (scripts[script], scriptname) == 0) - break; - if (script < 0) +#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name; + if (false) {} + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + else { - scripts[numscripts] = strdup (scriptname); - script = numscripts; - numscripts++; - if (numscripts == 256) - abort (); + fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n", + joining_type_name, arabicshaping_filename, lineno); + exit (1); } - for (i = i1; i <= i2; i++) + /* Remove trailing spaces. */ + while (joining_group_name[0] != '\0' + && joining_group_name[strlen (joining_group_name) - 1] == ' ') + joining_group_name[strlen (joining_group_name) - 1] = '\0'; + +#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value; + if (false) {} + TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group") + TRY(UC_JOINING_GROUP_AIN, "AIN") + TRY(UC_JOINING_GROUP_ALAPH, "ALAPH") + TRY(UC_JOINING_GROUP_ALEF, "ALEF") + TRY(UC_JOINING_GROUP_BEH, "BEH") + TRY(UC_JOINING_GROUP_BETH, "BETH") + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE") + TRY(UC_JOINING_GROUP_DAL, "DAL") + TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH") + TRY(UC_JOINING_GROUP_E, "E") + TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH") + TRY(UC_JOINING_GROUP_FE, "FE") + TRY(UC_JOINING_GROUP_FEH, "FEH") + TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH") + TRY(UC_JOINING_GROUP_GAF, "GAF") + TRY(UC_JOINING_GROUP_GAMAL, "GAMAL") + TRY(UC_JOINING_GROUP_HAH, "HAH") + TRY(UC_JOINING_GROUP_HE, "HE") + TRY(UC_JOINING_GROUP_HEH, "HEH") + TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL") + TRY(UC_JOINING_GROUP_HETH, "HETH") + TRY(UC_JOINING_GROUP_KAF, "KAF") + TRY(UC_JOINING_GROUP_KAPH, "KAPH") + TRY(UC_JOINING_GROUP_KHAPH, "KHAPH") + TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH") + TRY(UC_JOINING_GROUP_LAM, "LAM") + TRY(UC_JOINING_GROUP_LAMADH, "LAMADH") + TRY(UC_JOINING_GROUP_MEEM, "MEEM") + TRY(UC_JOINING_GROUP_MIM, "MIM") + TRY(UC_JOINING_GROUP_NOON, "NOON") + TRY(UC_JOINING_GROUP_NUN, "NUN") + TRY(UC_JOINING_GROUP_NYA, "NYA") + TRY(UC_JOINING_GROUP_PE, "PE") + TRY(UC_JOINING_GROUP_QAF, "QAF") + TRY(UC_JOINING_GROUP_QAPH, "QAPH") + TRY(UC_JOINING_GROUP_REH, "REH") + TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE") + TRY(UC_JOINING_GROUP_SAD, "SAD") + TRY(UC_JOINING_GROUP_SADHE, "SADHE") + TRY(UC_JOINING_GROUP_SEEN, "SEEN") + TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH") + TRY(UC_JOINING_GROUP_SHIN, "SHIN") + TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF") + TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW") + TRY(UC_JOINING_GROUP_TAH, "TAH") + TRY(UC_JOINING_GROUP_TAW, "TAW") + TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA") + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL") + TRY(UC_JOINING_GROUP_TETH, "TETH") + TRY(UC_JOINING_GROUP_WAW, "WAW") + TRY(UC_JOINING_GROUP_YEH, "YEH") + TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE") + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL") + TRY(UC_JOINING_GROUP_YUDH, "YUDH") + TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE") + TRY(UC_JOINING_GROUP_ZAIN, "ZAIN") + TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN") +#undef TRY + else { - if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) - fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); - unicode_scripts[i] = script; + fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n", + joining_group_name, arabicshaping_filename, lineno); + exit (1); } + + unicode_joining_type[i] = joining_type; + unicode_joining_group[i] = joining_group; } if (ferror (stream) || fclose (stream)) { - fprintf (stderr, "error reading from '%s'\n", scripts_filename); + fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename); exit (1); } } -/* Construction of sparse 3-level tables. */ -#define TABLE script_table -#define ELEMENT uint8_t -#define DEFAULT (uint8_t)~(uint8_t)0 -#define xmalloc malloc -#define xrealloc realloc -#include "3level.h" +/* Convert a Joining_Type value to a C identifier. */ +static const char * +joining_type_as_c_identifier (int joining_type) +{ +#define TRY(value) if (joining_type == value) return #value; + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + abort (); +} static void -output_scripts (const char *version) +output_joining_type_test (const char *filename, const char *version) { - const char *filename = "unictype/scripts.h"; FILE *stream; - unsigned int ch, s, i; - struct script_table t; - unsigned int level1_offset, level2_offset, level3_offset; - - typedef struct - { - const char *lowercase_name; - } - scriptinfo_t; - scriptinfo_t scriptinfo[256]; + bool need_comma; + unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) @@ -3646,81 +3929,74 @@ output_scripts (const char *version) } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); - for (s = 0; s < numscripts; s++) + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) { - char *lcp = strdup (scripts[s]); - char *cp; - - for (cp = lcp; *cp != '\0'; cp++) - if (*cp >= 'A' && *cp <= 'Z') - *cp += 'a' - 'A'; + int value = unicode_joining_type[ch]; - scriptinfo[s].lowercase_name = lcp; + if (value != (uint8_t)~(uint8_t)0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value)); + need_comma = true; + } } + if (need_comma) + fprintf (stream, "\n"); - for (s = 0; s < numscripts; s++) + if (ferror (stream) || fclose (stream)) { - fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", - scriptinfo[s].lowercase_name); - fprintf (stream, "{\n"); - i = 0; - for (ch = 0; ch < 0x110000; ch++) - if (unicode_scripts[ch] == s) - { - unsigned int start; - unsigned int end; + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} - start = ch; - while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) - ch++; - end = ch; +/* Construction of sparse 3-level tables. */ +#define TABLE joining_type_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" - if (i > 0) - fprintf (stream, ",\n"); - if (start == end) - fprintf (stream, " { 0x%04X, 1, 1 }", start); - else - fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", - start, end); - i++; - } - fprintf (stream, "\n"); - fprintf (stream, "};\n"); - } +static void +output_joining_type (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct joining_type_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint8_t *level3_packed; - fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); - fprintf (stream, "{\n"); - for (s = 0; s < numscripts; s++) + stream = fopen (filename, "w"); + if (stream == NULL) { - fprintf (stream, " {\n"); - fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", - scriptinfo[s].lowercase_name); - fprintf (stream, " script_%s_intervals,\n", - scriptinfo[s].lowercase_name); - fprintf (stream, " \"%s\"\n", scripts[s]); - fprintf (stream, " }"); - if (s+1 < numscripts) - fprintf (stream, ","); - fprintf (stream, "\n"); + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); } - fprintf (stream, "};\n"); + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; - script_table_init (&t); + joining_type_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { - unsigned int s = unicode_scripts[ch]; - if (s != (uint8_t)~(uint8_t)0) - script_table_add (&t, ch, s); + uint8_t value = unicode_joining_type[ch]; + + joining_type_table_add (&t, ch, value); } - script_table_finalize (&t); + joining_type_table_finalize (&t); /* Offsets in t.result, in memory of this process. */ level1_offset = @@ -3734,16 +4010,17 @@ output_scripts (const char *version) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) - fprintf (stream, "#define script_header_%d %d\n", i, + fprintf (stream, "#define joining_type_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); - fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size, + (1 << t.p) * 4 / 8); fprintf (stream, " }\n"); - fprintf (stream, "u_script =\n"); + fprintf (stream, "u_joining_type =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) @@ -3785,12 +4062,479 @@ output_scripts (const char *version) if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); - fprintf (stream, " {"); - if (t.level3_size << t.p > 8) - fprintf (stream, "\n "); + /* Pack the level3 array. Each entry needs 4 bits only. */ + level3_packed = + (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t)); for (i = 0; i < t.level3_size << t.p; i++) { - if (i > 0 && (i % 8) == 0) + unsigned int j = (i * 4) / 8; + unsigned int k = (i * 4) % 8; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f; + level3_packed[j] |= (value << k); + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 4 / 8) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Convert a Joining_Group value to a C identifier. */ +static const char * +joining_group_as_c_identifier (int joining_group) +{ +#define TRY(value) if (joining_group == value) return #value; + TRY(UC_JOINING_GROUP_NONE) + TRY(UC_JOINING_GROUP_AIN) + TRY(UC_JOINING_GROUP_ALAPH) + TRY(UC_JOINING_GROUP_ALEF) + TRY(UC_JOINING_GROUP_BEH) + TRY(UC_JOINING_GROUP_BETH) + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE) + TRY(UC_JOINING_GROUP_DAL) + TRY(UC_JOINING_GROUP_DALATH_RISH) + TRY(UC_JOINING_GROUP_E) + TRY(UC_JOINING_GROUP_FARSI_YEH) + TRY(UC_JOINING_GROUP_FE) + TRY(UC_JOINING_GROUP_FEH) + TRY(UC_JOINING_GROUP_FINAL_SEMKATH) + TRY(UC_JOINING_GROUP_GAF) + TRY(UC_JOINING_GROUP_GAMAL) + TRY(UC_JOINING_GROUP_HAH) + TRY(UC_JOINING_GROUP_HE) + TRY(UC_JOINING_GROUP_HEH) + TRY(UC_JOINING_GROUP_HEH_GOAL) + TRY(UC_JOINING_GROUP_HETH) + TRY(UC_JOINING_GROUP_KAF) + TRY(UC_JOINING_GROUP_KAPH) + TRY(UC_JOINING_GROUP_KHAPH) + TRY(UC_JOINING_GROUP_KNOTTED_HEH) + TRY(UC_JOINING_GROUP_LAM) + TRY(UC_JOINING_GROUP_LAMADH) + TRY(UC_JOINING_GROUP_MEEM) + TRY(UC_JOINING_GROUP_MIM) + TRY(UC_JOINING_GROUP_NOON) + TRY(UC_JOINING_GROUP_NUN) + TRY(UC_JOINING_GROUP_NYA) + TRY(UC_JOINING_GROUP_PE) + TRY(UC_JOINING_GROUP_QAF) + TRY(UC_JOINING_GROUP_QAPH) + TRY(UC_JOINING_GROUP_REH) + TRY(UC_JOINING_GROUP_REVERSED_PE) + TRY(UC_JOINING_GROUP_SAD) + TRY(UC_JOINING_GROUP_SADHE) + TRY(UC_JOINING_GROUP_SEEN) + TRY(UC_JOINING_GROUP_SEMKATH) + TRY(UC_JOINING_GROUP_SHIN) + TRY(UC_JOINING_GROUP_SWASH_KAF) + TRY(UC_JOINING_GROUP_SYRIAC_WAW) + TRY(UC_JOINING_GROUP_TAH) + TRY(UC_JOINING_GROUP_TAW) + TRY(UC_JOINING_GROUP_TEH_MARBUTA) + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL) + TRY(UC_JOINING_GROUP_TETH) + TRY(UC_JOINING_GROUP_WAW) + TRY(UC_JOINING_GROUP_YEH) + TRY(UC_JOINING_GROUP_YEH_BARREE) + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL) + TRY(UC_JOINING_GROUP_YUDH) + TRY(UC_JOINING_GROUP_YUDH_HE) + TRY(UC_JOINING_GROUP_ZAIN) + TRY(UC_JOINING_GROUP_ZHAIN) +#undef TRY + abort (); +} + +static void +output_joining_group_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining group of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_group[ch]; + + if (value != UC_JOINING_GROUP_NONE) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_joining_group (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch_min, ch_max, ch, i; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + ch_min = 0x10FFFF; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_min = ch; + break; + } + + ch_max = 0; + for (ch = 0x10FFFF; ch > 0; ch--) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_max = ch; + break; + } + + if (!(ch_min <= ch_max)) + abort (); + + /* If the interval [ch_min, ch_max] is too large, we should better use a + 3-level table. */ + if (!(ch_max - ch_min < 0x200)) + abort (); + + fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min); + fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n", + ch_max + 1, ch_min); + fprintf (stream, "{"); + for (i = 0; i <= ch_max - ch_min; i++) + { + const char *s; + + ch = ch_min + i; + if ((i % 2) == 0) + fprintf (stream, "\n "); + s = joining_group_as_c_identifier (unicode_joining_group[ch]); + fprintf (stream, " %s", s); + if (i+1 <= ch_max - ch_min) + { + fprintf (stream, ","); + if (((i+1) % 2) != 0) + fprintf (stream, "%*s", 38 - (int) strlen (s), ""); + } + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Scripts. */ + +static const char *scripts[256]; +static unsigned int numscripts; + +static uint8_t unicode_scripts[0x110000]; + +static void +fill_scripts (const char *scripts_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (scripts_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); + exit (1); + } + + numscripts = 0; + + for (i = 0; i < 0x110000; i++) + unicode_scripts[i] = (uint8_t)~(uint8_t)0; + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char scriptname[200+1]; + int script; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", scripts_filename); + exit (1); + } + i2 = i1; + } + if (i2 < i1) + abort (); + if (i2 >= 0x110000) + abort (); + + for (script = numscripts - 1; script >= 0; script--) + if (strcmp (scripts[script], scriptname) == 0) + break; + if (script < 0) + { + scripts[numscripts] = strdup (scriptname); + script = numscripts; + numscripts++; + if (numscripts == 256) + abort (); + } + + for (i = i1; i <= i2; i++) + { + if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) + fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); + unicode_scripts[i] = script; + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", scripts_filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE script_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_scripts (const char *version) +{ + const char *filename = "unictype/scripts.h"; + FILE *stream; + unsigned int ch, s, i; + struct script_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + typedef struct + { + const char *lowercase_name; + } + scriptinfo_t; + scriptinfo_t scriptinfo[256]; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode scripts. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + for (s = 0; s < numscripts; s++) + { + char *lcp = strdup (scripts[s]); + char *cp; + + for (cp = lcp; *cp != '\0'; cp++) + if (*cp >= 'A' && *cp <= 'Z') + *cp += 'a' - 'A'; + + scriptinfo[s].lowercase_name = lcp; + } + + for (s = 0; s < numscripts; s++) + { + fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", + scriptinfo[s].lowercase_name); + fprintf (stream, "{\n"); + i = 0; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_scripts[ch] == s) + { + unsigned int start; + unsigned int end; + + start = ch; + while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) + ch++; + end = ch; + + if (i > 0) + fprintf (stream, ",\n"); + if (start == end) + fprintf (stream, " { 0x%04X, 1, 1 }", start); + else + fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", + start, end); + i++; + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + } + + fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); + fprintf (stream, "{\n"); + for (s = 0; s < numscripts; s++) + { + fprintf (stream, " {\n"); + fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " script_%s_intervals,\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " \"%s\"\n", scripts[s]); + fprintf (stream, " }"); + if (s+1 < numscripts) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + + t.p = 7; + t.q = 9; + script_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int s = unicode_scripts[ch]; + if (s != (uint8_t)~(uint8_t)0) + script_table_add (&t, ch, s); + } + + script_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define script_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_script =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) @@ -3824,9 +4568,9 @@ output_scripts_byname (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); - fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n"); + fprintf (stream, "struct named_script { int name; unsigned int index; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define hash-function-name scripts_hash\n"); @@ -3834,6 +4578,8 @@ output_scripts_byname (const char *version) fprintf (stream, "%%readonly-tables\n"); fprintf (stream, "%%global-table\n"); fprintf (stream, "%%define word-array-name script_names\n"); + fprintf (stream, "%%pic\n"); + fprintf (stream, "%%define string-pool-name script_stringpool\n"); fprintf (stream, "%%%%\n"); for (s = 0; s < numscripts; s++) fprintf (stream, "%s, %u\n", scripts[s], s); @@ -3964,7 +4710,7 @@ output_blocks (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode blocks. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "static const uc_block_t blocks[] =\n"); @@ -4381,7 +5127,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Language syntax properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; /* or 8 */ @@ -4926,7 +5672,7 @@ output_tables (const char *filename, const char *version) fprintf (stream, "escape_char /\n"); fprintf (stream, "comment_char %%\n"); fprintf (stream, "\n"); - fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", + fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n", version); fprintf (stream, "\n"); @@ -4952,7 +5698,7 @@ output_tables (const char *filename, const char *version) fprintf (stream, "END LC_IDENTIFICATION\n"); fprintf (stream, "\n"); - /* Verifications. */ + /* Verification. */ for (ch = 0; ch < 0x110000; ch++) { /* toupper restriction: "Only characters specified for the keywords @@ -5134,61 +5880,291 @@ fill_width (const char *width_filename) unicode_width[i] = strdup (field1); } } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", width_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Non-spacing attribute and width. */ + +/* The non-spacing attribute table consists of: + - Non-spacing characters; generated from PropList.txt or + "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" + - Format control characters; generated from + "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" + - Zero width characters; generated from + "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" + */ + +static bool +is_nonspacing (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (get_bidi_category (ch) == UC_BIDI_NSM + || is_category_Cc (ch) || is_category_Cf (ch) + || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0)); +} + +static void +output_nonspacing_property (const char *filename) +{ + FILE *stream; + int ind[0x110000 / 0x200]; + unsigned int i; + unsigned int i_max; + int next_ind; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + next_ind = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = false; + unsigned int ch; + + if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */ + for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++) + if (is_nonspacing (ch)) + { + nontrivial = true; + break; + } + if (nontrivial) + ind[i] = next_ind++; + else + ind[i] = -1; + } + + fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n", + next_ind); + i_max = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = (ind[i] >= 0); + + if (nontrivial) + { + unsigned int j; + + fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1); + for (j = 0; j < 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + unsigned int l; + unsigned char bits = 0; + + for (l = 0; l < 8; l++) + { + unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l; + + if (is_nonspacing (ch)) + bits |= 1 << l; + } + fprintf (stream, " 0x%02x%c", bits, + ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1); + } + i_max = i; + } + } + fprintf (stream, "};\n"); + + i_max = ((i_max + 8 - 1) / 8) * 8; + fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n", + i_max); + { + unsigned int j; + + for (j = 0; j < i_max / 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + i = j * 8 + k; + fprintf (stream, " %2d%c", ind[i], + j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1); + } + } + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */ +static char +symbolic_width (unsigned int ch) +{ + /* Test for unassigned character. */ + if (is_property_unassigned_code_value (ch)) + { + /* Unicode TR#11 section "Unassigned and Private-Use Characters". */ + if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */ + return 'A'; + if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */ + || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */ + || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */) + return '2'; + return 0; + } + else + { + /* Test for non-spacing or control character. */ + if (is_category_Cc (ch) && ch < 0x00A0) + return 0; + if (is_nonspacing (ch)) + return '0'; + /* Test for double-width character. */ + if (unicode_width[ch] != NULL + && (strcmp (unicode_width[ch], "W") == 0 + || strcmp (unicode_width[ch], "F") == 0)) + return '2'; + /* Test for half-width character. */ + if (unicode_width[ch] != NULL + && strcmp (unicode_width[ch], "H") == 0) + return '1'; + } + /* In ancient CJK encodings, Cyrillic and most other characters are + double-width as well. */ + if (ch >= 0x00A1 && ch < 0x10000) + return 'A'; + return '1'; +} + +static void +output_width_property_test (const char *filename) +{ + FILE *stream; + unsigned int interval_start, interval_end, ch; + char interval_value; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + interval_value = 0; + interval_start = interval_end = 0; /* avoid GCC warning */ + for (ch = 0; ch < 0x110000; ch++) + { + char value = symbolic_width (ch); + if (value != 0) /* skip Cc control characters and unassigned characters */ + { + if (value == interval_value) + /* Extend the interval. */ + interval_end = ch; + else + { + /* Terminate the interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + /* Start a new interval. */ + interval_start = interval_end = ch; + interval_value = value; + } + } + } + /* Terminate the last interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + if (ferror (stream) || fclose (stream)) { - fprintf (stderr, "error reading from '%s'\n", width_filename); + fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } -/* Line breaking classification. */ +/* ========================================================================= */ + +/* Line breaking classification. + Updated for Unicode TR #14 revision 26. */ enum { - /* Values >= 24 are resolved at run time. */ - LBP_BK = 24, /* mandatory break */ + /* Values >= 25 are resolved at run time. */ + LBP_BK = 25, /* mandatory break */ /*LBP_CR, carriage return - not used here because it's a DOSism */ /*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 25, /* attached characters and combining marks */ + LBP_CM = 26, /* attached characters and combining marks */ /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ /*LBP_SG, surrogates - not used here because they are not characters */ LBP_WJ = 0, /* word joiner */ - LBP_ZW = 26, /* zero width space */ + LBP_ZW = 27, /* zero width space */ LBP_GL = 1, /* non-breaking (glue) */ - LBP_SP = 27, /* space */ + LBP_SP = 28, /* space */ LBP_B2 = 2, /* break opportunity before and after */ LBP_BA = 3, /* break opportunity after */ LBP_BB = 4, /* break opportunity before */ LBP_HY = 5, /* hyphen */ - LBP_CB = 28, /* contingent break opportunity */ + LBP_CB = 29, /* contingent break opportunity */ LBP_CL = 6, /* closing punctuation */ - LBP_EX = 7, /* exclamation/interrogation */ - LBP_IN = 8, /* inseparable */ - LBP_NS = 9, /* non starter */ - LBP_OP = 10, /* opening punctuation */ - LBP_QU = 11, /* ambiguous quotation */ - LBP_IS = 12, /* infix separator (numeric) */ - LBP_NU = 13, /* numeric */ - LBP_PO = 14, /* postfix (numeric) */ - LBP_PR = 15, /* prefix (numeric) */ - LBP_SY = 16, /* symbols allowing breaks */ - LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */ - LBP_AL = 17, /* ordinary alphabetic and symbol characters */ - LBP_H2 = 18, /* Hangul LV syllable */ - LBP_H3 = 19, /* Hangul LVT syllable */ - LBP_ID = 20, /* ideographic */ - LBP_JL = 21, /* Hangul L Jamo */ - LBP_JV = 22, /* Hangul V Jamo */ - LBP_JT = 23, /* Hangul T Jamo */ - LBP_SA = 30, /* complex context (South East Asian) */ - LBP_XX = 31 /* unknown */ + LBP_CP = 7, /* closing parenthesis */ + LBP_EX = 8, /* exclamation/interrogation */ + LBP_IN = 9, /* inseparable */ + LBP_NS = 10, /* non starter */ + LBP_OP = 11, /* opening punctuation */ + LBP_QU = 12, /* ambiguous quotation */ + LBP_IS = 13, /* infix separator (numeric) */ + LBP_NU = 14, /* numeric */ + LBP_PO = 15, /* postfix (numeric) */ + LBP_PR = 16, /* prefix (numeric) */ + LBP_SY = 17, /* symbols allowing breaks */ + LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */ + LBP_AL = 18, /* ordinary alphabetic and symbol characters */ + LBP_H2 = 19, /* Hangul LV syllable */ + LBP_H3 = 20, /* Hangul LVT syllable */ + LBP_ID = 21, /* ideographic */ + LBP_JL = 22, /* Hangul L Jamo */ + LBP_JV = 23, /* Hangul V Jamo */ + LBP_JT = 24, /* Hangul T Jamo */ + LBP_SA = 31, /* complex context (South East Asian) */ + LBP_XX = 32 /* unknown */ }; /* Returns the line breaking classification for ch, as a bit mask. */ -static int +static int64_t get_lbp (unsigned int ch) { - int attr = 0; + int64_t attr = 0; if (unicode_attributes[ch].name != NULL) { @@ -5198,15 +6174,15 @@ get_lbp (unsigned int ch) || ch == 0x000B /* line tabulation */ || ch == 0x2028 /* LINE SEPARATOR */ || ch == 0x2029 /* PARAGRAPH SEPARATOR */) - attr |= 1 << LBP_BK; + attr |= (int64_t) 1 << LBP_BK; if (ch == 0x2060 /* WORD JOINER */ || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) - attr |= 1 << LBP_WJ; + attr |= (int64_t) 1 << LBP_WJ; /* zero width space */ if (ch == 0x200B /* ZERO WIDTH SPACE */) - attr |= 1 << LBP_ZW; + attr |= (int64_t) 1 << LBP_ZW; /* non-breaking (glue) */ if (ch == 0x00A0 /* NO-BREAK SPACE */ @@ -5218,19 +6194,23 @@ get_lbp (unsigned int ch) || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ - || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */) - attr |= 1 << LBP_GL; + || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */ + || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */) + attr |= (int64_t) 1 << LBP_GL; /* space */ if (ch == 0x0020 /* SPACE */) - attr |= 1 << LBP_SP; + attr |= (int64_t) 1 << LBP_SP; /* break opportunity before and after */ if (ch == 0x2014 /* EM DASH */) - attr |= 1 << LBP_B2; + attr |= (int64_t) 1 << LBP_B2; /* break opportunity after */ - if (ch == 0x1680 /* OGHAM SPACE MARK */ + if (/* Breaking Spaces */ + ch == 0x1680 /* OGHAM SPACE MARK */ || ch == 0x2000 /* EN QUAD */ || ch == 0x2001 /* EM QUAD */ || ch == 0x2002 /* EN SPACE */ @@ -5242,12 +6222,17 @@ get_lbp (unsigned int ch) || ch == 0x2009 /* THIN SPACE */ || ch == 0x200A /* HAIR SPACE */ || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ + /* Tabs */ || ch == 0x0009 /* tab */ + /* Conditional Hyphens */ || ch == 0x00AD /* SOFT HYPHEN */ + /* Breaking Hyphens */ || ch == 0x058A /* ARMENIAN HYPHEN */ + || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */ || ch == 0x2010 /* HYPHEN */ || ch == 0x2012 /* FIGURE DASH */ || ch == 0x2013 /* EN DASH */ + /* Visible Word Dividers */ || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ || ch == 0x1361 /* ETHIOPIC WORDSPACE */ @@ -5255,6 +6240,7 @@ get_lbp (unsigned int ch) || ch == 0x17DA /* KHMER SIGN KOOMUUT */ || ch == 0x2027 /* HYPHENATION POINT */ || ch == 0x007C /* VERTICAL LINE */ + /* Historic Word Separators */ || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ @@ -5271,6 +6257,7 @@ get_lbp (unsigned int ch) || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ || ch == 0x2E30 /* RING POINT */ + || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */ || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ || ch == 0x10102 /* AEGEAN CHECK MARK */ @@ -5278,6 +6265,7 @@ get_lbp (unsigned int ch) || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ + /* Dandas */ || ch == 0x0964 /* DEVANAGARI DANDA */ || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ @@ -5297,17 +6285,18 @@ get_lbp (unsigned int ch) || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ + /* Tibetan */ || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ || ch == 0x0F85 /* TIBETAN MARK PALUTA */ || ch == 0x0FBE /* TIBETAN KU RU KHA */ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ + /* Other Terminating Punctuation */ || ch == 0x1804 /* MONGOLIAN COLON */ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ || ch == 0x1B5A /* BALINESE PANTI */ || ch == 0x1B5B /* BALINESE PAMADA */ - || ch == 0x1B5C /* BALINESE WINDU */ || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ || ch == 0x1B60 /* BALINESE PAMENENG */ || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ @@ -5334,10 +6323,36 @@ get_lbp (unsigned int ch) || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */ + || ch == 0xA4FE /* LISU PUNCTUATION COMMA */ + || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */ + || ch == 0xA6F3 /* BAMUM FULL STOP */ + || ch == 0xA6F4 /* BAMUM COLON */ + || ch == 0xA6F5 /* BAMUM COMMA */ + || ch == 0xA6F6 /* BAMUM SEMICOLON */ + || ch == 0xA6F7 /* BAMUM QUESTION MARK */ + || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */ + || ch == 0xA9C8 /* JAVANESE PADA LINGSA */ + || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */ + || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */ + || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */ + || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */ + || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */ + || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */ + || ch == 0x11047 /* BRAHMI DANDA */ + || ch == 0x11048 /* BRAHMI DOUBLE DANDA */ + || ch == 0x110BE /* KAITHI SECTION MARK */ + || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */ + || ch == 0x110C0 /* KAITHI DANDA */ + || ch == 0x110C1 /* KAITHI DOUBLE DANDA */ || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) - attr |= 1 << LBP_BA; + attr |= (int64_t) 1 << LBP_BA; /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ @@ -5359,19 +6374,25 @@ get_lbp (unsigned int ch) || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) - attr |= 1 << LBP_BB; + attr |= (int64_t) 1 << LBP_BB; /* hyphen */ if (ch == 0x002D /* HYPHEN-MINUS */) - attr |= 1 << LBP_HY; + attr |= (int64_t) 1 << LBP_HY; /* contingent break opportunity */ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) - attr |= 1 << LBP_CB; + attr |= (int64_t) 1 << LBP_CB; + + /* closing parenthesis */ + if (ch == 0x0029 /* RIGHT PARENTHESIS */ + || ch == 0x005D /* RIGHT SQUARE BRACKET */) + attr |= (int64_t) 1 << LBP_CP; /* closing punctuation */ if ((unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'e') + && unicode_attributes[ch].category[1] == 'e' + && !(attr & ((int64_t) 1 << LBP_CP))) || ch == 0x3001 /* IDEOGRAPHIC COMMA */ || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ @@ -5381,8 +6402,17 @@ get_lbp (unsigned int ch) || ch == 0xFF0C /* FULLWIDTH COMMA */ || ch == 0xFF0E /* FULLWIDTH FULL STOP */ || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ - || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */) - attr |= 1 << LBP_CL; + || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */ + || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */ + || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */ + || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */ + || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */ + || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */ + || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */ + || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */) + attr |= (int64_t) 1 << LBP_CL; /* exclamation/interrogation */ if (ch == 0x0021 /* EXCLAMATION MARK */ @@ -5410,9 +6440,6 @@ get_lbp (unsigned int ch) || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ || ch == 0x2CFE /* COPTIC FULL STOP */ || ch == 0x2E2E /* REVERSED QUESTION MARK */ -#if REVISION_22 - || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ -#endif || ch == 0xA60E /* VAI FULL STOP */ || ch == 0xA876 /* PHAGS-PA MARK SHAD */ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ @@ -5422,14 +6449,14 @@ get_lbp (unsigned int ch) || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) - attr |= 1 << LBP_EX; + attr |= (int64_t) 1 << LBP_EX; /* inseparable */ if (ch == 0x2024 /* ONE DOT LEADER */ || ch == 0x2025 /* TWO DOT LEADER */ || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) - attr |= 1 << LBP_IN; + attr |= (int64_t) 1 << LBP_IN; /* non starter */ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ @@ -5462,15 +6489,22 @@ get_lbp (unsigned int ch) || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) - attr |= 1 << LBP_NS; + attr |= (int64_t) 1 << LBP_NS; /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's') || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00BF /* INVERTED QUESTION MARK */ - || ch == 0x2E18 /* INVERTED INTERROBANG */) - attr |= 1 << LBP_OP; + || ch == 0x2E18 /* INVERTED INTERROBANG */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */ + || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */ + || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */ + || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */ + || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */ + || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */) + attr |= (int64_t) 1 << LBP_OP; /* ambiguous quotation */ if ((unicode_attributes[ch].category[0] == 'P' @@ -5488,7 +6522,7 @@ get_lbp (unsigned int ch) || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ || ch == 0x2E0B /* RAISED SQUARE */) - attr |= 1 << LBP_QU; + attr |= (int64_t) 1 << LBP_QU; /* infix separator (numeric) */ if (ch == 0x002C /* COMMA */ @@ -5504,7 +6538,7 @@ get_lbp (unsigned int ch) || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) - attr |= 1 << LBP_IS; + attr |= (int64_t) 1 << LBP_IS; /* numeric */ if ((unicode_attributes[ch].category[0] == 'N' @@ -5512,7 +6546,7 @@ get_lbp (unsigned int ch) && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) - attr |= 1 << LBP_NU; + attr |= (int64_t) 1 << LBP_NU; /* postfix (numeric) */ if (ch == 0x0025 /* PERCENT SIGN */ @@ -5538,8 +6572,13 @@ get_lbp (unsigned int ch) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ - || ch == 0x0D79 /* MALAYALAM DATE MARK */) - attr |= 1 << LBP_PO; + || ch == 0x09F2 /* BENGALI RUPEE MARK */ + || ch == 0x09F3 /* BENGALI RUPEE SIGN */ + || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */ + || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */ + || ch == 0xA838 /* NORTH INDIC RUPEE MARK */) + attr |= (int64_t) 1 << LBP_PO; /* prefix (numeric) */ if ((unicode_attributes[ch].category[0] == 'S' @@ -5550,27 +6589,27 @@ get_lbp (unsigned int ch) || ch == 0x2116 /* NUMERO SIGN */ || ch == 0x2212 /* MINUS SIGN */ || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) - if (!(attr & (1 << LBP_PO))) - attr |= 1 << LBP_PR; + if (!(attr & ((int64_t) 1 << LBP_PO))) + attr |= (int64_t) 1 << LBP_PR; /* symbols allowing breaks */ if (ch == 0x002F /* SOLIDUS */) - attr |= 1 << LBP_SY; + attr |= (int64_t) 1 << LBP_SY; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0) - attr |= 1 << LBP_H2; + attr |= (int64_t) 1 << LBP_H2; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0) - attr |= 1 << LBP_H3; + attr |= (int64_t) 1 << LBP_H3; - if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F) - attr |= 1 << LBP_JL; + if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C)) + attr |= (int64_t) 1 << LBP_JL; - if (ch >= 0x1160 && ch <= 0x11A2) - attr |= 1 << LBP_JV; + if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6)) + attr |= (int64_t) 1 << LBP_JV; - if (ch >= 0x11A8 && ch <= 0x11F9) - attr |= 1 << LBP_JT; + if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) + attr |= (int64_t) 1 << LBP_JT; /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' @@ -5580,17 +6619,24 @@ get_lbp (unsigned int ch) || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'M' && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'n')) + || unicode_attributes[ch].category[1] == 'n') + && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ + || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ - || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) - && ((ch >= 0x0E00 && ch <= 0x0EFF) - || (ch >= 0x1000 && ch <= 0x109F) - || (ch >= 0x1780 && ch <= 0x17FF) - || (ch >= 0x1950 && ch <= 0x19DF))) - attr |= 1 << LBP_SA; + || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */ + || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */ + || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */ + || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */) + && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */ + || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */ + || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */ + || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */ + || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */ + || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */)) + attr |= (int64_t) 1 << LBP_SA; /* attached characters and combining marks */ if ((unicode_attributes[ch].category[0] == 'M' @@ -5599,17 +6645,18 @@ get_lbp (unsigned int ch) || unicode_attributes[ch].category[1] == 'n')) || (unicode_attributes[ch].category[0] == 'C' && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'f'))) - if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW)))) - attr |= 1 << LBP_CM; + || unicode_attributes[ch].category[1] == 'f') + && ch != 0x110BD /* KAITHI NUMBER SIGN */)) + if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW)))) + attr |= (int64_t) 1 << LBP_CM; /* ideographic */ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ || ch == 0x3000 /* IDEOGRAPHIC SPACE */ || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ - || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ @@ -5623,7 +6670,7 @@ get_lbp (unsigned int ch) || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL || (ch >= 0x3000 && ch <= 0x33FF - && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) + && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP)))) /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ @@ -5666,8 +6713,14 @@ get_lbp (unsigned int ch) || ch == 0xFF5E /* FULLWIDTH TILDE */ || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ || ch == 0xFFE3 /* FULLWIDTH MACRON */ - || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */) - if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM)))) + || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */ + || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */ + || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */ + || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */ + || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */) + if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM)))) { /* ambiguous (ideograph) ? */ if ((unicode_width[ch] != NULL @@ -5675,9 +6728,9 @@ get_lbp (unsigned int ch) && ch >= 0x2000) || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) - attr |= 1 << LBP_AI; + attr |= (int64_t) 1 << LBP_AI; else - attr |= 1 << LBP_ID; + attr |= (int64_t) 1 << LBP_ID; } /* ordinary alphabetic and symbol characters */ @@ -5707,8 +6760,10 @@ get_lbp (unsigned int ch) || ch == 0x2061 /* FUNCTION APPLICATION */ || ch == 0x2062 /* INVISIBLE TIMES */ || ch == 0x2063 /* INVISIBLE SEPARATOR */ - || ch == 0x2064 /* INVISIBLE PLUS */) - if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID)))) + || ch == 0x2064 /* INVISIBLE PLUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x110BD /* KAITHI NUMBER SIGN */) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) { /* ambiguous (alphabetic) ? */ if ((unicode_width[ch] != NULL @@ -5726,8 +6781,6 @@ get_lbp (unsigned int ch) && ch != 0x21E7 /* UPWARDS WHITE ARROW */ && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) -#if !REVISION_22 - || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00A7 /* SECTION SIGN */ || ch == 0x00A8 /* DIAERESIS */ || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ @@ -5741,7 +6794,6 @@ get_lbp (unsigned int ch) || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ - || ch == 0x00BF /* INVERTED QUESTION MARK */ || ch == 0x00D7 /* MULTIPLICATION SIGN */ || ch == 0x00F7 /* DIVISION SIGN */ || ch == 0x02C7 /* CARON */ @@ -5755,7 +6807,6 @@ get_lbp (unsigned int ch) || ch == 0x02DA /* RING ABOVE */ || ch == 0x02DB /* OGONEK */ || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ -#endif || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ @@ -5763,16 +6814,30 @@ get_lbp (unsigned int ch) || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ || ch == 0x2616 /* WHITE SHOGI PIECE */ || ch == 0x2617 /* BLACK SHOGI PIECE */) - attr |= 1 << LBP_AI; + attr |= (int64_t) 1 << LBP_AI; else - attr |= 1 << LBP_AL; - attr &= ~(1 << LBP_CM); + attr |= (int64_t) 1 << LBP_AL; + attr &= ~((int64_t) 1 << LBP_CM); } } + else + { + /* Unassigned character. */ + if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ + || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ + || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + attr |= (int64_t) 1 << LBP_ID; + } if (attr == 0) /* unknown */ - attr |= 1 << LBP_XX; + attr |= (int64_t) 1 << LBP_XX; return attr; } @@ -5785,12 +6850,12 @@ debug_output_lbp (FILE *stream) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); - if (attr != 1 << LBP_XX) + int64_t attr = get_lbp (i); + if (attr != (int64_t) 1 << LBP_XX) { fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ - if (attr & (1 << bit)) fprintf (stream, " " #bit); + if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit); PRINT_BIT(attr,LBP_BK); PRINT_BIT(attr,LBP_CM); PRINT_BIT(attr,LBP_WJ); @@ -5803,6 +6868,7 @@ debug_output_lbp (FILE *stream) PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); @@ -5916,6 +6982,7 @@ fill_org_lbp (const char *linebreak_filename) TRY(LBP_HY) TRY(LBP_CB) TRY(LBP_CL) + TRY(LBP_CP) TRY(LBP_EX) TRY(LBP_IN) TRY(LBP_NS) @@ -5961,6 +7028,7 @@ fill_org_lbp (const char *linebreak_filename) unicode_org_lbp[i] = value; } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", linebreak_filename); @@ -5994,6 +7062,7 @@ debug_output_org_lbp (FILE *stream) PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); @@ -6062,13 +7131,13 @@ output_lbp (FILE *stream1, FILE *stream2) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); + int64_t attr = get_lbp (i); /* Now attr should contain exactly one bit. */ if (attr == 0 || ((attr & (attr - 1)) != 0)) abort (); - if (attr != 1 << LBP_XX) + if (attr != (int64_t) 1 << LBP_XX) { unsigned int log2_attr; for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); @@ -6166,6 +7235,7 @@ output_lbp (FILE *stream1, FILE *stream2) CASE(LBP_HY); CASE(LBP_CB); CASE(LBP_CL); + CASE(LBP_CP); CASE(LBP_EX); CASE(LBP_IN); CASE(LBP_NS); @@ -6227,7 +7297,7 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -6265,7 +7335,8 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve /* ========================================================================= */ -/* Word break property. */ +/* Word break property. + Updated for Unicode TR #29 revision 17. */ /* Possible values of the Word_Break property. */ enum @@ -6311,7 +7382,7 @@ get_wbp (unsigned int ch) if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Cf") == 0 - && ch != 0x200C && ch != 0x200D) + && ch != 0x200B && ch != 0x200C && ch != 0x200D) attr |= 1 << WBP_FORMAT; if ((unicode_scripts[ch] < numscripts @@ -6331,12 +7402,10 @@ get_wbp (unsigned int ch) && (attr & (1 << WBP_EXTEND)) == 0) attr |= 1 << WBP_ALETTER; - if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 - || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E) + if (is_WBP_MIDNUMLET (ch)) attr |= 1 << WBP_MIDNUMLET; - if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A - || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) + if (is_WBP_MIDLETTER (ch)) attr |= 1 << WBP_MIDLETTER; if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 @@ -6709,7 +7778,7 @@ output_wbrk_tables (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -6743,6 +7812,316 @@ output_wbrk_tables (const char *filename, const char *version) /* ========================================================================= */ +/* Grapheme break property. + Updated for Unicode TR #29 revision 17. */ + +/* Possible values of the Grapheme_Cluster_Break property. */ +enum +{ + GBP_OTHER = 0, + GBP_CR = 1, + GBP_LF = 2, + GBP_CONTROL = 3, + GBP_EXTEND = 4, + GBP_PREPEND = 5, + GBP_SPACINGMARK = 6, + GBP_L = 7, + GBP_V = 8, + GBP_T = 9, + GBP_LV = 10, + GBP_LVT = 11 +}; + +/* Construction of sparse 3-level tables. */ +#define TABLE gbp_table +#define ELEMENT unsigned char +#define DEFAULT GBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* The grapheme break property from the GraphemeBreakProperty.txt file. */ +int unicode_org_gbp[0x110000]; + +/* Output the unit test data for the grapheme break property. */ +static void +output_gbp_test (const char *filename) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode grapheme break property functions.\n"); + fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int gbp = unicode_org_gbp[ch]; + const char *gbp_string; + + while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp) + ch++; + + switch (gbp) + { +#define CASE(x) case x: gbp_string = #x; break; + CASE (GBP_OTHER) + CASE (GBP_CR) + CASE (GBP_LF) + CASE (GBP_CONTROL) + CASE (GBP_EXTEND) + CASE (GBP_PREPEND) + CASE (GBP_SPACINGMARK) + CASE (GBP_L) + CASE (GBP_V) + CASE (GBP_T) + CASE (GBP_LV) + CASE (GBP_LVT) +#undef CASE + default: + abort (); + } + + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string); + + need_comma = true; + } + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the per-character grapheme break property table. */ +static void +output_gbp_table (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct gbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Grapheme break property of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + gbp_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + gbp_table_add (&t, ch, unicode_org_gbp[ch]); + + gbp_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define gbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n", + t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "unigbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t) / 2); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) / 2; i++) + { + unsigned char *p = (unsigned char *) (t.result + level3_offset); + unsigned char value0 = p[i * 2]; + unsigned char value1 = p[i * 2 + 1]; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x%s", (value1 << 4) + value0, + (i+1 < (t.level3_size << t.p) / 2 ? "," : "")); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Stores in unicode_org_gbp[] the grapheme breaking property from the + GraphemeBreakProperty.txt file. */ +static void +fill_org_gbp (const char *graphemebreakproperty_filename) +{ + unsigned int i; + FILE *stream; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_org_gbp[i] = GBP_OTHER; + + stream = fopen (graphemebreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + lineno++; + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", GBP_CR) + PROP ("LF", GBP_LF) + PROP ("Control", GBP_CONTROL) + PROP ("Extend", GBP_EXTEND) + PROP ("Prepend", GBP_PREPEND) + PROP ("SpacingMark", GBP_SPACINGMARK) + PROP ("L", GBP_L) + PROP ("V", GBP_V) + PROP ("T", GBP_T) + PROP ("LV", GBP_LV) + PROP ("LVT", GBP_LVT) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, + graphemebreakproperty_filename, lineno); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_gbp[i] = propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Composition and decomposition. + Updated for Unicode TR #15 revision 33. */ + /* Maximum number of characters into which a single Unicode character can be decomposed. */ #define MAX_DECOMP_LENGTH 18 @@ -7149,7 +8528,7 @@ output_composition_tables (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); @@ -7191,7 +8570,7 @@ output_composition_tables (const char *filename, const char *version) 1527, which is quite good (60% filled). It requires an auxiliary table lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ - fprintf (stream, "struct composition_rule { char codes[4]; };\n"); + fprintf (stream, "struct composition_rule { char codes[6]; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define slot-name codes\n"); @@ -7229,16 +8608,9 @@ output_composition_tables (const char *filename, const char *version) if (strcmp (unicode_attributes[combined].combining, "0") != 0) abort (); - if (!(code1 < 0x10000)) - abort (); - if (!(code2 < 0x10000)) - abort (); - if (!(combined < 0x10000)) - abort (); - - fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", - (code1 >> 8) & 0xff, code1 & 0xff, - (code2 >> 8) & 0xff, code2 & 0xff, + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, combined); } } @@ -7289,7 +8661,7 @@ output_simple_mapping_test (const char *filename, fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); - fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); fprintf (stream, "#include \"test-mapping-part1.h\"\n"); @@ -7351,7 +8723,7 @@ output_simple_mapping (const char *filename, fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); t.p = 7; @@ -8273,20 +9645,22 @@ main (int argc, char * argv[]) const char *unicodedata_filename; const char *proplist_filename; const char *derivedproplist_filename; + const char *arabicshaping_filename; const char *scripts_filename; const char *blocks_filename; const char *proplist30_filename; const char *eastasianwidth_filename; const char *linebreak_filename; const char *wordbreakproperty_filename; + const char *graphemebreakproperty_filename; const char *compositionexclusions_filename; const char *specialcasing_filename; const char *casefolding_filename; const char *version; - if (argc != 14) + if (argc != 16) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", argv[0]); exit (1); } @@ -8294,27 +9668,31 @@ main (int argc, char * argv[]) unicodedata_filename = argv[1]; proplist_filename = argv[2]; derivedproplist_filename = argv[3]; - scripts_filename = argv[4]; - blocks_filename = argv[5]; - proplist30_filename = argv[6]; - eastasianwidth_filename = argv[7]; - linebreak_filename = argv[8]; - wordbreakproperty_filename = argv[9]; - compositionexclusions_filename = argv[10]; - specialcasing_filename = argv[11]; - casefolding_filename = argv[12]; - version = argv[13]; + arabicshaping_filename = argv[4]; + scripts_filename = argv[5]; + blocks_filename = argv[6]; + proplist30_filename = argv[7]; + eastasianwidth_filename = argv[8]; + linebreak_filename = argv[9]; + wordbreakproperty_filename = argv[10]; + graphemebreakproperty_filename = argv[11]; + compositionexclusions_filename = argv[12]; + specialcasing_filename = argv[13]; + casefolding_filename = argv[14]; + version = argv[15]; fill_attributes (unicodedata_filename); clear_properties (); fill_properties (proplist_filename); fill_properties (derivedproplist_filename); fill_properties30 (proplist30_filename); + fill_arabicshaping (arabicshaping_filename); fill_scripts (scripts_filename); fill_blocks (blocks_filename); fill_width (eastasianwidth_filename); fill_org_lbp (linebreak_filename); fill_org_wbp (wordbreakproperty_filename); + fill_org_gbp (graphemebreakproperty_filename); fill_composition_exclusions (compositionexclusions_filename); fill_casing_rules (specialcasing_filename); fill_casefolding_rules (casefolding_filename); @@ -8323,7 +9701,7 @@ main (int argc, char * argv[]) output_categories (version); output_category ("unictype/categ_of.h", version); - output_combclass ("unictype/combining.h", version); + output_combclass ("unictype/combiningclass.h", version); output_bidi_category ("unictype/bidi_of.h", version); output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); output_decimal_digit ("unictype/decdigit.h", version); @@ -8333,10 +9711,17 @@ main (int argc, char * argv[]) output_numeric ("unictype/numeric.h", version); output_mirror ("unictype/mirror.h", version); output_properties (version); + output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version); + output_joining_type ("unictype/joiningtype_of.h", version); + output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version); + output_joining_group ("unictype/joininggroup_of.h", version); + output_scripts (version); output_scripts_byname (version); output_blocks (version); output_ident_properties (version); + output_nonspacing_property ("uniwidth/width.c.part"); + output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part"); output_old_ctype (version); debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); @@ -8347,6 +9732,9 @@ main (int argc, char * argv[]) debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h"); + output_gbp_table ("unigbrk/gbrkprop.h", version); + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); debug_output_composition_tables ("uninorm/composition.txt"); output_composition_tables ("uninorm/composition-table.gperf", version); @@ -8370,19 +9758,23 @@ main (int argc, char * argv[]) * compile-command: " gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \ ./gen-uni-tables \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \ - 5.1.0 + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \ + 6.0.0 \ + && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \ + && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt " * End: */