/* Generate Unicode conforming character classification tables and
line break properties tables and word break property tables and
decomposition/composition and case mapping tables from a UnicodeData file.
- Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.
+ Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
This program is free software: you can redistribute it and/or modify
$ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
/usr/local/share/Unidata/PropList.txt \
/usr/local/share/Unidata/DerivedCoreProperties.txt \
+ /usr/local/share/Unidata/ArabicShaping.txt \
/usr/local/share/Unidata/Scripts.txt \
/usr/local/share/Unidata/Blocks.txt \
/usr/local/share/Unidata/PropList-3.0.1.txt \
/usr/local/share/Unidata/EastAsianWidth.txt \
/usr/local/share/Unidata/LineBreak.txt \
/usr/local/share/Unidata/WordBreakProperty.txt \
+ /usr/local/share/Unidata/GraphemeBreakProperty.txt \
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 5.1.0
+ 6.0.0
*/
#include <stdbool.h>
i = strtoul (field0, NULL, 16);
if (field1[0] == '<'
&& strlen (field1) >= 9
- && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
+ && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
{
/* Deal with a range. */
lineno++;
field11, field12, field13, field14);
}
}
+
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
}
static bool
+is_category_LC (unsigned int ch)
+{
+  /* LC is the union of the general categories Lu, Ll and Lt.
+     See PropertyValueAliases.txt.  */
+  const char *category;
+
+  /* An entry whose name is NULL is not a member of any category.  */
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+
+  category = unicode_attributes[ch].category;
+  if (category[0] != 'L')
+    return false;
+
+  switch (category[1])
+    {
+    case 'u':
+    case 'l':
+    case 't':
+      return true;
+    default:
+      return false;
+    }
+}
+
+static bool
is_category_Lu (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* %s of Unicode characters. */\n", comment);
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 4; /* or: 5 */
output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
CATEGORY (L)
+ CATEGORY (LC)
CATEGORY (Lu)
CATEGORY (Ll)
CATEGORY (Lt)
enum
{
UC_CATEGORY_MASK_L = 0x0000001f,
+ UC_CATEGORY_MASK_LC = 0x00000007,
UC_CATEGORY_MASK_Lu = 0x00000001,
UC_CATEGORY_MASK_Ll = 0x00000002,
UC_CATEGORY_MASK_Lt = 0x00000004,
switch (category_name[1])
{
case '\0': return UC_CATEGORY_MASK_L;
+ case 'C': return UC_CATEGORY_MASK_LC;
case 'u': return UC_CATEGORY_MASK_Lu;
case 'l': return UC_CATEGORY_MASK_Ll;
case 't': return UC_CATEGORY_MASK_Lt;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Categories of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Combining class of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
need_comma = false;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
need_comma = false;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Numeric values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
need_comma = false;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Numeric values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
/* Create table of occurring fractions. */
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Mirrored Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
/* ========================================================================= */
+/* Particular values of the word break property. */
+
+static bool
+is_WBP_MIDNUMLET (unsigned int ch)
+{
+  /* Hard-coded list of the code points that this generator treats as
+     having the word break property value MidNumLet.  */
+  switch (ch)
+    {
+    case 0x0027:
+    case 0x002E:
+    case 0x2018:
+    case 0x2019:
+    case 0x2024:
+    case 0xFE52:
+    case 0xFF07:
+    case 0xFF0E:
+      return true;
+    default:
+      return false;
+    }
+}
+
+static bool
+is_WBP_MIDLETTER (unsigned int ch)
+{
+  /* Hard-coded list of the code points that this generator treats as
+     having the word break property value MidLetter.  */
+  switch (ch)
+    {
+    case 0x003A:
+    case 0x00B7:
+    case 0x0387:
+    case 0x05F4:
+    case 0x2027:
+    case 0xFE13:
+    case 0xFE55:
+    case 0xFF1A:
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* ========================================================================= */
+
/* Properties. */
/* Reading PropList.txt and DerivedCoreProperties.txt. */
PROP_ALPHABETIC,
PROP_LOWERCASE,
PROP_UPPERCASE,
+ PROP_CASED,
+ PROP_CASE_IGNORABLE,
+ PROP_CHANGES_WHEN_LOWERCASED,
+ PROP_CHANGES_WHEN_UPPERCASED,
+ PROP_CHANGES_WHEN_TITLECASED,
+ PROP_CHANGES_WHEN_CASEFOLDED,
+ PROP_CHANGES_WHEN_CASEMAPPED,
PROP_ID_START,
PROP_ID_CONTINUE,
PROP_XID_START,
PROP ("Alphabetic", PROP_ALPHABETIC)
PROP ("Lowercase", PROP_LOWERCASE)
PROP ("Uppercase", PROP_UPPERCASE)
+ PROP ("Cased", PROP_CASED)
+ PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
+ PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
+ PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
+ PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
+ PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
+ PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
PROP ("ID_Start", PROP_ID_START)
PROP ("ID_Continue", PROP_ID_CONTINUE)
PROP ("XID_Start", PROP_XID_START)
for (i = i1; i <= i2; i++)
array[i] = 1;
}
+
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", proplist_filename);
|| (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
|| (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
|| (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
+ || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
|| (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
|| (ch == 0x10341) /* GOTHIC LETTER NINETY */
|| (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
bool result1 =
(is_category_Cf (ch)
&& !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
- && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
+ && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
+ /* For some reason, the following are not listed as having property
+ Default_Ignorable_Code_Point. */
+ && !(ch == 0x110BD))
|| ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
|| ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
bool result2 =
return is_category_Lt (ch);
}
+/* See DerivedCoreProperties.txt.
+   The property is derived here as Lowercase || Uppercase || category Lt
+   and cross-checked against the PROP_CASED bit of unicode_properties.  */
+static bool
+is_property_cased (unsigned int ch)
+{
+  bool derived = (is_property_lowercase (ch)
+                  || is_property_uppercase (ch)
+                  || is_category_Lt (ch));
+  bool listed = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
+
+  /* Both sources must agree; otherwise give up.  */
+  if (derived != listed)
+    abort ();
+
+  return derived;
+}
+
+/* See DerivedCoreProperties.txt.
+   The property is derived here from the word break values MidLetter and
+   MidNumLet and from the general categories Mn, Me, Cf, Lm, Sk, and
+   cross-checked against the PROP_CASE_IGNORABLE bit of
+   unicode_properties.  */
+static bool
+is_property_case_ignorable (unsigned int ch)
+{
+  bool derived = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+                  || is_category_Mn (ch)
+                  || is_category_Me (ch)
+                  || is_category_Cf (ch)
+                  || is_category_Lm (ch)
+                  || is_category_Sk (ch));
+  bool listed =
+    ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
+
+  /* Both sources must agree; otherwise give up.  */
+  if (derived != listed)
+    abort ();
+
+  return derived;
+}
+
+/* See DerivedCoreProperties.txt.
+   The PROP_CHANGES_WHEN_LOWERCASED bit of unicode_properties is returned,
+   after being cross-checked against the simple lowercase mapping stored
+   in unicode_attributes.  */
+static bool
+is_property_changes_when_lowercased (unsigned int ch)
+{
+  bool listed =
+    ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
+  /* True if the character has a name and a lowercase mapping that differs
+     from the character itself.  */
+  bool from_mapping = (unicode_attributes[ch].name != NULL
+                       && unicode_attributes[ch].lower != NONE
+                       && unicode_attributes[ch].lower != ch);
+
+  /* Both sources must agree; otherwise give up.  */
+  if (listed != from_mapping)
+    abort ();
+
+  return listed;
+}
+
+/* See DerivedCoreProperties.txt.
+   The value is the PROP_CHANGES_WHEN_UPPERCASED bit of unicode_properties;
+   no independent cross-check is made.  */
+static bool
+is_property_changes_when_uppercased (unsigned int ch)
+{
+  const unsigned long long bit = 1ULL << PROP_CHANGES_WHEN_UPPERCASED;
+
+  return (unicode_properties[ch] & bit) != 0;
+}
+
+/* See DerivedCoreProperties.txt.
+   The value is the PROP_CHANGES_WHEN_TITLECASED bit of unicode_properties;
+   no independent cross-check is made.  */
+static bool
+is_property_changes_when_titlecased (unsigned int ch)
+{
+  const unsigned long long bit = 1ULL << PROP_CHANGES_WHEN_TITLECASED;
+
+  return (unicode_properties[ch] & bit) != 0;
+}
+
+/* See DerivedCoreProperties.txt.
+   The value is the PROP_CHANGES_WHEN_CASEFOLDED bit of unicode_properties;
+   no independent cross-check is made.  */
+static bool
+is_property_changes_when_casefolded (unsigned int ch)
+{
+  const unsigned long long bit = 1ULL << PROP_CHANGES_WHEN_CASEFOLDED;
+
+  return (unicode_properties[ch] & bit) != 0;
+}
+
+/* See DerivedCoreProperties.txt.
+   The value is the PROP_CHANGES_WHEN_CASEMAPPED bit of unicode_properties;
+   no independent cross-check is made.  */
+static bool
+is_property_changes_when_casemapped (unsigned int ch)
+{
+  const unsigned long long bit = 1ULL << PROP_CHANGES_WHEN_CASEMAPPED;
+
+  return (unicode_properties[ch] & bit) != 0;
+}
+
/* See PropList.txt, UCD.html. */
static bool
is_property_soft_dotted (unsigned int ch)
PROPERTY(lowercase)
PROPERTY(other_lowercase)
PROPERTY(titlecase)
+ PROPERTY(cased)
+ PROPERTY(case_ignorable)
+ PROPERTY(changes_when_lowercased)
+ PROPERTY(changes_when_uppercased)
+ PROPERTY(changes_when_titlecased)
+ PROPERTY(changes_when_casefolded)
+ PROPERTY(changes_when_casemapped)
PROPERTY(soft_dotted)
PROPERTY(id_start)
PROPERTY(other_id_start)
/* ========================================================================= */
-/* Scripts. */
+/* Arabic Shaping. */
-static const char *scripts[256];
-static unsigned int numscripts;
+enum
+{
+ UC_JOINING_TYPE_U, /* Non_Joining */
+ UC_JOINING_TYPE_T, /* Transparent */
+ UC_JOINING_TYPE_C, /* Join_Causing */
+ UC_JOINING_TYPE_L, /* Left_Joining */
+ UC_JOINING_TYPE_R, /* Right_Joining */
+ UC_JOINING_TYPE_D /* Dual_Joining */
+};
-static uint8_t unicode_scripts[0x110000];
+static uint8_t unicode_joining_type[0x110000];
+
+enum
+{
+ UC_JOINING_GROUP_NONE, /* No_Joining_Group */
+ UC_JOINING_GROUP_AIN, /* Ain */
+ UC_JOINING_GROUP_ALAPH, /* Alaph */
+ UC_JOINING_GROUP_ALEF, /* Alef */
+ UC_JOINING_GROUP_BEH, /* Beh */
+ UC_JOINING_GROUP_BETH, /* Beth */
+ UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
+ UC_JOINING_GROUP_DAL, /* Dal */
+ UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
+ UC_JOINING_GROUP_E, /* E */
+ UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
+ UC_JOINING_GROUP_FE, /* Fe */
+ UC_JOINING_GROUP_FEH, /* Feh */
+ UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
+ UC_JOINING_GROUP_GAF, /* Gaf */
+ UC_JOINING_GROUP_GAMAL, /* Gamal */
+ UC_JOINING_GROUP_HAH, /* Hah */
+ UC_JOINING_GROUP_HE, /* He */
+ UC_JOINING_GROUP_HEH, /* Heh */
+ UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
+ UC_JOINING_GROUP_HETH, /* Heth */
+ UC_JOINING_GROUP_KAF, /* Kaf */
+ UC_JOINING_GROUP_KAPH, /* Kaph */
+ UC_JOINING_GROUP_KHAPH, /* Khaph */
+ UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
+ UC_JOINING_GROUP_LAM, /* Lam */
+ UC_JOINING_GROUP_LAMADH, /* Lamadh */
+ UC_JOINING_GROUP_MEEM, /* Meem */
+ UC_JOINING_GROUP_MIM, /* Mim */
+ UC_JOINING_GROUP_NOON, /* Noon */
+ UC_JOINING_GROUP_NUN, /* Nun */
+ UC_JOINING_GROUP_NYA, /* Nya */
+ UC_JOINING_GROUP_PE, /* Pe */
+ UC_JOINING_GROUP_QAF, /* Qaf */
+ UC_JOINING_GROUP_QAPH, /* Qaph */
+ UC_JOINING_GROUP_REH, /* Reh */
+ UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
+ UC_JOINING_GROUP_SAD, /* Sad */
+ UC_JOINING_GROUP_SADHE, /* Sadhe */
+ UC_JOINING_GROUP_SEEN, /* Seen */
+ UC_JOINING_GROUP_SEMKATH, /* Semkath */
+ UC_JOINING_GROUP_SHIN, /* Shin */
+ UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
+ UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
+ UC_JOINING_GROUP_TAH, /* Tah */
+ UC_JOINING_GROUP_TAW, /* Taw */
+ UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
+ UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
+ UC_JOINING_GROUP_TETH, /* Teth */
+ UC_JOINING_GROUP_WAW, /* Waw */
+ UC_JOINING_GROUP_YEH, /* Yeh */
+ UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
+ UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
+ UC_JOINING_GROUP_YUDH, /* Yudh */
+ UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
+ UC_JOINING_GROUP_ZAIN, /* Zain */
+ UC_JOINING_GROUP_ZHAIN /* Zhain */
+};
+
+static uint8_t unicode_joining_group[0x110000];
static void
-fill_scripts (const char *scripts_filename)
+fill_arabicshaping (const char *arabicshaping_filename)
{
FILE *stream;
unsigned int i;
+ int lineno;
- stream = fopen (scripts_filename, "r");
+ stream = fopen (arabicshaping_filename, "r");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+ fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
exit (1);
}
- numscripts = 0;
-
for (i = 0; i < 0x110000; i++)
- unicode_scripts[i] = (uint8_t)~(uint8_t)0;
+ {
+ unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
+ unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
+ }
+ lineno = 0;
for (;;)
{
- char buf[200+1];
- unsigned int i1, i2;
- char padding[200+1];
- char scriptname[200+1];
- int script;
+ char buf[100+1];
+ char separator1[100+1];
+ char padding1[100+1];
+ char schematic_name[100+1];
+ char separator2[100+1];
+ char padding2[100+1];
+ char joining_type_name[100+1];
+ char separator3[100+1];
+ char padding3[100+1];
+ char joining_group_name[100+1];
+ int joining_type;
+ int joining_group;
- if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ lineno++;
+ if (fscanf (stream, "%100[^\n]\n", buf) < 1)
break;
if (buf[0] == '\0' || buf[0] == '#')
continue;
- if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
+ if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
+ &i, separator1, padding1, schematic_name, separator2,
+ padding2, joining_type_name, separator3, padding3,
+ joining_group_name) != 10)
{
- if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
- {
- fprintf (stderr, "parse error in '%s'\n", scripts_filename);
- exit (1);
- }
- i2 = i1;
+ fprintf (stderr, "parse error in '%s':%d\n",
+ arabicshaping_filename, lineno);
+ exit (1);
}
- if (i2 < i1)
- abort ();
- if (i2 >= 0x110000)
+ if (i >= 0x110000)
abort ();
- for (script = numscripts - 1; script >= 0; script--)
- if (strcmp (scripts[script], scriptname) == 0)
- break;
- if (script < 0)
+#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
+ if (false) {}
+ TRY(UC_JOINING_TYPE_U)
+ TRY(UC_JOINING_TYPE_T)
+ TRY(UC_JOINING_TYPE_C)
+ TRY(UC_JOINING_TYPE_L)
+ TRY(UC_JOINING_TYPE_R)
+ TRY(UC_JOINING_TYPE_D)
+#undef TRY
+ else
{
- scripts[numscripts] = strdup (scriptname);
- script = numscripts;
- numscripts++;
- if (numscripts == 256)
- abort ();
+ fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
+ joining_type_name, arabicshaping_filename, lineno);
+ exit (1);
}
- for (i = i1; i <= i2; i++)
+ /* Remove trailing spaces. */
+ while (joining_group_name[0] != '\0'
+ && joining_group_name[strlen (joining_group_name) - 1] == ' ')
+ joining_group_name[strlen (joining_group_name) - 1] = '\0';
+
+#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
+ if (false) {}
+ TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
+ TRY(UC_JOINING_GROUP_AIN, "AIN")
+ TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
+ TRY(UC_JOINING_GROUP_ALEF, "ALEF")
+ TRY(UC_JOINING_GROUP_BEH, "BEH")
+ TRY(UC_JOINING_GROUP_BETH, "BETH")
+ TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
+ TRY(UC_JOINING_GROUP_DAL, "DAL")
+ TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
+ TRY(UC_JOINING_GROUP_E, "E")
+ TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
+ TRY(UC_JOINING_GROUP_FE, "FE")
+ TRY(UC_JOINING_GROUP_FEH, "FEH")
+ TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
+ TRY(UC_JOINING_GROUP_GAF, "GAF")
+ TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
+ TRY(UC_JOINING_GROUP_HAH, "HAH")
+ TRY(UC_JOINING_GROUP_HE, "HE")
+ TRY(UC_JOINING_GROUP_HEH, "HEH")
+ TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
+ TRY(UC_JOINING_GROUP_HETH, "HETH")
+ TRY(UC_JOINING_GROUP_KAF, "KAF")
+ TRY(UC_JOINING_GROUP_KAPH, "KAPH")
+ TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
+ TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
+ TRY(UC_JOINING_GROUP_LAM, "LAM")
+ TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
+ TRY(UC_JOINING_GROUP_MEEM, "MEEM")
+ TRY(UC_JOINING_GROUP_MIM, "MIM")
+ TRY(UC_JOINING_GROUP_NOON, "NOON")
+ TRY(UC_JOINING_GROUP_NUN, "NUN")
+ TRY(UC_JOINING_GROUP_NYA, "NYA")
+ TRY(UC_JOINING_GROUP_PE, "PE")
+ TRY(UC_JOINING_GROUP_QAF, "QAF")
+ TRY(UC_JOINING_GROUP_QAPH, "QAPH")
+ TRY(UC_JOINING_GROUP_REH, "REH")
+ TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
+ TRY(UC_JOINING_GROUP_SAD, "SAD")
+ TRY(UC_JOINING_GROUP_SADHE, "SADHE")
+ TRY(UC_JOINING_GROUP_SEEN, "SEEN")
+ TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
+ TRY(UC_JOINING_GROUP_SHIN, "SHIN")
+ TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
+ TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
+ TRY(UC_JOINING_GROUP_TAH, "TAH")
+ TRY(UC_JOINING_GROUP_TAW, "TAW")
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
+ TRY(UC_JOINING_GROUP_TETH, "TETH")
+ TRY(UC_JOINING_GROUP_WAW, "WAW")
+ TRY(UC_JOINING_GROUP_YEH, "YEH")
+ TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
+ TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
+ TRY(UC_JOINING_GROUP_YUDH, "YUDH")
+ TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
+ TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
+ TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
+#undef TRY
+ else
{
- if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
- fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
- unicode_scripts[i] = script;
+ fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
+ joining_group_name, arabicshaping_filename, lineno);
+ exit (1);
}
+
+ unicode_joining_type[i] = joining_type;
+ unicode_joining_group[i] = joining_group;
}
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+ fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
exit (1);
}
}
-/* Construction of sparse 3-level tables. */
-#define TABLE script_table
-#define ELEMENT uint8_t
-#define DEFAULT (uint8_t)~(uint8_t)0
-#define xmalloc malloc
-#define xrealloc realloc
-#include "3level.h"
+/* Convert a Joining_Type value to a C identifier.
+   Returns the name of the UC_JOINING_TYPE_* enumerator equal to
+   JOINING_TYPE as a string literal; aborts for any other value.  */
+static const char *
+joining_type_as_c_identifier (int joining_type)
+{
+#define CASE(value) case value: return #value;
+  switch (joining_type)
+    {
+      CASE(UC_JOINING_TYPE_U)
+      CASE(UC_JOINING_TYPE_T)
+      CASE(UC_JOINING_TYPE_C)
+      CASE(UC_JOINING_TYPE_L)
+      CASE(UC_JOINING_TYPE_R)
+      CASE(UC_JOINING_TYPE_D)
+    default:
+      break;
+    }
+#undef CASE
+  abort ();
+}
static void
-output_scripts (const char *version)
+output_joining_type_test (const char *filename, const char *version)
{
- const char *filename = "unictype/scripts.h";
FILE *stream;
- unsigned int ch, s, i;
- struct script_table t;
- unsigned int level1_offset, level2_offset, level3_offset;
-
- typedef struct
- {
- const char *lowercase_name;
- }
- scriptinfo_t;
- scriptinfo_t scriptinfo[256];
+ bool need_comma;
+ unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Unicode scripts. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
- for (s = 0; s < numscripts; s++)
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
{
- char *lcp = strdup (scripts[s]);
- char *cp;
-
- for (cp = lcp; *cp != '\0'; cp++)
- if (*cp >= 'A' && *cp <= 'Z')
- *cp += 'a' - 'A';
+ int value = unicode_joining_type[ch];
- scriptinfo[s].lowercase_name = lcp;
+ if (value != (uint8_t)~(uint8_t)0)
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
+ need_comma = true;
+ }
}
+ if (need_comma)
+ fprintf (stream, "\n");
- for (s = 0; s < numscripts; s++)
+ if (ferror (stream) || fclose (stream))
{
- fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, "{\n");
- i = 0;
- for (ch = 0; ch < 0x110000; ch++)
- if (unicode_scripts[ch] == s)
- {
- unsigned int start;
- unsigned int end;
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
- start = ch;
- while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
- ch++;
- end = ch;
+/* Construction of sparse 3-level tables. */
+#define TABLE joining_type_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
- if (i > 0)
- fprintf (stream, ",\n");
- if (start == end)
- fprintf (stream, " { 0x%04X, 1, 1 }", start);
- else
- fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
- start, end);
- i++;
- }
- fprintf (stream, "\n");
- fprintf (stream, "};\n");
- }
+static void
+output_joining_type (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch, i;
+ struct joining_type_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+ uint8_t *level3_packed;
- fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
- fprintf (stream, "{\n");
- for (s = 0; s < numscripts; s++)
+ stream = fopen (filename, "w");
+ if (stream == NULL)
{
- fprintf (stream, " {\n");
- fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, " script_%s_intervals,\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, " \"%s\"\n", scripts[s]);
- fprintf (stream, " }");
- if (s+1 < numscripts)
- fprintf (stream, ",");
- fprintf (stream, "\n");
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
}
- fprintf (stream, "};\n");
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
- script_table_init (&t);
+ joining_type_table_init (&t);
for (ch = 0; ch < 0x110000; ch++)
{
- unsigned int s = unicode_scripts[ch];
- if (s != (uint8_t)~(uint8_t)0)
- script_table_add (&t, ch, s);
+ uint8_t value = unicode_joining_type[ch];
+
+ joining_type_table_add (&t, ch, value);
}
- script_table_finalize (&t);
+ joining_type_table_finalize (&t);
/* Offsets in t.result, in memory of this process. */
level1_offset =
+ (t.level2_size << t.q) * sizeof (uint32_t);
for (i = 0; i < 5; i++)
- fprintf (stream, "#define script_header_%d %d\n", i,
+ fprintf (stream, "#define joining_type_header_%d %d\n", i,
((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
+ (1 << t.p) * 4 / 8);
fprintf (stream, " }\n");
- fprintf (stream, "u_script =\n");
+ fprintf (stream, "u_joining_type =\n");
fprintf (stream, "{\n");
fprintf (stream, " {");
if (t.level1_size > 8)
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
fprintf (stream, " },\n");
- fprintf (stream, " {");
- if (t.level3_size << t.p > 8)
- fprintf (stream, "\n ");
+ /* Pack the level3 array. Each entry needs 4 bits only. */
+ level3_packed =
+ (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
for (i = 0; i < t.level3_size << t.p; i++)
{
- if (i > 0 && (i % 8) == 0)
+ unsigned int j = (i * 4) / 8;
+ unsigned int k = (i * 4) % 8;
+ uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
+ level3_packed[j] |= (value << k);
+ }
+ fprintf (stream, " {");
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%02x", level3_packed[i]);
+ if (i+1 < (t.level3_size << t.p) * 4 / 8)
+ fprintf (stream, ",");
+ }
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ free (level3_packed);
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Convert a Joining_Group value to a C identifier.
+   Returns the name of the UC_JOINING_GROUP_* enumerator equal to
+   JOINING_GROUP as a string literal; aborts for any other value.  */
+static const char *
+joining_group_as_c_identifier (int joining_group)
+{
+#define CASE(value) case value: return #value;
+  switch (joining_group)
+    {
+      CASE(UC_JOINING_GROUP_NONE)
+      CASE(UC_JOINING_GROUP_AIN)
+      CASE(UC_JOINING_GROUP_ALAPH)
+      CASE(UC_JOINING_GROUP_ALEF)
+      CASE(UC_JOINING_GROUP_BEH)
+      CASE(UC_JOINING_GROUP_BETH)
+      CASE(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
+      CASE(UC_JOINING_GROUP_DAL)
+      CASE(UC_JOINING_GROUP_DALATH_RISH)
+      CASE(UC_JOINING_GROUP_E)
+      CASE(UC_JOINING_GROUP_FARSI_YEH)
+      CASE(UC_JOINING_GROUP_FE)
+      CASE(UC_JOINING_GROUP_FEH)
+      CASE(UC_JOINING_GROUP_FINAL_SEMKATH)
+      CASE(UC_JOINING_GROUP_GAF)
+      CASE(UC_JOINING_GROUP_GAMAL)
+      CASE(UC_JOINING_GROUP_HAH)
+      CASE(UC_JOINING_GROUP_HE)
+      CASE(UC_JOINING_GROUP_HEH)
+      CASE(UC_JOINING_GROUP_HEH_GOAL)
+      CASE(UC_JOINING_GROUP_HETH)
+      CASE(UC_JOINING_GROUP_KAF)
+      CASE(UC_JOINING_GROUP_KAPH)
+      CASE(UC_JOINING_GROUP_KHAPH)
+      CASE(UC_JOINING_GROUP_KNOTTED_HEH)
+      CASE(UC_JOINING_GROUP_LAM)
+      CASE(UC_JOINING_GROUP_LAMADH)
+      CASE(UC_JOINING_GROUP_MEEM)
+      CASE(UC_JOINING_GROUP_MIM)
+      CASE(UC_JOINING_GROUP_NOON)
+      CASE(UC_JOINING_GROUP_NUN)
+      CASE(UC_JOINING_GROUP_NYA)
+      CASE(UC_JOINING_GROUP_PE)
+      CASE(UC_JOINING_GROUP_QAF)
+      CASE(UC_JOINING_GROUP_QAPH)
+      CASE(UC_JOINING_GROUP_REH)
+      CASE(UC_JOINING_GROUP_REVERSED_PE)
+      CASE(UC_JOINING_GROUP_SAD)
+      CASE(UC_JOINING_GROUP_SADHE)
+      CASE(UC_JOINING_GROUP_SEEN)
+      CASE(UC_JOINING_GROUP_SEMKATH)
+      CASE(UC_JOINING_GROUP_SHIN)
+      CASE(UC_JOINING_GROUP_SWASH_KAF)
+      CASE(UC_JOINING_GROUP_SYRIAC_WAW)
+      CASE(UC_JOINING_GROUP_TAH)
+      CASE(UC_JOINING_GROUP_TAW)
+      CASE(UC_JOINING_GROUP_TEH_MARBUTA)
+      CASE(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
+      CASE(UC_JOINING_GROUP_TETH)
+      CASE(UC_JOINING_GROUP_WAW)
+      CASE(UC_JOINING_GROUP_YEH)
+      CASE(UC_JOINING_GROUP_YEH_BARREE)
+      CASE(UC_JOINING_GROUP_YEH_WITH_TAIL)
+      CASE(UC_JOINING_GROUP_YUDH)
+      CASE(UC_JOINING_GROUP_YUDH_HE)
+      CASE(UC_JOINING_GROUP_ZAIN)
+      CASE(UC_JOINING_GROUP_ZHAIN)
+    default:
+      break;
+    }
+#undef CASE
+  abort ();
+}
+
+
+/* Write to FILENAME a comma-separated list of "{ code point, identifier }"
+   entries, one for every code point whose joining group is not
+   UC_JOINING_GROUP_NONE, preceded by a three-line header comment that
+   mentions VERSION.  Exits with status 1 if the file cannot be opened or
+   written.  */
+static void
+output_joining_group_test (const char *filename, const char *version)
+{
+  FILE *out = fopen (filename, "w");
+  unsigned int written = 0;
+  unsigned int ch;
+
+  if (out == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (out, "/* Arabic joining group of Unicode characters. */\n");
+  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      int group = unicode_joining_group[ch];
+
+      if (group == UC_JOINING_GROUP_NONE)
+        continue;
+
+      /* Entries are separated, not terminated, by ",\n".  */
+      if (written > 0)
+        fprintf (out, ",\n");
+      fprintf (out, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (group));
+      written++;
+    }
+  /* Terminate the last entry's line, if there was any entry at all.  */
+  if (written > 0)
+    fprintf (out, "\n");
+
+  if (ferror (out) || fclose (out))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Write to FILENAME the table of Arabic joining groups.
+   The output consists of a three-line header comment that mentions VERSION,
+   a "#define joining_group_header_0" holding the first code point that has
+   a joining group other than UC_JOINING_GROUP_NONE, and an array
+   u_joining_group[] with one UC_JOINING_GROUP_* identifier for every code
+   point from that first one up to the last one that has a joining group.
+   Aborts if no code point has a joining group or if that interval spans
+   0x200 or more code points.  Exits with status 1 if the file cannot be
+   opened or written.  */
+static void
+output_joining_group (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int ch_min, ch_max, ch, i;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  /* This file holds the joining group table, not the joining type table.  */
+  fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  /* Find the first code point that has a joining group.  */
+  ch_min = 0x10FFFF;
+  for (ch = 0; ch < 0x110000; ch++)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_min = ch;
+        break;
+      }
+
+  /* Find the last code point that has a joining group.  */
+  ch_max = 0;
+  for (ch = 0x10FFFF; ch > 0; ch--)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_max = ch;
+        break;
+      }
+
+  /* This fails when no code point has a joining group at all.  */
+  if (!(ch_min <= ch_max))
+    abort ();
+
+  /* If the interval [ch_min, ch_max] is too large, we had better use a
+     3-level table instead of a flat array.  */
+  if (!(ch_max - ch_min < 0x200))
+    abort ();
+
+  fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
+  fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
+           ch_max + 1, ch_min);
+  fprintf (stream, "{");
+  /* Emit two identifiers per output line.  */
+  for (i = 0; i <= ch_max - ch_min; i++)
+    {
+      const char *s;
+
+      ch = ch_min + i;
+      if ((i % 2) == 0)
+        fprintf (stream, "\n ");
+      s = joining_group_as_c_identifier (unicode_joining_group[ch]);
+      fprintf (stream, " %s", s);
+      if (i+1 <= ch_max - ch_min)
+        {
+          fprintf (stream, ",");
+          /* Pad the first column to 38 characters, the length of the
+             longest identifier, UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE.  */
+          if (((i+1) % 2) != 0)
+            fprintf (stream, "%*s", 38 - (int) strlen (s), "");
+        }
+    }
+  fprintf (stream, "\n");
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Scripts. */
+
+static const char *scripts[256];
+static unsigned int numscripts;
+
+static uint8_t unicode_scripts[0x110000];
+
+/* Read the script assignments from SCRIPTS_FILENAME.
+   Fills scripts[] and numscripts with the distinct script names in order
+   of first appearance, and unicode_scripts[] with the index of each code
+   point's script, or (uint8_t)~(uint8_t)0 for code points the file does
+   not mention.  Exits with status 1 if the file cannot be opened, parsed
+   or read; aborts on an invalid range or when 256 scripts are reached.  */
+static void
+fill_scripts (const char *scripts_filename)
+{
+  FILE *stream;
+  unsigned int ch;
+
+  stream = fopen (scripts_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+      exit (1);
+    }
+
+  numscripts = 0;
+
+  /* Mark every code point as not assigned to any script yet.  */
+  for (ch = 0; ch < 0x110000; ch++)
+    unicode_scripts[ch] = (uint8_t)~(uint8_t)0;
+
+  for (;;)
+    {
+      char line[200+1];
+      unsigned int first, last;
+      char padding[200+1];
+      char scriptname[200+1];
+      int script;
+
+      if (fscanf (stream, "%200[^\n]\n", line) < 1)
+        break;
+
+      /* Skip empty lines and comment lines.  */
+      if (line[0] == '\0' || line[0] == '#')
+        continue;
+
+      /* Try the range form "XXXX..YYYY ; Name" first, then fall back to
+         the single code point form "XXXX ; Name".  */
+      if (sscanf (line, "%X..%X%[ ;]%[^ ]", &first, &last, padding, scriptname) != 4)
+        {
+          if (sscanf (line, "%X%[ ;]%[^ ]", &first, padding, scriptname) != 3)
+            {
+              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
+              exit (1);
+            }
+          last = first;
+        }
+      if (last < first)
+        abort ();
+      if (last >= 0x110000)
+        abort ();
+
+      /* Search the scripts seen so far, most recently added first.  */
+      script = numscripts - 1;
+      while (script >= 0 && strcmp (scripts[script], scriptname) != 0)
+        script--;
+      if (script < 0)
+        {
+          /* First occurrence of this name: register a new script.  */
+          scripts[numscripts] = strdup (scriptname);
+          script = numscripts;
+          numscripts++;
+          if (numscripts == 256)
+            abort ();
+        }
+
+      for (ch = first; ch <= last; ch++)
+        {
+          /* Warn about, but still apply, a second assignment.  */
+          if (unicode_scripts[ch] != (uint8_t)~(uint8_t)0)
+            fprintf (stderr, "0x%04X belongs to multiple scripts\n", ch);
+          unicode_scripts[ch] = script;
+        }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+      exit (1);
+    }
+}
+
+/* Construction of sparse 3-level tables. */
+/* The macros below parameterize "3level.h" for the script table:
+   TABLE names the generated type (struct script_table) and prefixes its
+   functions (script_table_init, script_table_add, script_table_finalize,
+   as used by output_scripts); ELEMENT is the stored value type.
+   DEFAULT is presumably the value of entries that were never added
+   (it equals the "no script" marker of unicode_scripts) - confirm against
+   3level.h.  xmalloc and xrealloc are mapped directly to malloc and
+   realloc.  */
+#define TABLE script_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+static void
+output_scripts (const char *version)
+{
+ const char *filename = "unictype/scripts.h";
+ FILE *stream;
+ unsigned int ch, s, i;
+ struct script_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ typedef struct
+ {
+ const char *lowercase_name;
+ }
+ scriptinfo_t;
+ scriptinfo_t scriptinfo[256];
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Unicode scripts. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ for (s = 0; s < numscripts; s++)
+ {
+ char *lcp = strdup (scripts[s]);
+ char *cp;
+
+ for (cp = lcp; *cp != '\0'; cp++)
+ if (*cp >= 'A' && *cp <= 'Z')
+ *cp += 'a' - 'A';
+
+ scriptinfo[s].lowercase_name = lcp;
+ }
+
+ for (s = 0; s < numscripts; s++)
+ {
+ fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, "{\n");
+ i = 0;
+ for (ch = 0; ch < 0x110000; ch++)
+ if (unicode_scripts[ch] == s)
+ {
+ unsigned int start;
+ unsigned int end;
+
+ start = ch;
+ while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
+ ch++;
+ end = ch;
+
+ if (i > 0)
+ fprintf (stream, ",\n");
+ if (start == end)
+ fprintf (stream, " { 0x%04X, 1, 1 }", start);
+ else
+ fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
+ start, end);
+ i++;
+ }
+ fprintf (stream, "\n");
+ fprintf (stream, "};\n");
+ }
+
+ fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
+ fprintf (stream, "{\n");
+ for (s = 0; s < numscripts; s++)
+ {
+ fprintf (stream, " {\n");
+ fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " script_%s_intervals,\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " \"%s\"\n", scripts[s]);
+ fprintf (stream, " }");
+ if (s+1 < numscripts)
+ fprintf (stream, ",");
+ fprintf (stream, "\n");
+ }
+ fprintf (stream, "};\n");
+
+ t.p = 7;
+ t.q = 9;
+ script_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int s = unicode_scripts[ch];
+ if (s != (uint8_t)~(uint8_t)0)
+ script_table_add (&t, ch, s);
+ }
+
+ script_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define script_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_script =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
fprintf (stream, "\n ");
fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
if (i+1 < t.level3_size << t.p)
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Unicode scripts. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
- fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
+ fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
fprintf (stream, "%%struct-type\n");
fprintf (stream, "%%language=ANSI-C\n");
fprintf (stream, "%%define hash-function-name scripts_hash\n");
fprintf (stream, "%%readonly-tables\n");
fprintf (stream, "%%global-table\n");
fprintf (stream, "%%define word-array-name script_names\n");
+ fprintf (stream, "%%pic\n");
+ fprintf (stream, "%%define string-pool-name script_stringpool\n");
fprintf (stream, "%%%%\n");
for (s = 0; s < numscripts; s++)
fprintf (stream, "%s, %u\n", scripts[s], s);
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Unicode blocks. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
fprintf (stream, "static const uc_block_t blocks[] =\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7; /* or 8 */
fprintf (stream, "escape_char /\n");
fprintf (stream, "comment_char %%\n");
fprintf (stream, "\n");
- fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
+ fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
version);
fprintf (stream, "\n");
unicode_width[i] = strdup (field1);
}
}
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", width_filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* Non-spacing attribute and width. */
+
+/* The non-spacing attribute table consists of:
+ - Non-spacing characters; generated from PropList.txt or
+ "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
+ - Control characters; generated from
+ "grep '^[^;]*;[^;]*;Cc;' UnicodeData.txt"
+ - Format control characters; generated from
+ "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+ - Zero width characters; generated from
+ "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+ */
+
+/* Returns true if ch has an entry in unicode_attributes (name != NULL) and
+   is a bidi non-spacing mark, a Cc or Cf character, or has a name that
+   starts with "ZERO WIDTH ".  */
+static bool
+is_nonspacing (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+
+  /* Code points without attributes are never non-spacing.  */
+  if (name == NULL)
+    return false;
+
+  if (get_bidi_category (ch) == UC_BIDI_NSM)
+    return true;
+  if (is_category_Cc (ch) || is_category_Cf (ch))
+    return true;
+
+  return strncmp (name, "ZERO WIDTH ", 11) == 0;
+}
+
+/* Writes to FILENAME two C tables describing the is_nonspacing set:
+   - nonspacing_table_data: for each block of 0x200 code points that
+     contains at least one non-spacing character, 64 bytes of bits; bit L
+     of byte B of a block is set if (block start + 8*B + L) is non-spacing.
+   - nonspacing_table_ind: for each block, the index of its 64-byte slice
+     in nonspacing_table_data, or -1 if the block has no slice.  The table
+     covers all blocks up to the last one that has a slice, padded to a
+     multiple of 8 entries.
+   The block starting at 0xe0000 never gets a slice (see the comment in the
+   first loop).
+   Exits with status 1 if the file cannot be opened or written.  */
+static void
+output_nonspacing_property (const char *filename)
+{
+  FILE *stream;
+  int ind[0x110000 / 0x200];
+  unsigned int i;
+  unsigned int i_max;
+  int next_ind;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  /* Pass 1: assign a slice index to every block that needs one.  */
+  next_ind = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = false;
+      unsigned int ch;
+
+      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
+        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
+          if (is_nonspacing (ch))
+            {
+              nontrivial = true;
+              break;
+            }
+      if (nontrivial)
+        ind[i] = next_ind++;
+      else
+        ind[i] = -1;
+    }
+
+  /* The slice indices are emitted below as 'signed char' initializers, so
+     the largest index (next_ind - 1) must not exceed 127.  */
+  if (next_ind > 128)
+    abort ();
+
+  /* Pass 2: emit the bit data of all blocks that have a slice.  */
+  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
+           next_ind);
+  i_max = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = (ind[i] >= 0);
+
+      if (nontrivial)
+        {
+          unsigned int j;
+
+          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
+          for (j = 0; j < 8; j++)
+            {
+              unsigned int k;
+
+              fprintf (stream, " ");
+              for (k = 0; k < 8; k++)
+                {
+                  unsigned int l;
+                  unsigned char bits = 0;
+
+                  for (l = 0; l < 8; l++)
+                    {
+                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
+
+                      if (is_nonspacing (ch))
+                        bits |= 1 << l;
+                    }
+                  /* No comma after the very last byte of the table.  */
+                  fprintf (stream, " 0x%02x%c", bits,
+                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
+                }
+              fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
+            }
+          i_max = i;
+        }
+    }
+  fprintf (stream, "};\n");
+
+  /* i_max is the index of the last block that has a slice.  The index table
+     must cover the blocks 0..i_max inclusive, i.e. i_max + 1 entries,
+     rounded up to a multiple of 8 for the 8-entries-per-line layout.
+     (Rounding up i_max itself would drop the last block's entry whenever
+     i_max is a multiple of 8.)  The result never exceeds the size of
+     ind[], which is itself a multiple of 8.  */
+  i_max = ((i_max + 1 + 8 - 1) / 8) * 8;
+  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
+           i_max);
+  {
+    unsigned int j;
+
+    for (j = 0; j < i_max / 8; j++)
+      {
+        unsigned int k;
+
+        fprintf (stream, " ");
+        for (k = 0; k < 8; k++)
+          {
+            i = j * 8 + k;
+            /* No comma after the very last entry of the table.  */
+            fprintf (stream, " %2d%c", ind[i],
+                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
+          }
+        fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
+      }
+  }
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Returns the width of ch as one of 0, '0', '1', '2', 'A':
+   0   for Cc characters below U+00A0 and for unassigned code points
+       outside the ranges tested below,
+   '0' for non-spacing characters,
+   '2' for characters whose unicode_width entry is "W" or "F" and for
+       unassigned code points in the ideographic ranges,
+   '1' for characters whose unicode_width entry is "H" and for the
+       remaining characters outside U+00A1..U+FFFF,
+   'A' for unassigned private use code points and for the remaining
+       characters in U+00A1..U+FFFF.  */
+static char
+symbolic_width (unsigned int ch)
+{
+  const char *eaw;
+
+  /* Test for unassigned character. */
+  if (is_property_unassigned_code_value (ch))
+    {
+      /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
+      bool in_ideographic_range =
+        ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
+         || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
+         || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
+         || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
+         || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */);
+
+      if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
+        return 'A';
+      return in_ideographic_range ? '2' : 0;
+    }
+
+  /* From here on, ch is an assigned character. */
+
+  /* Test for non-spacing or control character. */
+  if (is_category_Cc (ch) && ch < 0x00A0)
+    return 0;
+  if (is_nonspacing (ch))
+    return '0';
+
+  eaw = unicode_width[ch];
+  if (eaw != NULL)
+    {
+      /* Test for double-width character. */
+      if (strcmp (eaw, "W") == 0 || strcmp (eaw, "F") == 0)
+        return '2';
+      /* Test for half-width character. */
+      if (strcmp (eaw, "H") == 0)
+        return '1';
+    }
+
+  /* In ancient CJK encodings, Cyrillic and most other characters are
+     double-width as well. */
+  return (ch >= 0x00A1 && ch < 0x10000) ? 'A' : '1';
+}
+
+/* Writes to FILENAME one line per interval of code points that share the
+   same symbolic_width value: "XXXX\t\tV" for a single code point, or
+   "XXXX..YYYY\tV" for a range, where V is the width character.
+   Code points whose symbolic_width is 0 are not listed and do not end an
+   interval, so a listed range may span such code points.
+   Exits with status 1 if the file cannot be opened or written.  */
+static void
+output_width_property_test (const char *filename)
+{
+ FILE *stream;
+ unsigned int interval_start, interval_end, ch;
+ char interval_value;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ /* interval_value == 0 means that no interval is open yet. */
+ interval_value = 0;
+ interval_start = interval_end = 0; /* avoid GCC warning */
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ char value = symbolic_width (ch);
+ if (value != 0) /* skip Cc control characters and unassigned characters */
+ {
+ if (value == interval_value)
+ /* Extend the interval. */
+ interval_end = ch;
+ else
+ {
+ /* Terminate the interval. */
+ if (interval_value != 0)
+ {
+ if (interval_end == interval_start)
+ fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+ else
+ fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+ }
+ /* Start a new interval. */
+ interval_start = interval_end = ch;
+ interval_value = value;
+ }
+ }
+ }
+ /* Terminate the last interval. */
+ if (interval_value != 0)
+ {
+ if (interval_end == interval_start)
+ fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+ else
+ fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+ }
+
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", width_filename);
+ fprintf (stderr, "error writing to '%s'\n", filename);
exit (1);
}
}
-/* Line breaking classification. */
+/* ========================================================================= */
+
+/* Line breaking classification.
+ Updated for Unicode TR #14 revision 26. */
enum
{
- /* Values >= 24 are resolved at run time. */
- LBP_BK = 24, /* mandatory break */
+ /* Values >= 25 are resolved at run time. */
+ LBP_BK = 25, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 25, /* attached characters and combining marks */
+ LBP_CM = 26, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 26, /* zero width space */
+ LBP_ZW = 27, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 27, /* space */
+ LBP_SP = 28, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 28, /* contingent break opportunity */
+ LBP_CB = 29, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
- LBP_EX = 7, /* exclamation/interrogation */
- LBP_IN = 8, /* inseparable */
- LBP_NS = 9, /* non starter */
- LBP_OP = 10, /* opening punctuation */
- LBP_QU = 11, /* ambiguous quotation */
- LBP_IS = 12, /* infix separator (numeric) */
- LBP_NU = 13, /* numeric */
- LBP_PO = 14, /* postfix (numeric) */
- LBP_PR = 15, /* prefix (numeric) */
- LBP_SY = 16, /* symbols allowing breaks */
- LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
- LBP_AL = 17, /* ordinary alphabetic and symbol characters */
- LBP_H2 = 18, /* Hangul LV syllable */
- LBP_H3 = 19, /* Hangul LVT syllable */
- LBP_ID = 20, /* ideographic */
- LBP_JL = 21, /* Hangul L Jamo */
- LBP_JV = 22, /* Hangul V Jamo */
- LBP_JT = 23, /* Hangul T Jamo */
- LBP_SA = 30, /* complex context (South East Asian) */
- LBP_XX = 31 /* unknown */
+ LBP_CP = 7, /* closing parenthesis */
+ LBP_EX = 8, /* exclamation/interrogation */
+ LBP_IN = 9, /* inseparable */
+ LBP_NS = 10, /* non starter */
+ LBP_OP = 11, /* opening punctuation */
+ LBP_QU = 12, /* ambiguous quotation */
+ LBP_IS = 13, /* infix separator (numeric) */
+ LBP_NU = 14, /* numeric */
+ LBP_PO = 15, /* postfix (numeric) */
+ LBP_PR = 16, /* prefix (numeric) */
+ LBP_SY = 17, /* symbols allowing breaks */
+ LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+ LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+ LBP_H2 = 19, /* Hangul LV syllable */
+ LBP_H3 = 20, /* Hangul LVT syllable */
+ LBP_ID = 21, /* ideographic */
+ LBP_JL = 22, /* Hangul L Jamo */
+ LBP_JV = 23, /* Hangul V Jamo */
+ LBP_JT = 24, /* Hangul T Jamo */
+ LBP_SA = 31, /* complex context (South East Asian) */
+ /* Note: the values now range from 0 to 32, i.e. 33 classes, so a bit mask
+    with one bit per class no longer fits in 32 bits; get_lbp builds its
+    mask as an int64_t for that reason. */
+ LBP_XX = 32 /* unknown */
};
/* Returns the line breaking classification for ch, as a bit mask. */
-static int
+static int64_t
get_lbp (unsigned int ch)
{
- int attr = 0;
+ int64_t attr = 0;
if (unicode_attributes[ch].name != NULL)
{
|| ch == 0x000B /* line tabulation */
|| ch == 0x2028 /* LINE SEPARATOR */
|| ch == 0x2029 /* PARAGRAPH SEPARATOR */)
- attr |= 1 << LBP_BK;
+ attr |= (int64_t) 1 << LBP_BK;
if (ch == 0x2060 /* WORD JOINER */
|| ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
- attr |= 1 << LBP_WJ;
+ attr |= (int64_t) 1 << LBP_WJ;
/* zero width space */
if (ch == 0x200B /* ZERO WIDTH SPACE */)
- attr |= 1 << LBP_ZW;
+ attr |= (int64_t) 1 << LBP_ZW;
/* non-breaking (glue) */
if (ch == 0x00A0 /* NO-BREAK SPACE */
|| ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
|| ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
|| ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
- || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
- attr |= 1 << LBP_GL;
+ || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
+ || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
+ attr |= (int64_t) 1 << LBP_GL;
/* space */
if (ch == 0x0020 /* SPACE */)
- attr |= 1 << LBP_SP;
+ attr |= (int64_t) 1 << LBP_SP;
/* break opportunity before and after */
if (ch == 0x2014 /* EM DASH */)
- attr |= 1 << LBP_B2;
+ attr |= (int64_t) 1 << LBP_B2;
/* break opportunity after */
- if (ch == 0x1680 /* OGHAM SPACE MARK */
+ if (/* Breaking Spaces */
+ ch == 0x1680 /* OGHAM SPACE MARK */
|| ch == 0x2000 /* EN QUAD */
|| ch == 0x2001 /* EM QUAD */
|| ch == 0x2002 /* EN SPACE */
|| ch == 0x2009 /* THIN SPACE */
|| ch == 0x200A /* HAIR SPACE */
|| ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
+ /* Tabs */
|| ch == 0x0009 /* tab */
+ /* Conditional Hyphens */
|| ch == 0x00AD /* SOFT HYPHEN */
+ /* Breaking Hyphens */
|| ch == 0x058A /* ARMENIAN HYPHEN */
+ || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
|| ch == 0x2010 /* HYPHEN */
|| ch == 0x2012 /* FIGURE DASH */
|| ch == 0x2013 /* EN DASH */
+ /* Visible Word Dividers */
|| ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
|| ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
|| ch == 0x1361 /* ETHIOPIC WORDSPACE */
|| ch == 0x17DA /* KHMER SIGN KOOMUUT */
|| ch == 0x2027 /* HYPHENATION POINT */
|| ch == 0x007C /* VERTICAL LINE */
+ /* Historic Word Separators */
|| ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
|| ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
|| ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
|| ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
|| ch == 0x2E2D /* FIVE DOT PUNCTUATION */
|| ch == 0x2E30 /* RING POINT */
+ || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
|| ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
|| ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
|| ch == 0x10102 /* AEGEAN CHECK MARK */
|| ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
|| ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
|| ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
+ /* Dandas */
|| ch == 0x0964 /* DEVANAGARI DANDA */
|| ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
|| ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
|| ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
|| ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
|| ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
+ /* Tibetan */
|| ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
|| ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
|| ch == 0x0F85 /* TIBETAN MARK PALUTA */
|| ch == 0x0FBE /* TIBETAN KU RU KHA */
|| ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
|| ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
+ /* Other Terminating Punctuation */
|| ch == 0x1804 /* MONGOLIAN COLON */
|| ch == 0x1805 /* MONGOLIAN FOUR DOTS */
|| ch == 0x1B5A /* BALINESE PANTI */
|| ch == 0x1B5B /* BALINESE PAMADA */
- || ch == 0x1B5C /* BALINESE WINDU */
|| ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
|| ch == 0x1B60 /* BALINESE PAMENENG */
|| ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
|| ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
|| ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
+ || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
+ || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
+ || ch == 0xA6F3 /* BAMUM FULL STOP */
+ || ch == 0xA6F4 /* BAMUM COLON */
+ || ch == 0xA6F5 /* BAMUM COMMA */
+ || ch == 0xA6F6 /* BAMUM SEMICOLON */
+ || ch == 0xA6F7 /* BAMUM QUESTION MARK */
+ || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
+ || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
+ || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+ || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
+ || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
+ || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
+ || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
+ || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
+ || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
+ || ch == 0x11047 /* BRAHMI DANDA */
+ || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
+ || ch == 0x110BE /* KAITHI SECTION MARK */
+ || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
+ || ch == 0x110C0 /* KAITHI DANDA */
+ || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
|| ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
|| ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
|| ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
- attr |= 1 << LBP_BA;
+ attr |= (int64_t) 1 << LBP_BA;
/* break opportunity before */
if (ch == 0x00B4 /* ACUTE ACCENT */
|| ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
|| ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
|| ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
- attr |= 1 << LBP_BB;
+ attr |= (int64_t) 1 << LBP_BB;
/* hyphen */
if (ch == 0x002D /* HYPHEN-MINUS */)
- attr |= 1 << LBP_HY;
+ attr |= (int64_t) 1 << LBP_HY;
/* contingent break opportunity */
if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
- attr |= 1 << LBP_CB;
+ attr |= (int64_t) 1 << LBP_CB;
+
+ /* closing parenthesis */
+ if (ch == 0x0029 /* RIGHT PARENTHESIS */
+ || ch == 0x005D /* RIGHT SQUARE BRACKET */)
+ attr |= (int64_t) 1 << LBP_CP;
/* closing punctuation */
if ((unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'e')
+ && unicode_attributes[ch].category[1] == 'e'
+ && !(attr & ((int64_t) 1 << LBP_CP)))
|| ch == 0x3001 /* IDEOGRAPHIC COMMA */
|| ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
|| ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
|| ch == 0xFF0C /* FULLWIDTH COMMA */
|| ch == 0xFF0E /* FULLWIDTH FULL STOP */
|| ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
- || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
- attr |= 1 << LBP_CL;
+ || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
+ || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
+ || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
+ || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
+ || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
+ || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
+ || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
+ || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
+ attr |= (int64_t) 1 << LBP_CL;
/* exclamation/interrogation */
if (ch == 0x0021 /* EXCLAMATION MARK */
|| ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
|| ch == 0x2CFE /* COPTIC FULL STOP */
|| ch == 0x2E2E /* REVERSED QUESTION MARK */
-#if REVISION_22
- || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
-#endif
|| ch == 0xA60E /* VAI FULL STOP */
|| ch == 0xA876 /* PHAGS-PA MARK SHAD */
|| ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
|| ch == 0xFE57 /* SMALL EXCLAMATION MARK */
|| ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
|| ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
- attr |= 1 << LBP_EX;
+ attr |= (int64_t) 1 << LBP_EX;
/* inseparable */
if (ch == 0x2024 /* ONE DOT LEADER */
|| ch == 0x2025 /* TWO DOT LEADER */
|| ch == 0x2026 /* HORIZONTAL ELLIPSIS */
|| ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
- attr |= 1 << LBP_IN;
+ attr |= (int64_t) 1 << LBP_IN;
/* non starter */
if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
|| ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
|| strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
|| strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
- attr |= 1 << LBP_NS;
+ attr |= (int64_t) 1 << LBP_NS;
/* opening punctuation */
if ((unicode_attributes[ch].category[0] == 'P'
&& unicode_attributes[ch].category[1] == 's')
|| ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
|| ch == 0x00BF /* INVERTED QUESTION MARK */
- || ch == 0x2E18 /* INVERTED INTERROBANG */)
- attr |= 1 << LBP_OP;
+ || ch == 0x2E18 /* INVERTED INTERROBANG */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
+ || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
+ || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
+ || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
+ || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
+ || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
+ attr |= (int64_t) 1 << LBP_OP;
/* ambiguous quotation */
if ((unicode_attributes[ch].category[0] == 'P'
|| ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
|| ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
|| ch == 0x2E0B /* RAISED SQUARE */)
- attr |= 1 << LBP_QU;
+ attr |= (int64_t) 1 << LBP_QU;
/* infix separator (numeric) */
if (ch == 0x002C /* COMMA */
|| ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
|| ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
|| ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
- attr |= 1 << LBP_IS;
+ attr |= (int64_t) 1 << LBP_IS;
/* numeric */
if ((unicode_attributes[ch].category[0] == 'N'
&& strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
|| ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
|| ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
- attr |= 1 << LBP_NU;
+ attr |= (int64_t) 1 << LBP_NU;
/* postfix (numeric) */
if (ch == 0x0025 /* PERCENT SIGN */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
|| ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
- || ch == 0x0D79 /* MALAYALAM DATE MARK */)
- attr |= 1 << LBP_PO;
+ || ch == 0x09F2 /* BENGALI RUPEE MARK */
+ || ch == 0x09F3 /* BENGALI RUPEE SIGN */
+ || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
+ || ch == 0x0D79 /* MALAYALAM DATE MARK */
+ || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
+ || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
+ attr |= (int64_t) 1 << LBP_PO;
/* prefix (numeric) */
if ((unicode_attributes[ch].category[0] == 'S'
|| ch == 0x2116 /* NUMERO SIGN */
|| ch == 0x2212 /* MINUS SIGN */
|| ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
- if (!(attr & (1 << LBP_PO)))
- attr |= 1 << LBP_PR;
+ if (!(attr & ((int64_t) 1 << LBP_PO)))
+ attr |= (int64_t) 1 << LBP_PR;
/* symbols allowing breaks */
if (ch == 0x002F /* SOLIDUS */)
- attr |= 1 << LBP_SY;
+ attr |= (int64_t) 1 << LBP_SY;
if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
- attr |= 1 << LBP_H2;
+ attr |= (int64_t) 1 << LBP_H2;
if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
- attr |= 1 << LBP_H3;
+ attr |= (int64_t) 1 << LBP_H3;
- if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
- attr |= 1 << LBP_JL;
+ if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
+ attr |= (int64_t) 1 << LBP_JL;
- if (ch >= 0x1160 && ch <= 0x11A2)
- attr |= 1 << LBP_JV;
+ if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
+ attr |= (int64_t) 1 << LBP_JV;
- if (ch >= 0x11A8 && ch <= 0x11F9)
- attr |= 1 << LBP_JT;
+ if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
+ attr |= (int64_t) 1 << LBP_JT;
/* complex context (South East Asian) */
if (((unicode_attributes[ch].category[0] == 'C'
|| unicode_attributes[ch].category[1] == 'o'))
|| (unicode_attributes[ch].category[0] == 'M'
&& (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'n'))
+ || unicode_attributes[ch].category[1] == 'n')
+ && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
|| ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
+ || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
|| ch == 0x19DE /* NEW TAI LUE SIGN LAE */
- || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
- && ((ch >= 0x0E00 && ch <= 0x0EFF)
- || (ch >= 0x1000 && ch <= 0x109F)
- || (ch >= 0x1780 && ch <= 0x17FF)
- || (ch >= 0x1950 && ch <= 0x19DF)))
- attr |= 1 << LBP_SA;
+ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
+ || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
+ || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
+ || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
+ && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
+ || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
+ || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
+ || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
+ || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
+ || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
+ attr |= (int64_t) 1 << LBP_SA;
/* attached characters and combining marks */
if ((unicode_attributes[ch].category[0] == 'M'
|| unicode_attributes[ch].category[1] == 'n'))
|| (unicode_attributes[ch].category[0] == 'C'
&& (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'f')))
- if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
- attr |= 1 << LBP_CM;
+ || unicode_attributes[ch].category[1] == 'f')
+ && ch != 0x110BD /* KAITHI NUMBER SIGN */))
+ if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+ attr |= (int64_t) 1 << LBP_CM;
/* ideographic */
if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
|| ch == 0x3000 /* IDEOGRAPHIC SPACE */
|| (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
|| (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
- || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
- || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
+ || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
|| (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
|| (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
|| (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
|| (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
|| strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
|| (ch >= 0x3000 && ch <= 0x33FF
- && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
+ && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
|| ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
|| ch == 0xFF5E /* FULLWIDTH TILDE */
|| ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
|| ch == 0xFFE3 /* FULLWIDTH MACRON */
- || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
- if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
+ || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
+ || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
+ || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
+ if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
{
/* ambiguous (ideograph) ? */
if ((unicode_width[ch] != NULL
&& ch >= 0x2000)
|| ch == 0x24EA /* CIRCLED DIGIT ZERO */
|| (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
- attr |= 1 << LBP_AI;
+ attr |= (int64_t) 1 << LBP_AI;
else
- attr |= 1 << LBP_ID;
+ attr |= (int64_t) 1 << LBP_ID;
}
/* ordinary alphabetic and symbol characters */
|| ch == 0x2061 /* FUNCTION APPLICATION */
|| ch == 0x2062 /* INVISIBLE TIMES */
|| ch == 0x2063 /* INVISIBLE SEPARATOR */
- || ch == 0x2064 /* INVISIBLE PLUS */)
- if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
+ || ch == 0x2064 /* INVISIBLE PLUS */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x110BD /* KAITHI NUMBER SIGN */)
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
{
/* ambiguous (alphabetic) ? */
if ((unicode_width[ch] != NULL
&& ch != 0x21E7 /* UPWARDS WHITE ARROW */
&& ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
&& ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
-#if !REVISION_22
- || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
|| ch == 0x00A7 /* SECTION SIGN */
|| ch == 0x00A8 /* DIAERESIS */
|| ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
|| ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
|| ch == 0x00BD /* VULGAR FRACTION ONE HALF */
|| ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
- || ch == 0x00BF /* INVERTED QUESTION MARK */
|| ch == 0x00D7 /* MULTIPLICATION SIGN */
|| ch == 0x00F7 /* DIVISION SIGN */
|| ch == 0x02C7 /* CARON */
|| ch == 0x02DA /* RING ABOVE */
|| ch == 0x02DB /* OGONEK */
|| ch == 0x02DD /* DOUBLE ACUTE ACCENT */
-#endif
|| ch == 0x24EA /* CIRCLED DIGIT ZERO */
|| (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
|| ch == 0x2616 /* WHITE SHOGI PIECE */
|| ch == 0x2617 /* BLACK SHOGI PIECE */)
- attr |= 1 << LBP_AI;
+ attr |= (int64_t) 1 << LBP_AI;
else
- attr |= 1 << LBP_AL;
- attr &= ~(1 << LBP_CM);
+ attr |= (int64_t) 1 << LBP_AL;
+ attr &= ~((int64_t) 1 << LBP_CM);
}
}
+ else
+ {
+ /* Unassigned character. */
+ if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
+ || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+ || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
+ || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
+ attr |= (int64_t) 1 << LBP_ID;
+ }
if (attr == 0)
/* unknown */
- attr |= 1 << LBP_XX;
+ attr |= (int64_t) 1 << LBP_XX;
return attr;
}
for (i = 0; i < 0x110000; i++)
{
- int attr = get_lbp (i);
- if (attr != 1 << LBP_XX)
+ int64_t attr = get_lbp (i);
+ if (attr != (int64_t) 1 << LBP_XX)
{
fprintf (stream, "0x%04X", i);
#define PRINT_BIT(attr,bit) \
- if (attr & (1 << bit)) fprintf (stream, " " #bit);
+ if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
PRINT_BIT(attr,LBP_BK);
PRINT_BIT(attr,LBP_CM);
PRINT_BIT(attr,LBP_WJ);
PRINT_BIT(attr,LBP_HY);
PRINT_BIT(attr,LBP_CB);
PRINT_BIT(attr,LBP_CL);
+ PRINT_BIT(attr,LBP_CP);
PRINT_BIT(attr,LBP_EX);
PRINT_BIT(attr,LBP_IN);
PRINT_BIT(attr,LBP_NS);
TRY(LBP_HY)
TRY(LBP_CB)
TRY(LBP_CL)
+ TRY(LBP_CP)
TRY(LBP_EX)
TRY(LBP_IN)
TRY(LBP_NS)
unicode_org_lbp[i] = value;
}
}
+
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
PRINT_BIT(attr,LBP_HY);
PRINT_BIT(attr,LBP_CB);
PRINT_BIT(attr,LBP_CL);
+ PRINT_BIT(attr,LBP_CP);
PRINT_BIT(attr,LBP_EX);
PRINT_BIT(attr,LBP_IN);
PRINT_BIT(attr,LBP_NS);
for (i = 0; i < 0x110000; i++)
{
- int attr = get_lbp (i);
+ int64_t attr = get_lbp (i);
/* Now attr should contain exactly one bit. */
if (attr == 0 || ((attr & (attr - 1)) != 0))
abort ();
- if (attr != 1 << LBP_XX)
+ if (attr != (int64_t) 1 << LBP_XX)
{
unsigned int log2_attr;
for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
CASE(LBP_HY);
CASE(LBP_CB);
CASE(LBP_CL);
+ CASE(LBP_CP);
CASE(LBP_EX);
CASE(LBP_IN);
CASE(LBP_NS);
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
fprintf (stream, "\n");
/* ========================================================================= */
-/* Word break property. */
+/* Word break property.
+ Updated for Unicode TR #29 revision 17. */
/* Possible values of the Word_Break property. */
enum
if (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Cf") == 0
- && ch != 0x200C && ch != 0x200D)
+ && ch != 0x200B && ch != 0x200C && ch != 0x200D)
attr |= 1 << WBP_FORMAT;
if ((unicode_scripts[ch] < numscripts
&& (attr & (1 << WBP_EXTEND)) == 0)
attr |= 1 << WBP_ALETTER;
- if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
- || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
+ if (is_WBP_MIDNUMLET (ch))
attr |= 1 << WBP_MIDNUMLET;
- if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
- || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
+ if (is_WBP_MIDLETTER (ch))
attr |= 1 << WBP_MIDLETTER;
if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
fprintf (stream, "\n");
/* ========================================================================= */
+/* Grapheme break property.
+ Updated for Unicode TR #29 revision 17. */
+
+/* Possible values of the Grapheme_Cluster_Break property.
+   GBP_OTHER is the value given to every code point that
+   GraphemeBreakProperty.txt does not mention (see fill_org_gbp).
+   All values must stay below 16: output_gbp_table stores two of them
+   per byte, one in each nibble. */
+enum
+{
+ GBP_OTHER = 0, /* default; also the DEFAULT element of gbp_table */
+ GBP_CR = 1, /* "CR" in GraphemeBreakProperty.txt */
+ GBP_LF = 2, /* "LF" */
+ GBP_CONTROL = 3, /* "Control" */
+ GBP_EXTEND = 4, /* "Extend" */
+ GBP_PREPEND = 5, /* "Prepend" */
+ GBP_SPACINGMARK = 6, /* "SpacingMark" */
+ GBP_L = 7, /* "L" */
+ GBP_V = 8, /* "V" */
+ GBP_T = 9, /* "T" */
+ GBP_LV = 10, /* "LV" */
+ GBP_LVT = 11 /* "LVT" */
+};
+
+/* Construction of sparse 3-level tables.
+   The macros below parameterize the generic code in 3level.h.  The rest of
+   this file uses the result as 'struct gbp_table' together with
+   gbp_table_init, gbp_table_add and gbp_table_finalize. */
+#define TABLE gbp_table /* name of the generated table type and function prefix */
+#define ELEMENT unsigned char /* type of one stored value (a GBP_* constant) */
+#define DEFAULT GBP_OTHER /* value of entries that were never added explicitly */
+#define xmalloc malloc /* NOTE(review): presumably 3level.h allocates through xmalloc/xrealloc;
+   here they are plain malloc/realloc, so no out-of-memory check is added by
+   this mapping -- confirm against 3level.h */
+#define xrealloc realloc
+#include "3level.h"
+
+/* The grapheme break property from the GraphemeBreakProperty.txt file.
+   Indexed by code point (0 .. 0x10FFFF); each element is one of the GBP_*
+   constants.  Filled in by fill_org_gbp, read by output_gbp_test and
+   output_gbp_table. */
+int unicode_org_gbp[0x110000];
+
+/* Output the unit test data for the grapheme break property.
+   Writes to FILENAME a C comment header followed by a comma-separated list
+   of initializers "{ 0xNNNN, GBP_xxx }", one per maximal run of consecutive
+   code points that share the same value in unicode_org_gbp[].  0xNNNN is
+   the code point just past the end of the run, not its start.
+   Exits with status 1 if FILENAME cannot be opened or written; aborts if
+   unicode_org_gbp[] holds a value that is not one of the GBP_* constants. */
+static void
+output_gbp_test (const char *filename)
+{
+ FILE *stream;
+ bool need_comma; /* true once the first entry has been written */
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Test the Unicode grapheme break property functions.\n"); /* this generated comment
+   stays open until the licence notice below ends */
+ fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
+
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ int gbp = unicode_org_gbp[ch]; /* value shared by the whole run starting at ch */
+ const char *gbp_string;
+
+ while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp) /* move ch forward to the last
+   code point of the run; the for loop's ch++ then steps to the next run */
+ ch++;
+
+ switch (gbp) /* map the numeric value to the spelling of its enum constant */
+ {
+#define CASE(x) case x: gbp_string = #x; break;
+ CASE (GBP_OTHER)
+ CASE (GBP_CR)
+ CASE (GBP_LF)
+ CASE (GBP_CONTROL)
+ CASE (GBP_EXTEND)
+ CASE (GBP_PREPEND)
+ CASE (GBP_SPACINGMARK)
+ CASE (GBP_L)
+ CASE (GBP_V)
+ CASE (GBP_T)
+ CASE (GBP_LV)
+ CASE (GBP_LVT)
+#undef CASE
+ default:
+ abort ();
+ }
+
+ if (need_comma) /* separator goes before every entry except the first */
+ fprintf (stream, ",\n");
+ fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string); /* ch + 1 is the exclusive upper
+   bound of the run */
+
+ need_comma = true;
+ }
+ fprintf (stream, "\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Output the per-character grapheme break property table.
+   Builds a 3-level table from unicode_org_gbp[] and writes it to FILENAME
+   as C source: five "#define gbrkprop_header_N" lines followed by a static
+   const struct named unigbrkprop with members level1, level2 and level3.
+   VERSION is only used in the "Generated automatically ..." comment.
+   Level-1 and level-2 entries are emitted as indices into the next level
+   (or -1 when the in-memory offset is 0); level-3 entries are emitted two
+   per byte.  Exits with status 1 if FILENAME cannot be opened or written. */
+static void
+output_gbp_table (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch, i;
+ struct gbp_table t;
+ unsigned int level1_offset, level2_offset, level3_offset; /* byte offsets into t.result */
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ t.p = 7; /* NOTE(review): p and q are consumed by the 3level.h code.  Judging by the
+   array sizes emitted below, a level-3 block holds 1 << p entries and a
+   level-2 block 1 << q entries -- confirm against 3level.h */
+ t.q = 9;
+ gbp_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
+ gbp_table_add (&t, ch, unicode_org_gbp[ch]);
+
+ gbp_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process.
+   As computed here, t.result starts with 5 uint32_t header words, followed
+   by t.level1_size uint32_t level-1 entries, then
+   (t.level2_size << t.q) uint32_t level-2 entries, then the level-3 data. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++) /* the five header words become preprocessor constants */
+ fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]); /* NOTE(review): both arguments are unsigned but are
+   printed with %d; harmless only while the values stay below INT_MAX */
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
+ t.level3_size, t.p); /* halved because two values share one byte */
+ fprintf (stream, " }\n");
+ fprintf (stream, "unigbrkprop =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {"); /* level1 initializer, 8 entries per output line */
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0) /* offset 0 is written out as -1 */
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t)); /* byte offset -> index into level2[] */
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {"); /* level2 initializer, 8 entries per output line */
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0) /* offset 0 is written out as -1 */
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t) / 2); /* byte offset -> index into the
+   packed level3[], which has half as many bytes as there are values */
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {"); /* level3 initializer: one output byte per pair of values */
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) / 2; i++)
+ {
+ unsigned char *p = (unsigned char *) (t.result + level3_offset);
+ unsigned char value0 = p[i * 2]; /* even-indexed value, goes into the low nibble */
+ unsigned char value1 = p[i * 2 + 1]; /* odd-indexed value, goes into the high nibble */
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
+ (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
+ }
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Stores in unicode_org_gbp[] the grapheme breaking property from the
+ GraphemeBreakProperty.txt file.
+   Every element is first set to GBP_OTHER; then each data line of the form
+   "XXXX..YYYY ; Name" or "XXXX ; Name" assigns the GBP_* value for Name to
+   the code points XXXX..YYYY (or just XXXX).  Lines starting with '#' are
+   ignored.
+   Exits with status 1 if the file cannot be opened or read, if a line
+   cannot be parsed, or if Name is not a known property value; aborts if a
+   range is reversed or reaches beyond 0x10FFFF. */
+static void
+fill_org_gbp (const char *graphemebreakproperty_filename)
+{
+ unsigned int i;
+ FILE *stream;
+ int lineno = 0; /* only used in the "unknown property value" message */
+
+ for (i = 0; i < 0x110000; i++)
+ unicode_org_gbp[i] = GBP_OTHER;
+
+ stream = fopen (graphemebreakproperty_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n",
+ graphemebreakproperty_filename);
+ exit (1);
+ }
+
+ for (;;)
+ {
+ char buf[200+1]; /* one input line, at most 200 characters */
+ unsigned int i1, i2; /* first and last code point of the range */
+ char padding[200+1]; /* receives the run of spaces and ';' after the code points */
+ char propname[200+1];
+ int propvalue;
+
+ lineno++; /* NOTE(review): the trailing "\n" in the fscanf format below skips any run
+   of whitespace, so blank lines are consumed without being counted and
+   lineno can be smaller than the real line number */
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1) /* stop at end of input or when nothing could be read */
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#') /* skip comment lines */
+ continue;
+
+ if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) /* try the range form first */
+ {
+ if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) /* then the single code point form */
+ {
+ fprintf (stderr, "parse error in '%s'\n",
+ graphemebreakproperty_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
+#define PROP(name,value) \
+ if (strcmp (propname, name) == 0) propvalue = value; else
+ PROP ("CR", GBP_CR)
+ PROP ("LF", GBP_LF)
+ PROP ("Control", GBP_CONTROL)
+ PROP ("Extend", GBP_EXTEND)
+ PROP ("Prepend", GBP_PREPEND)
+ PROP ("SpacingMark", GBP_SPACINGMARK)
+ PROP ("L", GBP_L)
+ PROP ("V", GBP_V)
+ PROP ("T", GBP_T)
+ PROP ("LV", GBP_LV)
+ PROP ("LVT", GBP_LVT)
+#undef PROP
+ { /* final 'else' of the PROP chain: no name matched, so propvalue is never
+   used uninitialized */
+ fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
+ graphemebreakproperty_filename, lineno);
+ exit (1);
+ }
+ if (!(i1 <= i2 && i2 < 0x110000)) /* guards the array writes below */
+ abort ();
+
+ for (i = i1; i <= i2; i++)
+ unicode_org_gbp[i] = propvalue;
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* Composition and decomposition.
+ Updated for Unicode TR #15 revision 33. */
+
/* Maximum number of characters into which a single Unicode character can be
decomposed. */
#define MAX_DECOMP_LENGTH 18
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
fprintf (stream, "\n");
1527, which is quite good (60% filled). It requires an auxiliary table
lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
- fprintf (stream, "struct composition_rule { char codes[4]; };\n");
+ fprintf (stream, "struct composition_rule { char codes[6]; };\n");
fprintf (stream, "%%struct-type\n");
fprintf (stream, "%%language=ANSI-C\n");
fprintf (stream, "%%define slot-name codes\n");
if (strcmp (unicode_attributes[combined].combining, "0") != 0)
abort ();
- if (!(code1 < 0x10000))
- abort ();
- if (!(code2 < 0x10000))
- abort ();
- if (!(combined < 0x10000))
- abort ();
-
- fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
- (code1 >> 8) & 0xff, code1 & 0xff,
- (code2 >> 8) & 0xff, code2 & 0xff,
+ fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
+ (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
+ (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
combined);
}
}
fprintf (stream, " You should have received a copy of the GNU General Public License\n");
fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
fprintf (stream, "\n");
- fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
fprintf (stream, "\n");
fprintf (stream, "#include \"test-mapping-part1.h\"\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
t.p = 7;
const char *unicodedata_filename;
const char *proplist_filename;
const char *derivedproplist_filename;
+ const char *arabicshaping_filename;
const char *scripts_filename;
const char *blocks_filename;
const char *proplist30_filename;
const char *eastasianwidth_filename;
const char *linebreak_filename;
const char *wordbreakproperty_filename;
+ const char *graphemebreakproperty_filename;
const char *compositionexclusions_filename;
const char *specialcasing_filename;
const char *casefolding_filename;
const char *version;
- if (argc != 14)
+ if (argc != 16)
{
- fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
+ fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
argv[0]);
exit (1);
}
unicodedata_filename = argv[1];
proplist_filename = argv[2];
derivedproplist_filename = argv[3];
- scripts_filename = argv[4];
- blocks_filename = argv[5];
- proplist30_filename = argv[6];
- eastasianwidth_filename = argv[7];
- linebreak_filename = argv[8];
- wordbreakproperty_filename = argv[9];
- compositionexclusions_filename = argv[10];
- specialcasing_filename = argv[11];
- casefolding_filename = argv[12];
- version = argv[13];
+ arabicshaping_filename = argv[4];
+ scripts_filename = argv[5];
+ blocks_filename = argv[6];
+ proplist30_filename = argv[7];
+ eastasianwidth_filename = argv[8];
+ linebreak_filename = argv[9];
+ wordbreakproperty_filename = argv[10];
+ graphemebreakproperty_filename = argv[11];
+ compositionexclusions_filename = argv[12];
+ specialcasing_filename = argv[13];
+ casefolding_filename = argv[14];
+ version = argv[15];
fill_attributes (unicodedata_filename);
clear_properties ();
fill_properties (proplist_filename);
fill_properties (derivedproplist_filename);
fill_properties30 (proplist30_filename);
+ fill_arabicshaping (arabicshaping_filename);
fill_scripts (scripts_filename);
fill_blocks (blocks_filename);
fill_width (eastasianwidth_filename);
fill_org_lbp (linebreak_filename);
fill_org_wbp (wordbreakproperty_filename);
+ fill_org_gbp (graphemebreakproperty_filename);
fill_composition_exclusions (compositionexclusions_filename);
fill_casing_rules (specialcasing_filename);
fill_casefolding_rules (casefolding_filename);
output_categories (version);
output_category ("unictype/categ_of.h", version);
- output_combclass ("unictype/combining.h", version);
+ output_combclass ("unictype/combiningclass.h", version);
output_bidi_category ("unictype/bidi_of.h", version);
output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
output_decimal_digit ("unictype/decdigit.h", version);
output_numeric ("unictype/numeric.h", version);
output_mirror ("unictype/mirror.h", version);
output_properties (version);
+ output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
+ output_joining_type ("unictype/joiningtype_of.h", version);
+ output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
+ output_joining_group ("unictype/joininggroup_of.h", version);
+
output_scripts (version);
output_scripts_byname (version);
output_blocks (version);
output_ident_properties (version);
+ output_nonspacing_property ("uniwidth/width.c.part");
+ output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
output_old_ctype (version);
debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
+ output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
+ output_gbp_table ("unigbrk/gbrkprop.h", version);
+
output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
debug_output_composition_tables ("uninorm/composition.txt");
output_composition_tables ("uninorm/composition-table.gperf", version);
* compile-command: "
gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
./gen-uni-tables \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
- 5.1.0
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
+ 6.0.0 \
+ && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
+ && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
"
* End:
*/