X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=e63a18de8897dfc78c1916f3a370c1a36b221c54;hb=ff2f086830480dfa85d1e98c46f8566feb657deb;hp=23462386e6b7306a93a23adfd17517636a3029c0;hpb=04606dd55fd8f1a122a21c4b4e09ab2a36498385;p=gnulib.git diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 23462386e..e63a18de8 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -1,7 +1,7 @@ /* Generate Unicode conforming character classification tables and line break properties tables and word break property tables and decomposition/composition and case mapping tables from a UnicodeData file. - Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. + Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc. Written by Bruno Haible , 2000-2002. This program is free software: you can redistribute it and/or modify @@ -21,14 +21,18 @@ $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \ /usr/local/share/Unidata/PropList.txt \ /usr/local/share/Unidata/DerivedCoreProperties.txt \ + /usr/local/share/Unidata/ArabicShaping.txt \ /usr/local/share/Unidata/Scripts.txt \ /usr/local/share/Unidata/Blocks.txt \ /usr/local/share/Unidata/PropList-3.0.1.txt \ /usr/local/share/Unidata/EastAsianWidth.txt \ /usr/local/share/Unidata/LineBreak.txt \ /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/GraphemeBreakProperty.txt \ /usr/local/share/Unidata/CompositionExclusions.txt \ - 5.1.0 + /usr/local/share/Unidata/SpecialCasing.txt \ + /usr/local/share/Unidata/CaseFolding.txt \ + 6.0.0 */ #include @@ -72,13 +76,13 @@ struct unicode_attribute unicode_attributes [0x110000]; /* Stores in unicode_attributes[i] the values from the given fields. */ static void fill_attribute (unsigned int i, - const char *field1, const char *field2, - const char *field3, const char *field4, - const char *field5, const char *field6, - const char *field7, const char *field8, - const char *field9, const char *field10, - const char *field11, const char *field12, - const char *field13, const char *field14) + const char *field1, const char *field2, + const char *field3, const char *field4, + const char *field5, const char *field6, + const char *field7, const char *field8, + const char *field9, const char *field10, + const char *field11, const char *field12, + const char *field13, const char *field14) { struct unicode_attribute * uni; @@ -123,16 +127,16 @@ getfield (FILE *stream, char *buffer, int delim) for (; (c = getc (stream)), (c != EOF && c != delim); ) { /* The original unicode.org UnicodeData.txt file happens to have - CR/LF line terminators. Silently convert to LF. */ + CR/LF line terminators. Silently convert to LF. */ if (c == '\r') - continue; + continue; /* Put c into the buffer. */ if (++count >= FIELDLEN - 1) - { - fprintf (stderr, "field longer than expected, increase FIELDLEN\n"); - exit (1); - } + { + fprintf (stderr, "field longer than expected, increase FIELDLEN\n"); + exit (1); + } *buffer++ = c; } @@ -198,64 +202,65 @@ fill_attributes (const char *unicodedata_filename) n += getfield (stream, field13, ';'); n += getfield (stream, field14, '\n'); if (n == 0) - break; + break; if (n != 15) - { - fprintf (stderr, "short line in '%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } + { + fprintf (stderr, "short line in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } i = strtoul (field0, NULL, 16); if (field1[0] == '<' - && strlen (field1) >= 9 - && strcmp (field1 + strlen(field1) - 8, ", First>") == 0) - { - /* Deal with a range. */ - lineno++; - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ';'); - n += getfield (stream, field2, ';'); - n += getfield (stream, field3, ';'); - n += getfield (stream, field4, ';'); - n += getfield (stream, field5, ';'); - n += getfield (stream, field6, ';'); - n += getfield (stream, field7, ';'); - n += getfield (stream, field8, ';'); - n += getfield (stream, field9, ';'); - n += getfield (stream, field10, ';'); - n += getfield (stream, field11, ';'); - n += getfield (stream, field12, ';'); - n += getfield (stream, field13, ';'); - n += getfield (stream, field14, '\n'); - if (n != 15) - { - fprintf (stderr, "missing end range in '%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } - if (!(field1[0] == '<' - && strlen (field1) >= 8 - && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0)) - { - fprintf (stderr, "missing end range in '%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } - field1[strlen (field1) - 7] = '\0'; - j = strtoul (field0, NULL, 16); - for (; i <= j; i++) - fill_attribute (i, field1+1, field2, field3, field4, field5, - field6, field7, field8, field9, field10, - field11, field12, field13, field14); - } + && strlen (field1) >= 9 + && strcmp (field1 + strlen (field1) - 8, ", First>") == 0) + { + /* Deal with a range. */ + lineno++; + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ';'); + n += getfield (stream, field2, ';'); + n += getfield (stream, field3, ';'); + n += getfield (stream, field4, ';'); + n += getfield (stream, field5, ';'); + n += getfield (stream, field6, ';'); + n += getfield (stream, field7, ';'); + n += getfield (stream, field8, ';'); + n += getfield (stream, field9, ';'); + n += getfield (stream, field10, ';'); + n += getfield (stream, field11, ';'); + n += getfield (stream, field12, ';'); + n += getfield (stream, field13, ';'); + n += getfield (stream, field14, '\n'); + if (n != 15) + { + fprintf (stderr, "missing end range in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } + if (!(field1[0] == '<' + && strlen (field1) >= 8 + && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0)) + { + fprintf (stderr, "missing end range in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } + field1[strlen (field1) - 7] = '\0'; + j = strtoul (field0, NULL, 16); + for (; i <= j; i++) + fill_attribute (i, field1+1, field2, field3, field4, field5, + field6, field7, field8, field9, field10, + field11, field12, field13, field14); + } else - { - /* Single character line */ - fill_attribute (i, field1, field2, field3, field4, field5, - field6, field7, field8, field9, field10, - field11, field12, field13, field14); - } + { + /* Single character line */ + fill_attribute (i, field1, field2, field3, field4, field5, + field6, field7, field8, field9, field10, + field11, field12, field13, field14); + } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); @@ -273,265 +278,276 @@ static bool is_category_L (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L'); + && unicode_attributes[ch].category[0] == 'L'); +} + +static bool +is_category_LC (unsigned int ch) +{ + /* See PropertyValueAliases.txt. */ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't')); } static bool is_category_Lu (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 'u'); + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'u'); } static bool is_category_Ll (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 'l'); + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_Lt (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 't'); + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 't'); } static bool is_category_Lm (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 'm'); + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'm'); } static bool is_category_Lo (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 'o'); + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_M (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'M'); + && unicode_attributes[ch].category[0] == 'M'); } static bool is_category_Mn (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'M' - && unicode_attributes[ch].category[1] == 'n'); + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'n'); } static bool is_category_Mc (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'M' - && unicode_attributes[ch].category[1] == 'c'); + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Me (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'M' - && unicode_attributes[ch].category[1] == 'e'); + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'e'); } static bool is_category_N (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'N'); + && unicode_attributes[ch].category[0] == 'N'); } static bool is_category_Nd (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'd'); + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd'); } static bool is_category_Nl (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'l'); + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_No (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'o'); + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_P (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P'); + && unicode_attributes[ch].category[0] == 'P'); } static bool is_category_Pc (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'c'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Pd (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'd'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'd'); } static bool is_category_Ps (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 's'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 's'); } static bool is_category_Pe (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'e'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'e'); } static bool is_category_Pi (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'i'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'i'); } static bool is_category_Pf (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'f'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'f'); } static bool is_category_Po (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'o'); + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_S (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'S'); + && unicode_attributes[ch].category[0] == 'S'); } static bool is_category_Sm (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'm'); + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'm'); } static bool is_category_Sc (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'c'); + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Sk (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'k'); + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'k'); } static bool is_category_So (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'o'); + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_Z (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z'); + && unicode_attributes[ch].category[0] == 'Z'); } static bool is_category_Zs (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && unicode_attributes[ch].category[1] == 's'); + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 's'); } static bool is_category_Zl (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && unicode_attributes[ch].category[1] == 'l'); + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_Zp (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && unicode_attributes[ch].category[1] == 'p'); + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 'p'); } static bool is_category_C (unsigned int ch) { return (unicode_attributes[ch].name == NULL - || unicode_attributes[ch].category[0] == 'C'); + || unicode_attributes[ch].category[0] == 'C'); } static bool is_category_Cc (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'C' - && unicode_attributes[ch].category[1] == 'c'); + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Cf (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'C' - && unicode_attributes[ch].category[1] == 'f'); + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'f'); } static bool @@ -544,15 +560,15 @@ static bool is_category_Co (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'C' - && unicode_attributes[ch].category[1] == 'o'); + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_Cn (unsigned int ch) { return (unicode_attributes[ch].name == NULL - && !(ch >= 0xd800 && ch < 0xe000)); + && !(ch >= 0xd800 && ch < 0xe000)); } /* Output a boolean property in a human readable format. */ @@ -573,22 +589,22 @@ debug_output_predicate (const char *filename, bool (*predicate) (unsigned int)) for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { - fprintf (stream, "0x%04X\n", ch); + fprintf (stream, "0x%04X\n", ch); } #else for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { - unsigned int first = ch; - unsigned int last; - - while (ch + 1 < 0x110000 && predicate (ch + 1)) - ch++; - last = ch; - if (first < last) - fprintf (stream, "0x%04X..0x%04X\n", first, last); - else - fprintf (stream, "0x%04X\n", ch); + unsigned int first = ch; + unsigned int last; + + while (ch + 1 < 0x110000 && predicate (ch + 1)) + ch++; + last = ch; + if (first < last) + fprintf (stream, "0x%04X..0x%04X\n", first, last); + else + fprintf (stream, "0x%04X\n", ch); } #endif @@ -638,16 +654,16 @@ output_predicate_test (const char *filename, bool (*predicate) (unsigned int), c for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { - unsigned int first = ch; - unsigned int last; - - while (ch + 1 < 0x110000 && predicate (ch + 1)) - ch++; - last = ch; - if (need_comma) - fprintf (stream, ",\n"); - fprintf (stream, " { 0x%04X, 0x%04X }", first, last); - need_comma = true; + unsigned int first = ch; + unsigned int last; + + while (ch + 1 < 0x110000 && predicate (ch + 1)) + ch++; + last = ch; + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", first, last); + need_comma = true; } if (need_comma) fprintf (stream, "\n"); @@ -687,8 +703,8 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* %s of Unicode characters. */\n", comment); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 4; /* or: 5 */ t.q = 7; /* or: 6 */ @@ -714,7 +730,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const for (i = 0; i < 5; i++) if (i != 1) fprintf (stream, "#define header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); @@ -734,13 +750,13 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const { uint32_t offset; if (i > 0 && (i % 1) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu", - 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu", + 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } @@ -754,15 +770,15 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const { uint32_t offset; if (i > 0 && (i % 1) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", - 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", + 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 1) fprintf (stream, "\n "); @@ -773,11 +789,11 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 4) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%08X", - ((uint32_t *) (t.result + level3_offset))[i]); + ((uint32_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << t.p > 4) fprintf (stream, "\n "); @@ -800,6 +816,7 @@ output_categories (const char *version) output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \ output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version); CATEGORY (L) + CATEGORY (LC) CATEGORY (Lu) CATEGORY (Ll) CATEGORY (Lt) @@ -842,6 +859,7 @@ output_categories (const char *version) enum { UC_CATEGORY_MASK_L = 0x0000001f, + UC_CATEGORY_MASK_LC = 0x00000007, UC_CATEGORY_MASK_Lu = 0x00000001, UC_CATEGORY_MASK_Ll = 0x00000002, UC_CATEGORY_MASK_Lt = 0x00000004, @@ -888,77 +906,78 @@ general_category_byname (const char *category_name) switch (category_name[0]) { case 'L': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_L; - case 'u': return UC_CATEGORY_MASK_Lu; - case 'l': return UC_CATEGORY_MASK_Ll; - case 't': return UC_CATEGORY_MASK_Lt; - case 'm': return UC_CATEGORY_MASK_Lm; - case 'o': return UC_CATEGORY_MASK_Lo; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_L; + case 'C': return UC_CATEGORY_MASK_LC; + case 'u': return UC_CATEGORY_MASK_Lu; + case 'l': return UC_CATEGORY_MASK_Ll; + case 't': return UC_CATEGORY_MASK_Lt; + case 'm': return UC_CATEGORY_MASK_Lm; + case 'o': return UC_CATEGORY_MASK_Lo; + } + break; case 'M': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_M; - case 'n': return UC_CATEGORY_MASK_Mn; - case 'c': return UC_CATEGORY_MASK_Mc; - case 'e': return UC_CATEGORY_MASK_Me; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_M; + case 'n': return UC_CATEGORY_MASK_Mn; + case 'c': return UC_CATEGORY_MASK_Mc; + case 'e': return UC_CATEGORY_MASK_Me; + } + break; case 'N': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_N; - case 'd': return UC_CATEGORY_MASK_Nd; - case 'l': return UC_CATEGORY_MASK_Nl; - case 'o': return UC_CATEGORY_MASK_No; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_N; + case 'd': return UC_CATEGORY_MASK_Nd; + case 'l': return UC_CATEGORY_MASK_Nl; + case 'o': return UC_CATEGORY_MASK_No; + } + break; case 'P': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_P; - case 'c': return UC_CATEGORY_MASK_Pc; - case 'd': return UC_CATEGORY_MASK_Pd; - case 's': return UC_CATEGORY_MASK_Ps; - case 'e': return UC_CATEGORY_MASK_Pe; - case 'i': return UC_CATEGORY_MASK_Pi; - case 'f': return UC_CATEGORY_MASK_Pf; - case 'o': return UC_CATEGORY_MASK_Po; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_P; + case 'c': return UC_CATEGORY_MASK_Pc; + case 'd': return UC_CATEGORY_MASK_Pd; + case 's': return UC_CATEGORY_MASK_Ps; + case 'e': return UC_CATEGORY_MASK_Pe; + case 'i': return UC_CATEGORY_MASK_Pi; + case 'f': return UC_CATEGORY_MASK_Pf; + case 'o': return UC_CATEGORY_MASK_Po; + } + break; case 'S': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_S; - case 'm': return UC_CATEGORY_MASK_Sm; - case 'c': return UC_CATEGORY_MASK_Sc; - case 'k': return UC_CATEGORY_MASK_Sk; - case 'o': return UC_CATEGORY_MASK_So; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_S; + case 'm': return UC_CATEGORY_MASK_Sm; + case 'c': return UC_CATEGORY_MASK_Sc; + case 'k': return UC_CATEGORY_MASK_Sk; + case 'o': return UC_CATEGORY_MASK_So; + } + break; case 'Z': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_Z; - case 's': return UC_CATEGORY_MASK_Zs; - case 'l': return UC_CATEGORY_MASK_Zl; - case 'p': return UC_CATEGORY_MASK_Zp; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_Z; + case 's': return UC_CATEGORY_MASK_Zs; + case 'l': return UC_CATEGORY_MASK_Zl; + case 'p': return UC_CATEGORY_MASK_Zp; + } + break; case 'C': - switch (category_name[1]) - { - case '\0': return UC_CATEGORY_MASK_C; - case 'c': return UC_CATEGORY_MASK_Cc; - case 'f': return UC_CATEGORY_MASK_Cf; - case 's': return UC_CATEGORY_MASK_Cs; - case 'o': return UC_CATEGORY_MASK_Co; - case 'n': return UC_CATEGORY_MASK_Cn; - } - break; + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_C; + case 'c': return UC_CATEGORY_MASK_Cc; + case 'f': return UC_CATEGORY_MASK_Cf; + case 's': return UC_CATEGORY_MASK_Cs; + case 'o': return UC_CATEGORY_MASK_Co; + case 'n': return UC_CATEGORY_MASK_Cn; + } + break; } /* Invalid category name. */ abort (); @@ -991,8 +1010,8 @@ output_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -1004,15 +1023,15 @@ output_category (const char *filename, const char *version) unsigned int log2_value; if (is_category_Cs (ch)) - value = UC_CATEGORY_MASK_Cs; + value = UC_CATEGORY_MASK_Cs; else if (unicode_attributes[ch].name != NULL) - value = general_category_byname (unicode_attributes[ch].category); + value = general_category_byname (unicode_attributes[ch].category); else - continue; + continue; /* Now value should contain exactly one bit. */ if (value == 0 || ((value & (value - 1)) != 0)) - abort (); + abort (); for (log2_value = 0; value > 1; value >>= 1, log2_value++); @@ -1034,14 +1053,14 @@ output_category (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define category_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, - (1 << t.p) * 5 / 16); + (1 << t.p) * 5 / 16); fprintf (stream, " }\n"); fprintf (stream, "u_category =\n"); fprintf (stream, "{\n"); @@ -1052,15 +1071,15 @@ output_category (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -1072,15 +1091,15 @@ output_category (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -1105,10 +1124,10 @@ output_category (const char *filename, const char *version) for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%04x", level3_packed[i]); if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1) - fprintf (stream, ","); + fprintf (stream, ","); } if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) fprintf (stream, "\n "); @@ -1155,8 +1174,8 @@ output_combclass (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Combining class of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -1165,10 +1184,10 @@ output_combclass (const char *filename, const char *version) for (ch = 0; ch < 0x110000; ch++) if (unicode_attributes[ch].name != NULL) { - int value = atoi (unicode_attributes[ch].combining); + int value = atoi (unicode_attributes[ch].combining); if (!(value >= 0 && value <= 255)) - abort (); - combclass_table_add (&t, ch, value); + abort (); + combclass_table_add (&t, ch, value); } combclass_table_finalize (&t); @@ -1186,7 +1205,7 @@ output_combclass (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define combclass_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); @@ -1203,15 +1222,15 @@ output_combclass (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -1223,15 +1242,15 @@ output_combclass (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -1242,10 +1261,10 @@ output_combclass (const char *filename, const char *version) for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << t.p > 8) fprintf (stream, "\n "); @@ -1295,143 +1314,143 @@ bidi_category_byname (const char *category_name) { case 'A': switch (category_name[1]) - { - case 'L': - if (category_name[2] == '\0') - return UC_BIDI_AL; - break; - case 'N': - if (category_name[2] == '\0') - return UC_BIDI_AN; - break; - } + { + case 'L': + if (category_name[2] == '\0') + return UC_BIDI_AL; + break; + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_AN; + break; + } break; case 'B': switch (category_name[1]) - { - case '\0': - return UC_BIDI_B; - case 'N': - if (category_name[2] == '\0') - return UC_BIDI_BN; - break; - } + { + case '\0': + return UC_BIDI_B; + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_BN; + break; + } break; case 'C': switch (category_name[1]) - { - case 'S': - if (category_name[2] == '\0') - return UC_BIDI_CS; - break; - } + { + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_CS; + break; + } break; case 'E': switch (category_name[1]) - { - case 'N': - if (category_name[2] == '\0') - return UC_BIDI_EN; - break; - case 'S': - if (category_name[2] == '\0') - return UC_BIDI_ES; - break; - case 'T': - if (category_name[2] == '\0') - return UC_BIDI_ET; - break; - } + { + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_EN; + break; + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_ES; + break; + case 'T': + if (category_name[2] == '\0') + return UC_BIDI_ET; + break; + } break; case 'L': switch (category_name[1]) - { - case '\0': - return UC_BIDI_L; - case 'R': - switch (category_name[2]) - { - case 'E': - if (category_name[3] == '\0') - return UC_BIDI_LRE; - break; - case 'O': - if (category_name[3] == '\0') - return UC_BIDI_LRO; - break; - } - break; - } + { + case '\0': + return UC_BIDI_L; + case 'R': + switch (category_name[2]) + { + case 'E': + if (category_name[3] == '\0') + return UC_BIDI_LRE; + break; + case 'O': + if (category_name[3] == '\0') + return UC_BIDI_LRO; + break; + } + break; + } break; case 'N': switch (category_name[1]) - { - case 'S': - switch (category_name[2]) - { - case 'M': - if (category_name[3] == '\0') - return UC_BIDI_NSM; - break; - } - break; - } + { + case 'S': + switch (category_name[2]) + { + case 'M': + if (category_name[3] == '\0') + return UC_BIDI_NSM; + break; + } + break; + } break; case 'O': switch (category_name[1]) - { - case 'N': - if (category_name[2] == '\0') - return UC_BIDI_ON; - break; - } + { + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_ON; + break; + } break; case 'P': switch (category_name[1]) - { - case 'D': - switch (category_name[2]) - { - case 'F': - if (category_name[3] == '\0') - return UC_BIDI_PDF; - break; - } - break; - } + { + case 'D': + switch (category_name[2]) + { + case 'F': + if (category_name[3] == '\0') + return UC_BIDI_PDF; + break; + } + break; + } break; case 'R': switch (category_name[1]) - { - case '\0': - return UC_BIDI_R; - case 'L': - switch (category_name[2]) - { - case 'E': - if (category_name[3] == '\0') - return UC_BIDI_RLE; - break; - case 'O': - if (category_name[3] == '\0') - return UC_BIDI_RLO; - break; - } - break; - } + { + case '\0': + return UC_BIDI_R; + case 'L': + switch (category_name[2]) + { + case 'E': + if (category_name[3] == '\0') + return UC_BIDI_RLE; + break; + case 'O': + if (category_name[3] == '\0') + return UC_BIDI_RLO; + break; + } + break; + } break; case 'S': if (category_name[1] == '\0') - return UC_BIDI_S; + return UC_BIDI_S; break; case 'W': switch (category_name[1]) - { - case 'S': - if (category_name[2] == '\0') - return UC_BIDI_WS; - break; - } + { + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_WS; + break; + } break; } /* Invalid bidi category name. */ @@ -1446,25 +1465,25 @@ get_bidi_category (unsigned int ch) else { /* The bidi category of unassigned characters depends on the range. - See UTR #9 and DerivedBidiClass.txt. */ + See UTR #9 and DerivedBidiClass.txt. */ if ((ch >= 0x0590 && ch <= 0x05FF) - || (ch >= 0x07FB && ch <= 0x08FF) - || (ch >= 0xFB37 && ch <= 0xFB45) - || (ch >= 0x10800 && ch <= 0x10FFF)) - return UC_BIDI_R; + || (ch >= 0x07FB && ch <= 0x08FF) + || (ch >= 0xFB37 && ch <= 0xFB45) + || (ch >= 0x10800 && ch <= 0x10FFF)) + return UC_BIDI_R; else if ((ch >= 0x0600 && ch <= 0x07BF) - || (ch >= 0x2064 && ch <= 0x2069) - || (ch >= 0xFBB2 && ch <= 0xFDCF) - || (ch >= 0xFDFE && ch <= 0xFEFE)) - return UC_BIDI_AL; + || (ch >= 0x2064 && ch <= 0x2069) + || (ch >= 0xFBB2 && ch <= 0xFDCF) + || (ch >= 0xFDFE && ch <= 0xFEFE)) + return UC_BIDI_AL; else if ((ch >= 0xFDD0 && ch <= 0xFDEF) - || (ch >= 0xFFF0 && ch <= 0xFFFF) - || (ch & 0xFFFF) == 0xFFFE - || (ch & 0xFFFF) == 0xFFFF - || (ch >= 0xE0000 && ch <= 0xE0FFF)) - return UC_BIDI_BN; + || (ch >= 0xFFF0 && ch <= 0xFFFF) + || (ch & 0xFFFF) == 0xFFFE + || (ch & 0xFFFF) == 0xFFFF + || (ch >= 0xE0000 && ch <= 0xE0FFF)) + return UC_BIDI_BN; else - return UC_BIDI_L; + return UC_BIDI_L; } } @@ -1495,8 +1514,8 @@ output_bidi_category (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Bidi categories of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -1524,14 +1543,14 @@ output_bidi_category (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define bidi_category_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, - (1 << t.p) * 5 / 16); + (1 << t.p) * 5 / 16); fprintf (stream, " }\n"); fprintf (stream, "u_bidi_category =\n"); fprintf (stream, "{\n"); @@ -1542,15 +1561,15 @@ output_bidi_category (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -1562,15 +1581,15 @@ output_bidi_category (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -1595,10 +1614,10 @@ output_bidi_category (const char *filename, const char *version) for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%04x", level3_packed[i]); if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1) - fprintf (stream, ","); + fprintf (stream, ","); } if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) fprintf (stream, "\n "); @@ -1652,8 +1671,8 @@ output_decimal_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); need_comma = false; for (ch = 0; ch < 0x110000; ch++) @@ -1661,15 +1680,15 @@ output_decimal_digit_test (const char *filename, const char *version) int value = get_decdigit_value (ch); if (!(value >= -1 && value < 10)) - abort (); + abort (); if (value >= 0) - { - if (need_comma) - fprintf (stream, ",\n"); - fprintf (stream, " { 0x%04X, %d }", ch, value); - need_comma = true; - } + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d }", ch, value); + need_comma = true; + } } if (need_comma) fprintf (stream, "\n"); @@ -1699,8 +1718,8 @@ output_decimal_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -1711,7 +1730,7 @@ output_decimal_digit (const char *filename, const char *version) int value = 1 + get_decdigit_value (ch); if (!(value >= 0 && value <= 10)) - abort (); + abort (); decdigit_table_add (&t, ch, value); } @@ -1731,14 +1750,14 @@ output_decimal_digit (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define decdigit_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, - t.p - 1); + t.p - 1); fprintf (stream, " }\n"); fprintf (stream, "u_decdigit =\n"); fprintf (stream, "{\n"); @@ -1749,15 +1768,15 @@ output_decimal_digit (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -1769,15 +1788,15 @@ output_decimal_digit (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -1789,12 +1808,12 @@ output_decimal_digit (const char *filename, const char *version) for (i = 0; i < t.level3_size << (t.p - 1); i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%02x", - ((uint8_t *) (t.result + level3_offset))[2*i] - + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); + ((uint8_t *) (t.result + level3_offset))[2*i] + + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); if (i+1 < t.level3_size << (t.p - 1)) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << (t.p - 1) > 8) fprintf (stream, "\n "); @@ -1839,8 +1858,8 @@ output_digit_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); need_comma = false; for (ch = 0; ch < 0x110000; ch++) @@ -1848,15 +1867,15 @@ output_digit_test (const char *filename, const char *version) int value = get_digit_value (ch); if (!(value >= -1 && value < 10)) - abort (); + abort (); if (value >= 0) - { - if (need_comma) - fprintf (stream, ",\n"); - fprintf (stream, " { 0x%04X, %d }", ch, value); - need_comma = true; - } + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d }", ch, value); + need_comma = true; + } } if (need_comma) fprintf (stream, "\n"); @@ -1886,8 +1905,8 @@ output_digit (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Digit values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -1898,7 +1917,7 @@ output_digit (const char *filename, const char *version) int value = 1 + get_digit_value (ch); if (!(value >= 0 && value <= 10)) - abort (); + abort (); decdigit_table_add (&t, ch, value); } @@ -1918,14 +1937,14 @@ output_digit (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define digit_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, - t.p - 1); + t.p - 1); fprintf (stream, " }\n"); fprintf (stream, "u_digit =\n"); fprintf (stream, "{\n"); @@ -1936,15 +1955,15 @@ output_digit (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -1956,15 +1975,15 @@ output_digit (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -1976,12 +1995,12 @@ output_digit (const char *filename, const char *version) for (i = 0; i < t.level3_size << (t.p - 1); i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%02x", - ((uint8_t *) (t.result + level3_offset))[2*i] - + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); + ((uint8_t *) (t.result + level3_offset))[2*i] + + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); if (i+1 < t.level3_size << (t.p - 1)) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << (t.p - 1) > 8) fprintf (stream, "\n "); @@ -2014,9 +2033,9 @@ get_numeric_value (unsigned int ch) /* str is of the form "integer" or "integer/posinteger". */ value.numerator = atoi (str); if (strchr (str, '/') != NULL) - value.denominator = atoi (strchr (str, '/') + 1); + value.denominator = atoi (strchr (str, '/') + 1); else - value.denominator = 1; + value.denominator = 1; } else { @@ -2043,8 +2062,8 @@ output_numeric_test (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); need_comma = false; for (ch = 0; ch < 0x110000; ch++) @@ -2052,13 +2071,13 @@ output_numeric_test (const char *filename, const char *version) uc_fraction_t value = get_numeric_value (ch); if (value.numerator != 0 || value.denominator != 0) - { - if (need_comma) - fprintf (stream, ",\n"); - fprintf (stream, " { 0x%04X, %d, %d }", - ch, value.numerator, value.denominator); - need_comma = true; - } + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d, %d }", + ch, value.numerator, value.denominator); + need_comma = true; + } } if (need_comma) fprintf (stream, "\n"); @@ -2099,8 +2118,8 @@ output_numeric (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); /* Create table of occurring fractions. */ nfractions = 0; @@ -2109,34 +2128,34 @@ output_numeric (const char *filename, const char *version) uc_fraction_t value = get_numeric_value (ch); for (i = 0; i < nfractions; i++) - if (value.numerator == fractions[i].numerator - && value.denominator == fractions[i].denominator) - break; + if (value.numerator == fractions[i].numerator + && value.denominator == fractions[i].denominator) + break; if (i == nfractions) - { - if (nfractions == 128) - abort (); - for (i = 0; i < nfractions; i++) - if (value.denominator < fractions[i].denominator - || (value.denominator == fractions[i].denominator - && value.numerator < fractions[i].numerator)) - break; - for (j = nfractions; j > i; j--) - fractions[j] = fractions[j - 1]; - fractions[i] = value; - nfractions++; - } + { + if (nfractions == 128) + abort (); + for (i = 0; i < nfractions; i++) + if (value.denominator < fractions[i].denominator + || (value.denominator == fractions[i].denominator + && value.numerator < fractions[i].numerator)) + break; + for (j = nfractions; j > i; j--) + fractions[j] = fractions[j - 1]; + fractions[i] = value; + nfractions++; + } } fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n", - nfractions); + nfractions); fprintf (stream, "{\n"); for (i = 0; i < nfractions; i++) { fprintf (stream, " { %d, %d }", fractions[i].numerator, - fractions[i].denominator); + fractions[i].denominator); if (i+1 < nfractions) - fprintf (stream, ","); + fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); @@ -2150,11 +2169,11 @@ output_numeric (const char *filename, const char *version) uc_fraction_t value = get_numeric_value (ch); for (i = 0; i < nfractions; i++) - if (value.numerator == fractions[i].numerator - && value.denominator == fractions[i].denominator) - break; + if (value.numerator == fractions[i].numerator + && value.denominator == fractions[i].denominator) + break; if (i == nfractions) - abort (); + abort (); numeric_table_add (&t, ch, i); } @@ -2174,14 +2193,14 @@ output_numeric (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define numeric_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, - (1 << t.p) * 7 / 16); + (1 << t.p) * 7 / 16); fprintf (stream, " }\n"); fprintf (stream, "u_numeric =\n"); fprintf (stream, "{\n"); @@ -2192,15 +2211,15 @@ output_numeric (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -2212,15 +2231,15 @@ output_numeric (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -2245,10 +2264,10 @@ output_numeric (const char *filename, const char *version) for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%04x", level3_packed[i]); if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1) - fprintf (stream, ","); + fprintf (stream, ","); } if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) fprintf (stream, "\n "); @@ -2335,25 +2354,25 @@ get_mirror_value (unsigned int ch) unsigned int i; mirrored = (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].mirrored); + && unicode_attributes[ch].mirrored); mirror_char = 0xfffd; for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++) if (ch == mirror_pairs[i][0]) { - mirror_char = mirror_pairs[i][1]; - break; + mirror_char = mirror_pairs[i][1]; + break; } else if (ch == mirror_pairs[i][1]) { - mirror_char = mirror_pairs[i][0]; - break; + mirror_char = mirror_pairs[i][0]; + break; } if (mirrored) return (int) mirror_char - (int) ch; else { if (mirror_char != 0xfffd) - abort (); + abort (); return 0; } } @@ -2384,8 +2403,8 @@ output_mirror (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Mirrored Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; @@ -2413,7 +2432,7 @@ output_mirror (const char *filename, const char *version) for (i = 0; i < 5; i++) fprintf (stream, "#define mirror_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); @@ -2430,15 +2449,15 @@ output_mirror (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -2450,15 +2469,15 @@ output_mirror (const char *filename, const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (int32_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -2469,10 +2488,10 @@ output_mirror (const char *filename, const char *version) for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << t.p > 8) fprintf (stream, "\n "); @@ -2488,6 +2507,24 @@ output_mirror (const char *filename, const char *version) /* ========================================================================= */ +/* Particular values of the word break property. */ + +static bool +is_WBP_MIDNUMLET (unsigned int ch) +{ + return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E); +} + +static bool +is_WBP_MIDLETTER (unsigned int ch) +{ + return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A); +} + +/* ========================================================================= */ + /* Properties. */ /* Reading PropList.txt and DerivedCoreProperties.txt. */ @@ -2531,6 +2568,13 @@ enum PROP_ALPHABETIC, PROP_LOWERCASE, PROP_UPPERCASE, + PROP_CASED, + PROP_CASE_IGNORABLE, + PROP_CHANGES_WHEN_LOWERCASED, + PROP_CHANGES_WHEN_UPPERCASED, + PROP_CHANGES_WHEN_TITLECASED, + PROP_CHANGES_WHEN_CASEFOLDED, + PROP_CHANGES_WHEN_CASEMAPPED, PROP_ID_START, PROP_ID_CONTINUE, PROP_XID_START, @@ -2575,20 +2619,20 @@ fill_properties (const char *proplist_filename) unsigned int propvalue; if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; + break; if (buf[0] == '\0' || buf[0] == '#') - continue; + continue; if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) - { - if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) - { - fprintf (stderr, "parse error in '%s'\n", proplist_filename); - exit (1); - } - i2 = i1; - } + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", proplist_filename); + exit (1); + } + i2 = i1; + } #define PROP(name,value) \ if (strcmp (propname, name) == 0) propvalue = value; else /* PropList.txt */ @@ -2629,6 +2673,13 @@ fill_properties (const char *proplist_filename) PROP ("Alphabetic", PROP_ALPHABETIC) PROP ("Lowercase", PROP_LOWERCASE) PROP ("Uppercase", PROP_UPPERCASE) + PROP ("Cased", PROP_CASED) + PROP ("Case_Ignorable", PROP_CASE_IGNORABLE) + PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED) + PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED) + PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED) + PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED) + PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED) PROP ("ID_Start", PROP_ID_START) PROP ("ID_Continue", PROP_ID_CONTINUE) PROP ("XID_Start", PROP_XID_START) @@ -2638,16 +2689,16 @@ fill_properties (const char *proplist_filename) PROP ("Grapheme_Base", PROP_GRAPHEME_BASE) PROP ("Grapheme_Link", PROP_GRAPHEME_LINK) #undef PROP - { - fprintf (stderr, "unknown property named '%s' in '%s'\n", propname, - proplist_filename); - exit (1); - } + { + fprintf (stderr, "unknown property named '%s' in '%s'\n", propname, + proplist_filename); + exit (1); + } if (!(i1 <= i2 && i2 < 0x110000)) - abort (); + abort (); for (i = i1; i <= i2; i++) - unicode_properties[i] |= 1ULL << propvalue; + unicode_properties[i] |= 1ULL << propvalue; } if (ferror (stream) || fclose (stream)) @@ -2680,10 +2731,10 @@ fill_property30 (char array[0x110000], const char *proplist_filename, const char do { if (fscanf (stream, "%100[^\n]\n", buf) < 1) - { - fprintf (stderr, "no property found in '%s'\n", proplist_filename); - exit (1); - } + { + fprintf (stderr, "no property found in '%s'\n", proplist_filename); + exit (1); + } } while (strstr (buf, property_name) == NULL); @@ -2692,39 +2743,40 @@ fill_property30 (char array[0x110000], const char *proplist_filename, const char unsigned int i1, i2; if (fscanf (stream, "%100[^\n]\n", buf) < 1) - break; + break; if (buf[0] == '*') - break; + break; if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.') - { - if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2) - { - fprintf (stderr, "parse error in property in '%s'\n", - proplist_filename); - exit (1); - } - } + { + if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2) + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } + } else if (strlen (buf) >= 4) - { - if (sscanf (buf, "%4X", &i1) < 1) - { - fprintf (stderr, "parse error in property in '%s'\n", - proplist_filename); - exit (1); - } - i2 = i1; - } + { + if (sscanf (buf, "%4X", &i1) < 1) + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } + i2 = i1; + } else - { - fprintf (stderr, "parse error in property in '%s'\n", - proplist_filename); - exit (1); - } + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } if (!(i1 <= i2 && i2 < 0x110000)) - abort (); + abort (); for (i = i1; i <= i2; i++) - array[i] = 1; + array[i] = 1; } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", proplist_filename); @@ -2774,6 +2826,7 @@ is_property_alphabetic (unsigned int ch) || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */ + || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */ || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */ || (ch == 0x10341) /* GOTHIC LETTER NINETY */ || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ @@ -2809,7 +2862,10 @@ is_property_default_ignorable_code_point (unsigned int ch) bool result1 = (is_category_Cf (ch) && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ - && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)) + && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F) + /* For some reason, the following are not listed as having property + Default_Ignorable_Code_Point. */ + && !(ch == 0x110BD)) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); bool result2 = @@ -2854,8 +2910,8 @@ is_property_private_use (unsigned int ch) { /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */ return (ch >= 0xE000 && ch <= 0xF8FF) - || (ch >= 0xF0000 && ch <= 0xFFFFD) - || (ch >= 0x100000 && ch <= 0x10FFFD); + || (ch >= 0xF0000 && ch <= 0xFFFFD) + || (ch >= 0x100000 && ch <= 0x10FFFD); } /* See PropList-3.0.1.txt. */ @@ -2918,6 +2974,79 @@ is_property_titlecase (unsigned int ch) return is_category_Lt (ch); } +/* See DerivedCoreProperties.txt. */ +static bool +is_property_cased (unsigned int ch) +{ + bool result1 = (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_case_ignorable (unsigned int ch) +{ + bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch) + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_lowercased (unsigned int ch) +{ + bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0); + bool result2 = (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].lower != NONE + && unicode_attributes[ch].lower != ch); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_uppercased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_titlecased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casefolded (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casemapped (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0); +} + /* See PropList.txt, UCD.html. */ static bool is_property_soft_dotted (unsigned int ch) @@ -3127,7 +3256,7 @@ is_property_bidi_embedding_or_override (unsigned int ch) { int category = get_bidi_category (ch); return (category == UC_BIDI_LRE || category == UC_BIDI_LRO - || category == UC_BIDI_RLE || category == UC_BIDI_RLO); + || category == UC_BIDI_RLE || category == UC_BIDI_RLO); } /* See PropList-3.0.1.txt. */ @@ -3192,8 +3321,8 @@ static bool is_property_zero_width (unsigned int ch) { return is_category_Cf (ch) - || (unicode_attributes[ch].name != NULL - && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL); + || (unicode_attributes[ch].name != NULL + && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL); } /* See PropList-3.0.1.txt. */ @@ -3210,21 +3339,21 @@ is_property_non_break (unsigned int ch) /* This is exactly the set of characters having line breaking property GL. */ return (ch == 0x00A0 /* NO-BREAK SPACE */ - || ch == 0x034F /* COMBINING GRAPHEME JOINER */ - || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */ - || ch == 0x035D /* COMBINING DOUBLE BREVE */ - || ch == 0x035E /* COMBINING DOUBLE MACRON */ - || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */ - || ch == 0x0360 /* COMBINING DOUBLE TILDE */ - || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */ - || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */ - || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ - || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ - || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ - || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ - || ch == 0x2007 /* FIGURE SPACE */ - || ch == 0x2011 /* NON-BREAKING HYPHEN */ - || ch == 0x202F /* NARROW NO-BREAK SPACE */); + || ch == 0x034F /* COMBINING GRAPHEME JOINER */ + || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */ + || ch == 0x035D /* COMBINING DOUBLE BREVE */ + || ch == 0x035E /* COMBINING DOUBLE MACRON */ + || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */ + || ch == 0x0360 /* COMBINING DOUBLE TILDE */ + || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */ + || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */ + || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ + || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ + || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ + || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ + || ch == 0x2007 /* FIGURE SPACE */ + || ch == 0x2011 /* NON-BREAKING HYPHEN */ + || ch == 0x202F /* NARROW NO-BREAK SPACE */); } /* See PropList-3.0.1.txt. */ @@ -3247,9 +3376,9 @@ static bool is_property_format_control (unsigned int ch) { return (is_category_Cf (ch) - && get_bidi_category (ch) == UC_BIDI_BN - && !is_property_join_control (ch) - && ch != 0xFEFF); + && get_bidi_category (ch) == UC_BIDI_BN + && !is_property_join_control (ch) + && ch != 0xFEFF); } /* See PropList.txt, UCD.html. */ @@ -3358,10 +3487,10 @@ static bool is_property_combining (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && (strcmp (unicode_attributes[ch].combining, "0") != 0 - || is_category_Mc (ch) - || is_category_Me (ch) - || is_category_Mn (ch))); + && (strcmp (unicode_attributes[ch].combining, "0") != 0 + || is_category_Mc (ch) + || is_category_Me (ch) + || is_category_Mn (ch))); } #if 0 /* same as is_property_bidi_non_spacing_mark */ @@ -3370,7 +3499,7 @@ static bool is_property_non_spacing (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && get_bidi_category (ch) == UC_BIDI_NSM); + && get_bidi_category (ch) == UC_BIDI_NSM); } #endif @@ -3386,14 +3515,14 @@ is_property_composite (unsigned int ch) && unicode_attributes[ch].decomposition != NULL) { /* Test whether the decomposition contains more than one character, - and the first is not a space. */ + and the first is not a space. */ const char *decomp = unicode_attributes[ch].decomposition; if (decomp[0] == '<') - { - decomp = strchr (decomp, '>') + 1; - if (decomp[0] == ' ') - decomp++; - } + { + decomp = strchr (decomp, '>') + 1; + if (decomp[0] == ' ') + decomp++; + } return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0; } return false; @@ -3411,8 +3540,8 @@ static bool is_property_numeric (unsigned int ch) { return ((get_numeric_value (ch)).denominator > 0) - || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ - || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */ + || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ + || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */ } /* See PropList.txt, UCD.html. */ @@ -3434,8 +3563,8 @@ static bool is_property_ignorable_control (unsigned int ch) { return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN) - || is_category_Cf (ch)) - && ch != 0x0000; + || is_category_Cf (ch)) + && ch != 0x0000; } /* ------------------------------------------------------------------------- */ @@ -3464,6 +3593,13 @@ output_properties (const char *version) PROPERTY(lowercase) PROPERTY(other_lowercase) PROPERTY(titlecase) + PROPERTY(cased) + PROPERTY(case_ignorable) + PROPERTY(changes_when_lowercased) + PROPERTY(changes_when_uppercased) + PROPERTY(changes_when_titlecased) + PROPERTY(changes_when_casefolded) + PROPERTY(changes_when_casemapped) PROPERTY(soft_dotted) PROPERTY(id_start) PROPERTY(other_id_start) @@ -3532,109 +3668,258 @@ output_properties (const char *version) /* ========================================================================= */ -/* Scripts. */ +/* Arabic Shaping. */ -static const char *scripts[256]; -static unsigned int numscripts; +enum +{ + UC_JOINING_TYPE_U, /* Non_Joining */ + UC_JOINING_TYPE_T, /* Transparent */ + UC_JOINING_TYPE_C, /* Join_Causing */ + UC_JOINING_TYPE_L, /* Left_Joining */ + UC_JOINING_TYPE_R, /* Right_Joining */ + UC_JOINING_TYPE_D /* Dual_Joining */ +}; -static uint8_t unicode_scripts[0x110000]; +static uint8_t unicode_joining_type[0x110000]; + +enum +{ + UC_JOINING_GROUP_NONE, /* No_Joining_Group */ + UC_JOINING_GROUP_AIN, /* Ain */ + UC_JOINING_GROUP_ALAPH, /* Alaph */ + UC_JOINING_GROUP_ALEF, /* Alef */ + UC_JOINING_GROUP_BEH, /* Beh */ + UC_JOINING_GROUP_BETH, /* Beth */ + UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */ + UC_JOINING_GROUP_DAL, /* Dal */ + UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */ + UC_JOINING_GROUP_E, /* E */ + UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */ + UC_JOINING_GROUP_FE, /* Fe */ + UC_JOINING_GROUP_FEH, /* Feh */ + UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */ + UC_JOINING_GROUP_GAF, /* Gaf */ + UC_JOINING_GROUP_GAMAL, /* Gamal */ + UC_JOINING_GROUP_HAH, /* Hah */ + UC_JOINING_GROUP_HE, /* He */ + UC_JOINING_GROUP_HEH, /* Heh */ + UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */ + UC_JOINING_GROUP_HETH, /* Heth */ + UC_JOINING_GROUP_KAF, /* Kaf */ + UC_JOINING_GROUP_KAPH, /* Kaph */ + UC_JOINING_GROUP_KHAPH, /* Khaph */ + UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */ + UC_JOINING_GROUP_LAM, /* Lam */ + UC_JOINING_GROUP_LAMADH, /* Lamadh */ + UC_JOINING_GROUP_MEEM, /* Meem */ + UC_JOINING_GROUP_MIM, /* Mim */ + UC_JOINING_GROUP_NOON, /* Noon */ + UC_JOINING_GROUP_NUN, /* Nun */ + UC_JOINING_GROUP_NYA, /* Nya */ + UC_JOINING_GROUP_PE, /* Pe */ + UC_JOINING_GROUP_QAF, /* Qaf */ + UC_JOINING_GROUP_QAPH, /* Qaph */ + UC_JOINING_GROUP_REH, /* Reh */ + UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */ + UC_JOINING_GROUP_SAD, /* Sad */ + UC_JOINING_GROUP_SADHE, /* Sadhe */ + UC_JOINING_GROUP_SEEN, /* Seen */ + UC_JOINING_GROUP_SEMKATH, /* Semkath */ + UC_JOINING_GROUP_SHIN, /* Shin */ + UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */ + UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */ + UC_JOINING_GROUP_TAH, /* Tah */ + UC_JOINING_GROUP_TAW, /* Taw */ + UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */ + UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */ + UC_JOINING_GROUP_TETH, /* Teth */ + UC_JOINING_GROUP_WAW, /* Waw */ + UC_JOINING_GROUP_YEH, /* Yeh */ + UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */ + UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */ + UC_JOINING_GROUP_YUDH, /* Yudh */ + UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */ + UC_JOINING_GROUP_ZAIN, /* Zain */ + UC_JOINING_GROUP_ZHAIN /* Zhain */ +}; + +static uint8_t unicode_joining_group[0x110000]; static void -fill_scripts (const char *scripts_filename) +fill_arabicshaping (const char *arabicshaping_filename) { FILE *stream; unsigned int i; + int lineno; - stream = fopen (scripts_filename, "r"); + stream = fopen (arabicshaping_filename, "r"); if (stream == NULL) { - fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); + fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename); exit (1); } - numscripts = 0; - for (i = 0; i < 0x110000; i++) - unicode_scripts[i] = (uint8_t)~(uint8_t)0; + { + unicode_joining_type[i] = (uint8_t)~(uint8_t)0; + unicode_joining_group[i] = UC_JOINING_GROUP_NONE; + } + lineno = 0; for (;;) { - char buf[200+1]; - unsigned int i1, i2; - char padding[200+1]; - char scriptname[200+1]; - int script; + char buf[100+1]; + char separator1[100+1]; + char padding1[100+1]; + char schematic_name[100+1]; + char separator2[100+1]; + char padding2[100+1]; + char joining_type_name[100+1]; + char separator3[100+1]; + char padding3[100+1]; + char joining_group_name[100+1]; + int joining_type; + int joining_group; - if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; + lineno++; + if (fscanf (stream, "%100[^\n]\n", buf) < 1) + break; if (buf[0] == '\0' || buf[0] == '#') - continue; - - if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) - { - if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) - { - fprintf (stderr, "parse error in '%s'\n", scripts_filename); - exit (1); - } - i2 = i1; - } - if (i2 < i1) - abort (); - if (i2 >= 0x110000) - abort (); - - for (script = numscripts - 1; script >= 0; script--) - if (strcmp (scripts[script], scriptname) == 0) - break; - if (script < 0) - { - scripts[numscripts] = strdup (scriptname); - script = numscripts; - numscripts++; - if (numscripts == 256) - abort (); - } + continue; + + if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]", + &i, separator1, padding1, schematic_name, separator2, + padding2, joining_type_name, separator3, padding3, + joining_group_name) != 10) + { + fprintf (stderr, "parse error in '%s':%d\n", + arabicshaping_filename, lineno); + exit (1); + } + if (i >= 0x110000) + abort (); + +#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name; + if (false) {} + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + else + { + fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n", + joining_type_name, arabicshaping_filename, lineno); + exit (1); + } + + /* Remove trailing spaces. */ + while (joining_group_name[0] != '\0' + && joining_group_name[strlen (joining_group_name) - 1] == ' ') + joining_group_name[strlen (joining_group_name) - 1] = '\0'; + +#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value; + if (false) {} + TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group") + TRY(UC_JOINING_GROUP_AIN, "AIN") + TRY(UC_JOINING_GROUP_ALAPH, "ALAPH") + TRY(UC_JOINING_GROUP_ALEF, "ALEF") + TRY(UC_JOINING_GROUP_BEH, "BEH") + TRY(UC_JOINING_GROUP_BETH, "BETH") + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE") + TRY(UC_JOINING_GROUP_DAL, "DAL") + TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH") + TRY(UC_JOINING_GROUP_E, "E") + TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH") + TRY(UC_JOINING_GROUP_FE, "FE") + TRY(UC_JOINING_GROUP_FEH, "FEH") + TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH") + TRY(UC_JOINING_GROUP_GAF, "GAF") + TRY(UC_JOINING_GROUP_GAMAL, "GAMAL") + TRY(UC_JOINING_GROUP_HAH, "HAH") + TRY(UC_JOINING_GROUP_HE, "HE") + TRY(UC_JOINING_GROUP_HEH, "HEH") + TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL") + TRY(UC_JOINING_GROUP_HETH, "HETH") + TRY(UC_JOINING_GROUP_KAF, "KAF") + TRY(UC_JOINING_GROUP_KAPH, "KAPH") + TRY(UC_JOINING_GROUP_KHAPH, "KHAPH") + TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH") + TRY(UC_JOINING_GROUP_LAM, "LAM") + TRY(UC_JOINING_GROUP_LAMADH, "LAMADH") + TRY(UC_JOINING_GROUP_MEEM, "MEEM") + TRY(UC_JOINING_GROUP_MIM, "MIM") + TRY(UC_JOINING_GROUP_NOON, "NOON") + TRY(UC_JOINING_GROUP_NUN, "NUN") + TRY(UC_JOINING_GROUP_NYA, "NYA") + TRY(UC_JOINING_GROUP_PE, "PE") + TRY(UC_JOINING_GROUP_QAF, "QAF") + TRY(UC_JOINING_GROUP_QAPH, "QAPH") + TRY(UC_JOINING_GROUP_REH, "REH") + TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE") + TRY(UC_JOINING_GROUP_SAD, "SAD") + TRY(UC_JOINING_GROUP_SADHE, "SADHE") + TRY(UC_JOINING_GROUP_SEEN, "SEEN") + TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH") + TRY(UC_JOINING_GROUP_SHIN, "SHIN") + TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF") + TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW") + TRY(UC_JOINING_GROUP_TAH, "TAH") + TRY(UC_JOINING_GROUP_TAW, "TAW") + TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA") + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL") + TRY(UC_JOINING_GROUP_TETH, "TETH") + TRY(UC_JOINING_GROUP_WAW, "WAW") + TRY(UC_JOINING_GROUP_YEH, "YEH") + TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE") + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL") + TRY(UC_JOINING_GROUP_YUDH, "YUDH") + TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE") + TRY(UC_JOINING_GROUP_ZAIN, "ZAIN") + TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN") +#undef TRY + else + { + fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n", + joining_group_name, arabicshaping_filename, lineno); + exit (1); + } - for (i = i1; i <= i2; i++) - { - if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) - fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); - unicode_scripts[i] = script; - } + unicode_joining_type[i] = joining_type; + unicode_joining_group[i] = joining_group; } if (ferror (stream) || fclose (stream)) { - fprintf (stderr, "error reading from '%s'\n", scripts_filename); + fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename); exit (1); } } -/* Construction of sparse 3-level tables. */ -#define TABLE script_table -#define ELEMENT uint8_t -#define DEFAULT (uint8_t)~(uint8_t)0 -#define xmalloc malloc -#define xrealloc realloc -#include "3level.h" +/* Convert a Joining_Type value to a C identifier. */ +static const char * +joining_type_as_c_identifier (int joining_type) +{ +#define TRY(value) if (joining_type == value) return #value; + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + abort (); +} static void -output_scripts (const char *version) +output_joining_type_test (const char *filename, const char *version) { - const char *filename = "unictype/scripts.h"; FILE *stream; - unsigned int ch, s, i; - struct script_table t; - unsigned int level1_offset, level2_offset, level3_offset; - - typedef struct - { - const char *lowercase_name; - } - scriptinfo_t; - scriptinfo_t scriptinfo[256]; + bool need_comma; + unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) @@ -3644,81 +3929,74 @@ output_scripts (const char *version) } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); - for (s = 0; s < numscripts; s++) + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) { - char *lcp = strdup (scripts[s]); - char *cp; + int value = unicode_joining_type[ch]; - for (cp = lcp; *cp != '\0'; cp++) - if (*cp >= 'A' && *cp <= 'Z') - *cp += 'a' - 'A'; - - scriptinfo[s].lowercase_name = lcp; + if (value != (uint8_t)~(uint8_t)0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value)); + need_comma = true; + } } + if (need_comma) + fprintf (stream, "\n"); - for (s = 0; s < numscripts; s++) + if (ferror (stream) || fclose (stream)) { - fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", - scriptinfo[s].lowercase_name); - fprintf (stream, "{\n"); - i = 0; - for (ch = 0; ch < 0x110000; ch++) - if (unicode_scripts[ch] == s) - { - unsigned int start; - unsigned int end; - - start = ch; - while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) - ch++; - end = ch; - - if (i > 0) - fprintf (stream, ",\n"); - if (start == end) - fprintf (stream, " { 0x%04X, 1, 1 }", start); - else - fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", - start, end); - i++; - } - fprintf (stream, "\n"); - fprintf (stream, "};\n"); + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); } +} - fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); - fprintf (stream, "{\n"); - for (s = 0; s < numscripts; s++) +/* Construction of sparse 3-level tables. */ +#define TABLE joining_type_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_joining_type (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct joining_type_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint8_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) { - fprintf (stream, " {\n"); - fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", - scriptinfo[s].lowercase_name); - fprintf (stream, " script_%s_intervals,\n", - scriptinfo[s].lowercase_name); - fprintf (stream, " \"%s\"\n", scripts[s]); - fprintf (stream, " }"); - if (s+1 < numscripts) - fprintf (stream, ","); - fprintf (stream, "\n"); + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); } - fprintf (stream, "};\n"); + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; t.q = 9; - script_table_init (&t); + joining_type_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { - unsigned int s = unicode_scripts[ch]; - if (s != (uint8_t)~(uint8_t)0) - script_table_add (&t, ch, s); + uint8_t value = unicode_joining_type[ch]; + + joining_type_table_add (&t, ch, value); } - script_table_finalize (&t); + joining_type_table_finalize (&t); /* Offsets in t.result, in memory of this process. */ level1_offset = @@ -3732,16 +4010,17 @@ output_scripts (const char *version) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) - fprintf (stream, "#define script_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + fprintf (stream, "#define joining_type_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); - fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size, + (1 << t.p) * 4 / 8); fprintf (stream, " }\n"); - fprintf (stream, "u_script =\n"); + fprintf (stream, "u_joining_type =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) @@ -3750,15 +4029,15 @@ output_scripts (const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -3770,33 +4049,44 @@ output_scripts (const char *version) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 4 bits only. */ + level3_packed = + (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 4) / 8; + unsigned int k = (i * 4) % 8; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f; + level3_packed[j] |= (value << k); + } fprintf (stream, " {"); - if (t.level3_size << t.p > 8) + if ((t.level3_size << t.p) * 4 / 8 > 8) fprintf (stream, "\n "); - for (i = 0; i < t.level3_size << t.p; i++) + for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); - fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); - if (i+1 < t.level3_size << t.p) - fprintf (stream, ","); + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 4 / 8) + fprintf (stream, ","); } - if (t.level3_size << t.p > 8) + if ((t.level3_size << t.p) * 4 / 8 > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); + free (level3_packed); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) @@ -3806,12 +4096,78 @@ output_scripts (const char *version) } } +/* Convert a Joining_Group value to a C identifier. */ +static const char * +joining_group_as_c_identifier (int joining_group) +{ +#define TRY(value) if (joining_group == value) return #value; + TRY(UC_JOINING_GROUP_NONE) + TRY(UC_JOINING_GROUP_AIN) + TRY(UC_JOINING_GROUP_ALAPH) + TRY(UC_JOINING_GROUP_ALEF) + TRY(UC_JOINING_GROUP_BEH) + TRY(UC_JOINING_GROUP_BETH) + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE) + TRY(UC_JOINING_GROUP_DAL) + TRY(UC_JOINING_GROUP_DALATH_RISH) + TRY(UC_JOINING_GROUP_E) + TRY(UC_JOINING_GROUP_FARSI_YEH) + TRY(UC_JOINING_GROUP_FE) + TRY(UC_JOINING_GROUP_FEH) + TRY(UC_JOINING_GROUP_FINAL_SEMKATH) + TRY(UC_JOINING_GROUP_GAF) + TRY(UC_JOINING_GROUP_GAMAL) + TRY(UC_JOINING_GROUP_HAH) + TRY(UC_JOINING_GROUP_HE) + TRY(UC_JOINING_GROUP_HEH) + TRY(UC_JOINING_GROUP_HEH_GOAL) + TRY(UC_JOINING_GROUP_HETH) + TRY(UC_JOINING_GROUP_KAF) + TRY(UC_JOINING_GROUP_KAPH) + TRY(UC_JOINING_GROUP_KHAPH) + TRY(UC_JOINING_GROUP_KNOTTED_HEH) + TRY(UC_JOINING_GROUP_LAM) + TRY(UC_JOINING_GROUP_LAMADH) + TRY(UC_JOINING_GROUP_MEEM) + TRY(UC_JOINING_GROUP_MIM) + TRY(UC_JOINING_GROUP_NOON) + TRY(UC_JOINING_GROUP_NUN) + TRY(UC_JOINING_GROUP_NYA) + TRY(UC_JOINING_GROUP_PE) + TRY(UC_JOINING_GROUP_QAF) + TRY(UC_JOINING_GROUP_QAPH) + TRY(UC_JOINING_GROUP_REH) + TRY(UC_JOINING_GROUP_REVERSED_PE) + TRY(UC_JOINING_GROUP_SAD) + TRY(UC_JOINING_GROUP_SADHE) + TRY(UC_JOINING_GROUP_SEEN) + TRY(UC_JOINING_GROUP_SEMKATH) + TRY(UC_JOINING_GROUP_SHIN) + TRY(UC_JOINING_GROUP_SWASH_KAF) + TRY(UC_JOINING_GROUP_SYRIAC_WAW) + TRY(UC_JOINING_GROUP_TAH) + TRY(UC_JOINING_GROUP_TAW) + TRY(UC_JOINING_GROUP_TEH_MARBUTA) + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL) + TRY(UC_JOINING_GROUP_TETH) + TRY(UC_JOINING_GROUP_WAW) + TRY(UC_JOINING_GROUP_YEH) + TRY(UC_JOINING_GROUP_YEH_BARREE) + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL) + TRY(UC_JOINING_GROUP_YUDH) + TRY(UC_JOINING_GROUP_YUDH_HE) + TRY(UC_JOINING_GROUP_ZAIN) + TRY(UC_JOINING_GROUP_ZHAIN) +#undef TRY + abort (); +} + static void -output_scripts_byname (const char *version) +output_joining_group_test (const char *filename, const char *version) { - const char *filename = "unictype/scripts_byname.gperf"; FILE *stream; - unsigned int s; + bool need_comma; + unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) @@ -3821,20 +4177,25 @@ output_scripts_byname (const char *version) } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Unicode scripts. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); - fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n"); - fprintf (stream, "%%struct-type\n"); - fprintf (stream, "%%language=ANSI-C\n"); - fprintf (stream, "%%define hash-function-name scripts_hash\n"); - fprintf (stream, "%%define lookup-function-name uc_script_lookup\n"); - fprintf (stream, "%%readonly-tables\n"); - fprintf (stream, "%%global-table\n"); - fprintf (stream, "%%define word-array-name script_names\n"); - fprintf (stream, "%%%%\n"); - for (s = 0; s < numscripts; s++) - fprintf (stream, "%s, %u\n", scripts[s], s); + fprintf (stream, "/* Arabic joining group of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_group[ch]; + + if (value != UC_JOINING_GROUP_NONE) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); if (ferror (stream) || fclose (stream)) { @@ -3843,9 +4204,396 @@ output_scripts_byname (const char *version) } } -/* ========================================================================= */ - -/* Blocks. */ +static void +output_joining_group (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch_min, ch_max, ch, i; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + ch_min = 0x10FFFF; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_min = ch; + break; + } + + ch_max = 0; + for (ch = 0x10FFFF; ch > 0; ch--) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_max = ch; + break; + } + + if (!(ch_min <= ch_max)) + abort (); + + /* If the interval [ch_min, ch_max] is too large, we should better use a + 3-level table. */ + if (!(ch_max - ch_min < 0x200)) + abort (); + + fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min); + fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n", + ch_max + 1, ch_min); + fprintf (stream, "{"); + for (i = 0; i <= ch_max - ch_min; i++) + { + const char *s; + + ch = ch_min + i; + if ((i % 2) == 0) + fprintf (stream, "\n "); + s = joining_group_as_c_identifier (unicode_joining_group[ch]); + fprintf (stream, " %s", s); + if (i+1 <= ch_max - ch_min) + { + fprintf (stream, ","); + if (((i+1) % 2) != 0) + fprintf (stream, "%*s", 38 - (int) strlen (s), ""); + } + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Scripts. */ + +static const char *scripts[256]; +static unsigned int numscripts; + +static uint8_t unicode_scripts[0x110000]; + +static void +fill_scripts (const char *scripts_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (scripts_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); + exit (1); + } + + numscripts = 0; + + for (i = 0; i < 0x110000; i++) + unicode_scripts[i] = (uint8_t)~(uint8_t)0; + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char scriptname[200+1]; + int script; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", scripts_filename); + exit (1); + } + i2 = i1; + } + if (i2 < i1) + abort (); + if (i2 >= 0x110000) + abort (); + + for (script = numscripts - 1; script >= 0; script--) + if (strcmp (scripts[script], scriptname) == 0) + break; + if (script < 0) + { + scripts[numscripts] = strdup (scriptname); + script = numscripts; + numscripts++; + if (numscripts == 256) + abort (); + } + + for (i = i1; i <= i2; i++) + { + if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) + fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); + unicode_scripts[i] = script; + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", scripts_filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE script_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_scripts (const char *version) +{ + const char *filename = "unictype/scripts.h"; + FILE *stream; + unsigned int ch, s, i; + struct script_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + typedef struct + { + const char *lowercase_name; + } + scriptinfo_t; + scriptinfo_t scriptinfo[256]; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode scripts. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + for (s = 0; s < numscripts; s++) + { + char *lcp = strdup (scripts[s]); + char *cp; + + for (cp = lcp; *cp != '\0'; cp++) + if (*cp >= 'A' && *cp <= 'Z') + *cp += 'a' - 'A'; + + scriptinfo[s].lowercase_name = lcp; + } + + for (s = 0; s < numscripts; s++) + { + fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", + scriptinfo[s].lowercase_name); + fprintf (stream, "{\n"); + i = 0; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_scripts[ch] == s) + { + unsigned int start; + unsigned int end; + + start = ch; + while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) + ch++; + end = ch; + + if (i > 0) + fprintf (stream, ",\n"); + if (start == end) + fprintf (stream, " { 0x%04X, 1, 1 }", start); + else + fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", + start, end); + i++; + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + } + + fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); + fprintf (stream, "{\n"); + for (s = 0; s < numscripts; s++) + { + fprintf (stream, " {\n"); + fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " script_%s_intervals,\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " \"%s\"\n", scripts[s]); + fprintf (stream, " }"); + if (s+1 < numscripts) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + + t.p = 7; + t.q = 9; + script_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int s = unicode_scripts[ch]; + if (s != (uint8_t)~(uint8_t)0) + script_table_add (&t, ch, s); + } + + script_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define script_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_script =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_scripts_byname (const char *version) +{ + const char *filename = "unictype/scripts_byname.gperf"; + FILE *stream; + unsigned int s; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode scripts. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "struct named_script { int name; unsigned int index; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define hash-function-name scripts_hash\n"); + fprintf (stream, "%%define lookup-function-name uc_script_lookup\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%global-table\n"); + fprintf (stream, "%%define word-array-name script_names\n"); + fprintf (stream, "%%pic\n"); + fprintf (stream, "%%define string-pool-name script_stringpool\n"); + fprintf (stream, "%%%%\n"); + for (s = 0; s < numscripts; s++) + fprintf (stream, "%s, %u\n", scripts[s], s); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Blocks. */ typedef struct { unsigned int start; unsigned int end; const char *name; } block_t; @@ -3872,25 +4620,25 @@ fill_blocks (const char *blocks_filename) char blockname[200+1]; if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; + break; if (buf[0] == '\0' || buf[0] == '#') - continue; + continue; if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4) - { - fprintf (stderr, "parse error in '%s'\n", blocks_filename); - exit (1); - } + { + fprintf (stderr, "parse error in '%s'\n", blocks_filename); + exit (1); + } blocks[numblocks].start = i1; blocks[numblocks].end = i2; blocks[numblocks].name = strdup (blockname); /* It must be sorted. */ if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start)) - abort (); + abort (); numblocks++; if (numblocks == 256) - abort (); + abort (); } if (ferror (stream) || fclose (stream)) @@ -3914,9 +4662,9 @@ block_first_index (unsigned int ch) { unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ if (blocks[mid].end < ch) - lo = mid + 1; + lo = mid + 1; else - hi = mid; + hi = mid; } return hi; } @@ -3936,9 +4684,9 @@ block_last_index (unsigned int ch) { unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ if (blocks[mid].start <= ch) - lo = mid + 1; + lo = mid + 1; else - hi = mid; + hi = mid; } return hi; } @@ -3962,24 +4710,24 @@ output_blocks (const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode blocks. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); fprintf (stream, "static const uc_block_t blocks[] =\n"); fprintf (stream, "{\n"); for (i = 0; i < numblocks; i++) { fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start, - blocks[i].end, blocks[i].name); + blocks[i].end, blocks[i].name); if (i+1 < numblocks) - fprintf (stream, ","); + fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); fprintf (stream, "#define blocks_level1_shift %d\n", shift); fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold); fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n", - threshold >> shift); + threshold >> shift); fprintf (stream, "{\n"); for (i1 = 0; i1 < (threshold >> shift); i1++) { @@ -3987,14 +4735,14 @@ output_blocks (const char *version) unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1); fprintf (stream, " %3d, %3d", first_index, last_index); if (i1+1 < (threshold >> shift)) - fprintf (stream, ","); + fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); fprintf (stream, "#define blocks_upper_first_index %d\n", - block_first_index (threshold)); + block_first_index (threshold)); fprintf (stream, "#define blocks_upper_last_index %d\n", - block_last_index (0x10FFFF)); + block_last_index (0x10FFFF)); if (ferror (stream) || fclose (stream)) { @@ -4020,10 +4768,10 @@ static bool is_c_whitespace (unsigned int ch) { return (ch == ' ' /* space */ - || ch == '\t' /* horizontal tab */ - || ch == '\n' || ch == '\r' /* new-line */ - || ch == '\v' /* vertical tab */ - || ch == '\f'); /* form-feed */ + || ch == '\t' /* horizontal tab */ + || ch == '\n' || ch == '\r' /* new-line */ + || ch == '\v' /* vertical tab */ + || ch == '\f'); /* form-feed */ } /* ISO C 99 section 6.4.2.1 and appendix D. */ @@ -4323,7 +5071,7 @@ static bool is_java_whitespace (unsigned int ch) { return (ch == ' ' || ch == '\t' || ch == '\f' - || ch == '\n' || ch == '\r'); + || ch == '\n' || ch == '\r'); } /* The Java Language Specification, 3rd edition, §3.8. @@ -4379,8 +5127,8 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Language syntax properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", - version); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); t.p = 7; /* or 8 */ t.q = 5; /* or 4 */ @@ -4390,7 +5138,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co { int syntaxcode = predicate (ch); if (syntaxcode != UC_IDENTIFIER_INVALID) - identsyntax_table_add (&t, ch, syntaxcode); + identsyntax_table_add (&t, ch, syntaxcode); } identsyntax_table_finalize (&t); @@ -4408,14 +5156,14 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co for (i = 0; i < 5; i++) fprintf (stream, "#define identsyntax_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size, - (1 << t.p) * 2 / 16); + (1 << t.p) * 2 / 16); fprintf (stream, " }\n"); fprintf (stream, "%s =\n", name); fprintf (stream, "{\n"); @@ -4426,15 +5174,15 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -4446,15 +5194,15 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -4466,18 +5214,18 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " 0x%04x", - (((uint8_t *) (t.result + level3_offset))[8 * i] << 0) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12) - | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14)); + (((uint8_t *) (t.result + level3_offset))[8 * i] << 0) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14)); if (i+1 < (t.level3_size << t.p) * 2 / 16) - fprintf (stream, ","); + fprintf (stream, ","); } if ((t.level3_size << t.p) * 2 / 16 > 8) fprintf (stream, "\n "); @@ -4556,39 +5304,39 @@ static bool is_lower (unsigned int ch) { return (to_upper (ch) != ch) - /* is lowercase, but without simple to_upper mapping. */ - || (ch == 0x00DF); + /* is lowercase, but without simple to_upper mapping. */ + || (ch == 0x00DF); } static bool is_alpha (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && ((unicode_attributes[ch].category[0] == 'L' - /* Theppitak Karoonboonyanan says - , should belong to is_punct. */ - && (ch != 0x0E2F) && (ch != 0x0E46)) - /* Theppitak Karoonboonyanan says - , .., .. are is_alpha. */ - || (ch == 0x0E31) - || (ch >= 0x0E34 && ch <= 0x0E3A) - || (ch >= 0x0E47 && ch <= 0x0E4E) - /* Avoid warning for . */ - || (ch == 0x0345) - /* Avoid warnings for ... */ - || (unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'l') - /* Avoid warnings for ... */ - || (unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'o' - && strstr (unicode_attributes[ch].name, " LETTER ") - != NULL) - /* Consider all the non-ASCII digits as alphabetic. - ISO C 99 forbids us to have them in category "digit", - but we want iswalnum to return true on them. */ - || (unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'd' - && !(ch >= 0x0030 && ch <= 0x0039)))); + && ((unicode_attributes[ch].category[0] == 'L' + /* Theppitak Karoonboonyanan says + , should belong to is_punct. */ + && (ch != 0x0E2F) && (ch != 0x0E46)) + /* Theppitak Karoonboonyanan says + , .., .. are is_alpha. */ + || (ch == 0x0E31) + || (ch >= 0x0E34 && ch <= 0x0E3A) + || (ch >= 0x0E47 && ch <= 0x0E4E) + /* Avoid warning for . */ + || (ch == 0x0345) + /* Avoid warnings for ... */ + || (unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'l') + /* Avoid warnings for ... */ + || (unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'o' + && strstr (unicode_attributes[ch].name, " LETTER ") + != NULL) + /* Consider all the non-ASCII digits as alphabetic. + ISO C 99 forbids us to have them in category "digit", + but we want iswalnum to return true on them. */ + || (unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd' + && !(ch >= 0x0030 && ch <= 0x0039)))); } static bool @@ -4596,8 +5344,8 @@ is_digit (unsigned int ch) { #if 0 return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'd'); + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd'); /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a zero. Must add <0> in front of them by hand. */ #else @@ -4629,11 +5377,11 @@ static bool is_blank (unsigned int ch) { return (ch == 0x0009 /* '\t' */ - /* Category Zs without mention of "" */ - || (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && unicode_attributes[ch].category[1] == 's' - && !strstr (unicode_attributes[ch].decomposition, ""))); + /* Category Zs without mention of "" */ + || (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 's' + && !strstr (unicode_attributes[ch].decomposition, ""))); } static bool @@ -4642,30 +5390,30 @@ is_space (unsigned int ch) /* Don't make U+00A0 a space. Non-breaking space means that all programs should treat it like a punctuation character, not like a space. */ return (ch == 0x0020 /* ' ' */ - || ch == 0x000C /* '\f' */ - || ch == 0x000A /* '\n' */ - || ch == 0x000D /* '\r' */ - || ch == 0x0009 /* '\t' */ - || ch == 0x000B /* '\v' */ - /* Categories Zl, Zp, and Zs without mention of "" */ - || (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && (unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 'p' - || (unicode_attributes[ch].category[1] == 's' - && !strstr (unicode_attributes[ch].decomposition, - ""))))); + || ch == 0x000C /* '\f' */ + || ch == 0x000A /* '\n' */ + || ch == 0x000D /* '\r' */ + || ch == 0x0009 /* '\t' */ + || ch == 0x000B /* '\v' */ + /* Categories Zl, Zp, and Zs without mention of "" */ + || (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p' + || (unicode_attributes[ch].category[1] == 's' + && !strstr (unicode_attributes[ch].decomposition, + ""))))); } static bool is_cntrl (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && (strcmp (unicode_attributes[ch].name, "") == 0 - /* Categories Zl and Zp */ - || (unicode_attributes[ch].category[0] == 'Z' - && (unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 'p')))); + && (strcmp (unicode_attributes[ch].name, "") == 0 + /* Categories Zl and Zp */ + || (unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p')))); } static bool @@ -4673,8 +5421,8 @@ is_xdigit (unsigned int ch) { #if 0 return is_digit (ch) - || (ch >= 0x0041 && ch <= 0x0046) - || (ch >= 0x0061 && ch <= 0x0066); + || (ch >= 0x0041 && ch <= 0x0046) + || (ch >= 0x0061 && ch <= 0x0066); #else /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 takes it away: @@ -4685,8 +5433,8 @@ is_xdigit (unsigned int ch) hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F */ return (ch >= 0x0030 && ch <= 0x0039) - || (ch >= 0x0041 && ch <= 0x0046) - || (ch >= 0x0061 && ch <= 0x0066); + || (ch >= 0x0041 && ch <= 0x0046) + || (ch >= 0x0061 && ch <= 0x0066); #endif } @@ -4694,20 +5442,20 @@ static bool is_graph (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && strcmp (unicode_attributes[ch].name, "") - && !is_space (ch)); + && strcmp (unicode_attributes[ch].name, "") + && !is_space (ch)); } static bool is_print (unsigned int ch) { return (unicode_attributes[ch].name != NULL - && strcmp (unicode_attributes[ch].name, "") - /* Categories Zl and Zp */ - && !(unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'Z' - && (unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 'p'))); + && strcmp (unicode_attributes[ch].name, "") + /* Categories Zl and Zp */ + && !(unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p'))); } static bool @@ -4715,7 +5463,7 @@ is_punct (unsigned int ch) { #if 0 return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'P'); + && unicode_attributes[ch].category[0] == 'P'); #else /* The traditional POSIX definition of punctuation is every graphic, non-alphanumeric character. */ @@ -4756,19 +5504,19 @@ is_combining (unsigned int ch) "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the PropList.txt file, so we take the latter definition. */ return (unicode_attributes[ch].name != NULL - && unicode_attributes[ch].category[0] == 'M' - && (unicode_attributes[ch].category[1] == 'n' - || unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'e')); + && unicode_attributes[ch].category[0] == 'M' + && (unicode_attributes[ch].category[1] == 'n' + || unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'e')); } static bool is_combining_level3 (unsigned int ch) { return is_combining (ch) - && !(unicode_attributes[ch].combining[0] != '\0' - && unicode_attributes[ch].combining[0] != '0' - && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200); + && !(unicode_attributes[ch].combining[0] != '\0' + && unicode_attributes[ch].combining[0] != '0' + && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200); } /* Return the UCS symbol string for a Unicode character. */ @@ -4797,7 +5545,7 @@ ucs_symbol_range (unsigned int low, unsigned int high) static void output_charclass (FILE *stream, const char *classname, - bool (*func) (unsigned int)) + bool (*func) (unsigned int)) { char table[0x110000]; unsigned int i; @@ -4814,39 +5562,39 @@ output_charclass (FILE *stream, const char *classname, for (i = 0; i < 0x110000; ) { if (!table[i]) - i++; + i++; else - { - unsigned int low, high; - char buf[25]; - - low = i; - do - i++; - while (i < 0x110000 && table[i]); - high = i - 1; - - if (low == high) - strcpy (buf, ucs_symbol (low)); - else - strcpy (buf, ucs_symbol_range (low, high)); - - if (need_semicolon) - { - fprintf (stream, ";"); - column++; - } - - if (column + strlen (buf) > max_column) - { - fprintf (stream, "/\n "); - column = 3; - } - - fprintf (stream, "%s", buf); - column += strlen (buf); - need_semicolon = true; - } + { + unsigned int low, high; + char buf[25]; + + low = i; + do + i++; + while (i < 0x110000 && table[i]); + high = i - 1; + + if (low == high) + strcpy (buf, ucs_symbol (low)); + else + strcpy (buf, ucs_symbol_range (low, high)); + + if (need_semicolon) + { + fprintf (stream, ";"); + column++; + } + + if (column + strlen (buf) > max_column) + { + fprintf (stream, "/\n "); + column = 3; + } + + fprintf (stream, "%s", buf); + column += strlen (buf); + need_semicolon = true; + } } fprintf (stream, "\n"); } @@ -4855,7 +5603,7 @@ output_charclass (FILE *stream, const char *classname, static void output_charmap (FILE *stream, const char *mapname, - unsigned int (*func) (unsigned int)) + unsigned int (*func) (unsigned int)) { char table[0x110000]; unsigned int i; @@ -4872,29 +5620,29 @@ output_charmap (FILE *stream, const char *mapname, for (i = 0; i < 0x110000; i++) if (table[i]) { - char buf[25+1]; - - strcpy (buf, "("); - strcat (buf, ucs_symbol (i)); - strcat (buf, ","); - strcat (buf, ucs_symbol (func (i))); - strcat (buf, ")"); - - if (need_semicolon) - { - fprintf (stream, ";"); - column++; - } - - if (column + strlen (buf) > max_column) - { - fprintf (stream, "/\n "); - column = 3; - } - - fprintf (stream, "%s", buf); - column += strlen (buf); - need_semicolon = true; + char buf[25+1]; + + strcpy (buf, "("); + strcat (buf, ucs_symbol (i)); + strcat (buf, ","); + strcat (buf, ucs_symbol (func (i))); + strcat (buf, ")"); + + if (need_semicolon) + { + fprintf (stream, ";"); + column++; + } + + if (column + strlen (buf) > max_column) + { + fprintf (stream, "/\n "); + column = 3; + } + + fprintf (stream, "%s", buf); + column += strlen (buf); + need_semicolon = true; } fprintf (stream, "\n"); } @@ -4925,7 +5673,7 @@ output_tables (const char *filename, const char *version) fprintf (stream, "comment_char %%\n"); fprintf (stream, "\n"); fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", - version); + version); fprintf (stream, "\n"); fprintf (stream, "LC_IDENTIFICATION\n"); @@ -4954,85 +5702,85 @@ output_tables (const char *filename, const char *version) for (ch = 0; ch < 0x110000; ch++) { /* toupper restriction: "Only characters specified for the keywords - lower and upper shall be specified. */ + lower and upper shall be specified. */ if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch))) - fprintf (stderr, - "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", - ucs_symbol (ch), ch, to_upper (ch)); + fprintf (stderr, + "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", + ucs_symbol (ch), ch, to_upper (ch)); /* tolower restriction: "Only characters specified for the keywords - lower and upper shall be specified. */ + lower and upper shall be specified. */ if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch))) - fprintf (stderr, - "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", - ucs_symbol (ch), ch, to_lower (ch)); + fprintf (stderr, + "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", + ucs_symbol (ch), ch, to_lower (ch)); /* alpha restriction: "Characters classified as either upper or lower - shall automatically belong to this class. */ + shall automatically belong to this class. */ if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch)) - fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch)); + fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch)); /* alpha restriction: "No character specified for the keywords cntrl, - digit, punct or space shall be specified." */ + digit, punct or space shall be specified." */ if (is_alpha (ch) && is_cntrl (ch)) - fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch)); + fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch)); if (is_alpha (ch) && is_digit (ch)) - fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch)); if (is_alpha (ch) && is_punct (ch)) - fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch)); + fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch)); if (is_alpha (ch) && is_space (ch)) - fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch)); + fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch)); /* space restriction: "No character specified for the keywords upper, - lower, alpha, digit, graph or xdigit shall be specified." - upper, lower, alpha already checked above. */ + lower, alpha, digit, graph or xdigit shall be specified." + upper, lower, alpha already checked above. */ if (is_space (ch) && is_digit (ch)) - fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch)); if (is_space (ch) && is_graph (ch)) - fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch)); + fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch)); if (is_space (ch) && is_xdigit (ch)) - fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch)); /* cntrl restriction: "No character specified for the keywords upper, - lower, alpha, digit, punct, graph, print or xdigit shall be - specified." upper, lower, alpha already checked above. */ + lower, alpha, digit, punct, graph, print or xdigit shall be + specified." upper, lower, alpha already checked above. */ if (is_cntrl (ch) && is_digit (ch)) - fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_punct (ch)) - fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch)); + fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_graph (ch)) - fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch)); + fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_print (ch)) - fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch)); + fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_xdigit (ch)) - fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch)); /* punct restriction: "No character specified for the keywords upper, - lower, alpha, digit, cntrl, xdigit or as the character shall - be specified." upper, lower, alpha, cntrl already checked above. */ + lower, alpha, digit, cntrl, xdigit or as the character shall + be specified." upper, lower, alpha, cntrl already checked above. */ if (is_punct (ch) && is_digit (ch)) - fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch)); if (is_punct (ch) && is_xdigit (ch)) - fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch)); + fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch)); if (is_punct (ch) && (ch == 0x0020)) - fprintf (stderr, "%s is punct\n", ucs_symbol (ch)); + fprintf (stderr, "%s is punct\n", ucs_symbol (ch)); /* graph restriction: "No character specified for the keyword cntrl - shall be specified." Already checked above. */ + shall be specified." Already checked above. */ /* print restriction: "No character specified for the keyword cntrl - shall be specified." Already checked above. */ + shall be specified." Already checked above. */ /* graph - print relation: differ only in the character. - How is this possible if there are more than one space character?! - I think susv2/xbd/locale.html should speak of "space characters", - not "space character". */ + How is this possible if there are more than one space character?! + I think susv2/xbd/locale.html should speak of "space characters", + not "space character". */ if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch))) - fprintf (stderr, - "%s is print but not graph|\n", ucs_symbol (ch)); + fprintf (stderr, + "%s is print but not graph|\n", ucs_symbol (ch)); if (!is_print (ch) && (is_graph (ch) || ch == 0x0020)) - fprintf (stderr, - "%s is graph| but not print\n", ucs_symbol (ch)); + fprintf (stderr, + "%s is graph| but not print\n", ucs_symbol (ch)); } fprintf (stream, "LC_CTYPE\n"); @@ -5063,714 +5811,1033 @@ output_tables (const char *filename, const char *version) } } -#endif - -/* ========================================================================= */ - -/* The width property from the EastAsianWidth.txt file. - Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ -const char * unicode_width[0x110000]; - -/* Stores in unicode_width[] the width property from the EastAsianWidth.txt - file. */ +#endif + +/* ========================================================================= */ + +/* The width property from the EastAsianWidth.txt file. + Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ +const char * unicode_width[0x110000]; + +/* Stores in unicode_width[] the width property from the EastAsianWidth.txt + file. */ +static void +fill_width (const char *width_filename) +{ + unsigned int i, j; + FILE *stream; + char field0[FIELDLEN]; + char field1[FIELDLEN]; + char field2[FIELDLEN]; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); + + stream = fopen (width_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", width_filename); + exit (1); + } + + for (;;) + { + int n; + int c; + + lineno++; + c = getc (stream); + if (c == EOF) + break; + if (c == '#') + { + do c = getc (stream); while (c != EOF && c != '\n'); + continue; + } + ungetc (c, stream); + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ' '); + n += getfield (stream, field2, '\n'); + if (n == 0) + break; + if (n != 3) + { + fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); + exit (1); + } + i = strtoul (field0, NULL, 16); + if (strstr (field0, "..") != NULL) + { + /* Deal with a range. */ + j = strtoul (strstr (field0, "..") + 2, NULL, 16); + for (; i <= j; i++) + unicode_width[i] = strdup (field1); + } + else + { + /* Single character line. */ + unicode_width[i] = strdup (field1); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", width_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Non-spacing attribute and width. */ + +/* The non-spacing attribute table consists of: + - Non-spacing characters; generated from PropList.txt or + "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" + - Format control characters; generated from + "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" + - Zero width characters; generated from + "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" + */ + +static bool +is_nonspacing (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (get_bidi_category (ch) == UC_BIDI_NSM + || is_category_Cc (ch) || is_category_Cf (ch) + || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0)); +} + +static void +output_nonspacing_property (const char *filename) +{ + FILE *stream; + int ind[0x110000 / 0x200]; + unsigned int i; + unsigned int i_max; + int next_ind; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + next_ind = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = false; + unsigned int ch; + + if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */ + for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++) + if (is_nonspacing (ch)) + { + nontrivial = true; + break; + } + if (nontrivial) + ind[i] = next_ind++; + else + ind[i] = -1; + } + + fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n", + next_ind); + i_max = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = (ind[i] >= 0); + + if (nontrivial) + { + unsigned int j; + + fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1); + for (j = 0; j < 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + unsigned int l; + unsigned char bits = 0; + + for (l = 0; l < 8; l++) + { + unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l; + + if (is_nonspacing (ch)) + bits |= 1 << l; + } + fprintf (stream, " 0x%02x%c", bits, + ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1); + } + i_max = i; + } + } + fprintf (stream, "};\n"); + + i_max = ((i_max + 8 - 1) / 8) * 8; + fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n", + i_max); + { + unsigned int j; + + for (j = 0; j < i_max / 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + i = j * 8 + k; + fprintf (stream, " %2d%c", ind[i], + j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1); + } + } + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */ +static char +symbolic_width (unsigned int ch) +{ + /* Test for unassigned character. */ + if (is_property_unassigned_code_value (ch)) + { + /* Unicode TR#11 section "Unassigned and Private-Use Characters". */ + if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */ + return 'A'; + if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */ + || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */ + || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */) + return '2'; + return 0; + } + else + { + /* Test for non-spacing or control character. */ + if (is_category_Cc (ch) && ch < 0x00A0) + return 0; + if (is_nonspacing (ch)) + return '0'; + /* Test for double-width character. */ + if (unicode_width[ch] != NULL + && (strcmp (unicode_width[ch], "W") == 0 + || strcmp (unicode_width[ch], "F") == 0)) + return '2'; + /* Test for half-width character. */ + if (unicode_width[ch] != NULL + && strcmp (unicode_width[ch], "H") == 0) + return '1'; + } + /* In ancient CJK encodings, Cyrillic and most other characters are + double-width as well. */ + if (ch >= 0x00A1 && ch < 0x10000) + return 'A'; + return '1'; +} + static void -fill_width (const char *width_filename) +output_width_property_test (const char *filename) { - unsigned int i, j; FILE *stream; - char field0[FIELDLEN]; - char field1[FIELDLEN]; - char field2[FIELDLEN]; - int lineno = 0; + unsigned int interval_start, interval_end, ch; + char interval_value; - for (i = 0; i < 0x110000; i++) - unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); - - stream = fopen (width_filename, "r"); + stream = fopen (filename, "w"); if (stream == NULL) { - fprintf (stderr, "error during fopen of '%s'\n", width_filename); + fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } - for (;;) + interval_value = 0; + interval_start = interval_end = 0; /* avoid GCC warning */ + for (ch = 0; ch < 0x110000; ch++) { - int n; - int c; - - lineno++; - c = getc (stream); - if (c == EOF) - break; - if (c == '#') - { - do c = getc (stream); while (c != EOF && c != '\n'); - continue; - } - ungetc (c, stream); - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ' '); - n += getfield (stream, field2, '\n'); - if (n == 0) - break; - if (n != 3) - { - fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); - exit (1); - } - i = strtoul (field0, NULL, 16); - if (strstr (field0, "..") != NULL) - { - /* Deal with a range. */ - j = strtoul (strstr (field0, "..") + 2, NULL, 16); - for (; i <= j; i++) - unicode_width[i] = strdup (field1); - } + char value = symbolic_width (ch); + if (value != 0) /* skip Cc control characters and unassigned characters */ + { + if (value == interval_value) + /* Extend the interval. */ + interval_end = ch; + else + { + /* Terminate the interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + /* Start a new interval. */ + interval_start = interval_end = ch; + interval_value = value; + } + } + } + /* Terminate the last interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); else - { - /* Single character line. */ - unicode_width[i] = strdup (field1); - } + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); } + if (ferror (stream) || fclose (stream)) { - fprintf (stderr, "error reading from '%s'\n", width_filename); + fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } -/* Line breaking classification. */ +/* ========================================================================= */ + +/* Line breaking classification. + Updated for Unicode TR #14 revision 26. */ enum { - /* Values >= 24 are resolved at run time. */ - LBP_BK = 24, /* mandatory break */ + /* Values >= 25 are resolved at run time. */ + LBP_BK = 25, /* mandatory break */ /*LBP_CR, carriage return - not used here because it's a DOSism */ /*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 25, /* attached characters and combining marks */ + LBP_CM = 26, /* attached characters and combining marks */ /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ /*LBP_SG, surrogates - not used here because they are not characters */ LBP_WJ = 0, /* word joiner */ - LBP_ZW = 26, /* zero width space */ + LBP_ZW = 27, /* zero width space */ LBP_GL = 1, /* non-breaking (glue) */ - LBP_SP = 27, /* space */ + LBP_SP = 28, /* space */ LBP_B2 = 2, /* break opportunity before and after */ LBP_BA = 3, /* break opportunity after */ LBP_BB = 4, /* break opportunity before */ LBP_HY = 5, /* hyphen */ - LBP_CB = 28, /* contingent break opportunity */ + LBP_CB = 29, /* contingent break opportunity */ LBP_CL = 6, /* closing punctuation */ - LBP_EX = 7, /* exclamation/interrogation */ - LBP_IN = 8, /* inseparable */ - LBP_NS = 9, /* non starter */ - LBP_OP = 10, /* opening punctuation */ - LBP_QU = 11, /* ambiguous quotation */ - LBP_IS = 12, /* infix separator (numeric) */ - LBP_NU = 13, /* numeric */ - LBP_PO = 14, /* postfix (numeric) */ - LBP_PR = 15, /* prefix (numeric) */ - LBP_SY = 16, /* symbols allowing breaks */ - LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */ - LBP_AL = 17, /* ordinary alphabetic and symbol characters */ - LBP_H2 = 18, /* Hangul LV syllable */ - LBP_H3 = 19, /* Hangul LVT syllable */ - LBP_ID = 20, /* ideographic */ - LBP_JL = 21, /* Hangul L Jamo */ - LBP_JV = 22, /* Hangul V Jamo */ - LBP_JT = 23, /* Hangul T Jamo */ - LBP_SA = 30, /* complex context (South East Asian) */ - LBP_XX = 31 /* unknown */ + LBP_CP = 7, /* closing parenthesis */ + LBP_EX = 8, /* exclamation/interrogation */ + LBP_IN = 9, /* inseparable */ + LBP_NS = 10, /* non starter */ + LBP_OP = 11, /* opening punctuation */ + LBP_QU = 12, /* ambiguous quotation */ + LBP_IS = 13, /* infix separator (numeric) */ + LBP_NU = 14, /* numeric */ + LBP_PO = 15, /* postfix (numeric) */ + LBP_PR = 16, /* prefix (numeric) */ + LBP_SY = 17, /* symbols allowing breaks */ + LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */ + LBP_AL = 18, /* ordinary alphabetic and symbol characters */ + LBP_H2 = 19, /* Hangul LV syllable */ + LBP_H3 = 20, /* Hangul LVT syllable */ + LBP_ID = 21, /* ideographic */ + LBP_JL = 22, /* Hangul L Jamo */ + LBP_JV = 23, /* Hangul V Jamo */ + LBP_JT = 24, /* Hangul T Jamo */ + LBP_SA = 31, /* complex context (South East Asian) */ + LBP_XX = 32 /* unknown */ }; /* Returns the line breaking classification for ch, as a bit mask. */ -static int +static int64_t get_lbp (unsigned int ch) { - int attr = 0; + int64_t attr = 0; if (unicode_attributes[ch].name != NULL) { /* mandatory break */ if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ - || ch == 0x000C /* form feed */ - || ch == 0x000B /* line tabulation */ - || ch == 0x2028 /* LINE SEPARATOR */ - || ch == 0x2029 /* PARAGRAPH SEPARATOR */) - attr |= 1 << LBP_BK; + || ch == 0x000C /* form feed */ + || ch == 0x000B /* line tabulation */ + || ch == 0x2028 /* LINE SEPARATOR */ + || ch == 0x2029 /* PARAGRAPH SEPARATOR */) + attr |= (int64_t) 1 << LBP_BK; if (ch == 0x2060 /* WORD JOINER */ - || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) - attr |= 1 << LBP_WJ; + || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) + attr |= (int64_t) 1 << LBP_WJ; /* zero width space */ if (ch == 0x200B /* ZERO WIDTH SPACE */) - attr |= 1 << LBP_ZW; + attr |= (int64_t) 1 << LBP_ZW; /* non-breaking (glue) */ if (ch == 0x00A0 /* NO-BREAK SPACE */ - || ch == 0x202F /* NARROW NO-BREAK SPACE */ - || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ - || ch == 0x034F /* COMBINING GRAPHEME JOINER */ - || ch == 0x2007 /* FIGURE SPACE */ - || ch == 0x2011 /* NON-BREAKING HYPHEN */ - || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ - || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ - || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ - || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */) - attr |= 1 << LBP_GL; + || ch == 0x202F /* NARROW NO-BREAK SPACE */ + || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ + || ch == 0x034F /* COMBINING GRAPHEME JOINER */ + || ch == 0x2007 /* FIGURE SPACE */ + || ch == 0x2011 /* NON-BREAKING HYPHEN */ + || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ + || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ + || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ + || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */ + || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */) + attr |= (int64_t) 1 << LBP_GL; /* space */ if (ch == 0x0020 /* SPACE */) - attr |= 1 << LBP_SP; + attr |= (int64_t) 1 << LBP_SP; /* break opportunity before and after */ if (ch == 0x2014 /* EM DASH */) - attr |= 1 << LBP_B2; + attr |= (int64_t) 1 << LBP_B2; /* break opportunity after */ - if (ch == 0x1680 /* OGHAM SPACE MARK */ - || ch == 0x2000 /* EN QUAD */ - || ch == 0x2001 /* EM QUAD */ - || ch == 0x2002 /* EN SPACE */ - || ch == 0x2003 /* EM SPACE */ - || ch == 0x2004 /* THREE-PER-EM SPACE */ - || ch == 0x2005 /* FOUR-PER-EM SPACE */ - || ch == 0x2006 /* SIX-PER-EM SPACE */ - || ch == 0x2008 /* PUNCTUATION SPACE */ - || ch == 0x2009 /* THIN SPACE */ - || ch == 0x200A /* HAIR SPACE */ - || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ - || ch == 0x0009 /* tab */ - || ch == 0x00AD /* SOFT HYPHEN */ - || ch == 0x058A /* ARMENIAN HYPHEN */ - || ch == 0x2010 /* HYPHEN */ - || ch == 0x2012 /* FIGURE DASH */ - || ch == 0x2013 /* EN DASH */ - || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ - || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ - || ch == 0x1361 /* ETHIOPIC WORDSPACE */ - || ch == 0x17D8 /* KHMER SIGN BEYYAL */ - || ch == 0x17DA /* KHMER SIGN KOOMUUT */ - || ch == 0x2027 /* HYPHENATION POINT */ - || ch == 0x007C /* VERTICAL LINE */ - || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ - || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ - || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ - || ch == 0x2056 /* THREE DOT PUNCTUATION */ - || ch == 0x2058 /* FOUR DOT PUNCTUATION */ - || ch == 0x2059 /* FIVE DOT PUNCTUATION */ - || ch == 0x205A /* TWO DOT PUNCTUATION */ - || ch == 0x205B /* FOUR DOT MARK */ - || ch == 0x205D /* TRICOLON */ - || ch == 0x205E /* VERTICAL FOUR DOTS */ - || ch == 0x2E19 /* PALM BRANCH */ - || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */ - || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */ - || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ - || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ - || ch == 0x2E30 /* RING POINT */ - || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ - || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ - || ch == 0x10102 /* AEGEAN CHECK MARK */ - || ch == 0x1039F /* UGARITIC WORD DIVIDER */ - || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ - || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ - || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ - || ch == 0x0964 /* DEVANAGARI DANDA */ - || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ - || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ - || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ - || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ - || ch == 0x104B /* MYANMAR SIGN SECTION */ - || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */ - || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */ - || ch == 0x17D4 /* KHMER SIGN KHAN */ - || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ - || ch == 0x1B5E /* BALINESE CARIK SIKI */ - || ch == 0x1B5F /* BALINESE CARIK PAREREN */ - || ch == 0xA8CE /* SAURASHTRA DANDA */ - || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */ - || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */ - || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */ - || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ - || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ - || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ - || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ - || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ - || ch == 0x0F85 /* TIBETAN MARK PALUTA */ - || ch == 0x0FBE /* TIBETAN KU RU KHA */ - || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ - || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ - || ch == 0x1804 /* MONGOLIAN COLON */ - || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ - || ch == 0x1B5A /* BALINESE PANTI */ - || ch == 0x1B5B /* BALINESE PAMADA */ - || ch == 0x1B5C /* BALINESE WINDU */ - || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ - || ch == 0x1B60 /* BALINESE PAMENENG */ - || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ - || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */ - || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */ - || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */ - || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ - || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ - || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */ - || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ - || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ - || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ - || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ - || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ - || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ - || ch == 0xA60D /* VAI COMMA */ - || ch == 0xA60F /* VAI QUESTION MARK */ - || ch == 0xA92E /* KAYAH LI SIGN CWI */ - || ch == 0xA92F /* KAYAH LI SIGN SHYA */ - || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */ - || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */ - || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */ - || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */ - || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ - || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ - || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ - || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) - attr |= 1 << LBP_BA; + if (/* Breaking Spaces */ + ch == 0x1680 /* OGHAM SPACE MARK */ + || ch == 0x2000 /* EN QUAD */ + || ch == 0x2001 /* EM QUAD */ + || ch == 0x2002 /* EN SPACE */ + || ch == 0x2003 /* EM SPACE */ + || ch == 0x2004 /* THREE-PER-EM SPACE */ + || ch == 0x2005 /* FOUR-PER-EM SPACE */ + || ch == 0x2006 /* SIX-PER-EM SPACE */ + || ch == 0x2008 /* PUNCTUATION SPACE */ + || ch == 0x2009 /* THIN SPACE */ + || ch == 0x200A /* HAIR SPACE */ + || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ + /* Tabs */ + || ch == 0x0009 /* tab */ + /* Conditional Hyphens */ + || ch == 0x00AD /* SOFT HYPHEN */ + /* Breaking Hyphens */ + || ch == 0x058A /* ARMENIAN HYPHEN */ + || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */ + || ch == 0x2010 /* HYPHEN */ + || ch == 0x2012 /* FIGURE DASH */ + || ch == 0x2013 /* EN DASH */ + /* Visible Word Dividers */ + || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ + || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ + || ch == 0x1361 /* ETHIOPIC WORDSPACE */ + || ch == 0x17D8 /* KHMER SIGN BEYYAL */ + || ch == 0x17DA /* KHMER SIGN KOOMUUT */ + || ch == 0x2027 /* HYPHENATION POINT */ + || ch == 0x007C /* VERTICAL LINE */ + /* Historic Word Separators */ + || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ + || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ + || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ + || ch == 0x2056 /* THREE DOT PUNCTUATION */ + || ch == 0x2058 /* FOUR DOT PUNCTUATION */ + || ch == 0x2059 /* FIVE DOT PUNCTUATION */ + || ch == 0x205A /* TWO DOT PUNCTUATION */ + || ch == 0x205B /* FOUR DOT MARK */ + || ch == 0x205D /* TRICOLON */ + || ch == 0x205E /* VERTICAL FOUR DOTS */ + || ch == 0x2E19 /* PALM BRANCH */ + || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ + || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ + || ch == 0x2E30 /* RING POINT */ + || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */ + || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ + || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ + || ch == 0x10102 /* AEGEAN CHECK MARK */ + || ch == 0x1039F /* UGARITIC WORD DIVIDER */ + || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ + || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ + || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ + /* Dandas */ + || ch == 0x0964 /* DEVANAGARI DANDA */ + || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ + || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ + || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ + || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ + || ch == 0x104B /* MYANMAR SIGN SECTION */ + || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */ + || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */ + || ch == 0x17D4 /* KHMER SIGN KHAN */ + || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ + || ch == 0x1B5E /* BALINESE CARIK SIKI */ + || ch == 0x1B5F /* BALINESE CARIK PAREREN */ + || ch == 0xA8CE /* SAURASHTRA DANDA */ + || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */ + || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */ + || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */ + || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ + || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ + || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ + /* Tibetan */ + || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ + || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ + || ch == 0x0F85 /* TIBETAN MARK PALUTA */ + || ch == 0x0FBE /* TIBETAN KU RU KHA */ + || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ + || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ + /* Other Terminating Punctuation */ + || ch == 0x1804 /* MONGOLIAN COLON */ + || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ + || ch == 0x1B5A /* BALINESE PANTI */ + || ch == 0x1B5B /* BALINESE PAMADA */ + || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ + || ch == 0x1B60 /* BALINESE PAMENENG */ + || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ + || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */ + || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */ + || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */ + || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ + || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ + || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */ + || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ + || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ + || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ + || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ + || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ + || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ + || ch == 0xA60D /* VAI COMMA */ + || ch == 0xA60F /* VAI QUESTION MARK */ + || ch == 0xA92E /* KAYAH LI SIGN CWI */ + || ch == 0xA92F /* KAYAH LI SIGN SHYA */ + || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */ + || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */ + || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */ + || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */ + || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ + || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */ + || ch == 0xA4FE /* LISU PUNCTUATION COMMA */ + || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */ + || ch == 0xA6F3 /* BAMUM FULL STOP */ + || ch == 0xA6F4 /* BAMUM COLON */ + || ch == 0xA6F5 /* BAMUM COMMA */ + || ch == 0xA6F6 /* BAMUM SEMICOLON */ + || ch == 0xA6F7 /* BAMUM QUESTION MARK */ + || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */ + || ch == 0xA9C8 /* JAVANESE PADA LINGSA */ + || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */ + || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */ + || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */ + || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */ + || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */ + || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */ + || ch == 0x11047 /* BRAHMI DANDA */ + || ch == 0x11048 /* BRAHMI DOUBLE DANDA */ + || ch == 0x110BE /* KAITHI SECTION MARK */ + || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */ + || ch == 0x110C0 /* KAITHI DANDA */ + || ch == 0x110C1 /* KAITHI DOUBLE DANDA */ + || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ + || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ + || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) + attr |= (int64_t) 1 << LBP_BA; /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ - || ch == 0x1FFD /* GREEK OXIA */ - || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ - || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ - || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ - || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ - || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */ - || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */ - || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ - || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ - || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ - || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ - || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ - || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */ - || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */ - || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */ - || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ - || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ - || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) - attr |= 1 << LBP_BB; + || ch == 0x1FFD /* GREEK OXIA */ + || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ + || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ + || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ + || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ + || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */ + || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */ + || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ + || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ + || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ + || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ + || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ + || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */ + || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */ + || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */ + || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ + || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ + || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) + attr |= (int64_t) 1 << LBP_BB; /* hyphen */ if (ch == 0x002D /* HYPHEN-MINUS */) - attr |= 1 << LBP_HY; + attr |= (int64_t) 1 << LBP_HY; /* contingent break opportunity */ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) - attr |= 1 << LBP_CB; + attr |= (int64_t) 1 << LBP_CB; + + /* closing parenthesis */ + if (ch == 0x0029 /* RIGHT PARENTHESIS */ + || ch == 0x005D /* RIGHT SQUARE BRACKET */) + attr |= (int64_t) 1 << LBP_CP; /* closing punctuation */ if ((unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'e') - || ch == 0x3001 /* IDEOGRAPHIC COMMA */ - || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ - || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ - || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */ - || ch == 0xFE50 /* SMALL COMMA */ - || ch == 0xFE52 /* SMALL FULL STOP */ - || ch == 0xFF0C /* FULLWIDTH COMMA */ - || ch == 0xFF0E /* FULLWIDTH FULL STOP */ - || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ - || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */) - attr |= 1 << LBP_CL; + && unicode_attributes[ch].category[1] == 'e' + && !(attr & ((int64_t) 1 << LBP_CP))) + || ch == 0x3001 /* IDEOGRAPHIC COMMA */ + || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ + || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ + || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */ + || ch == 0xFE50 /* SMALL COMMA */ + || ch == 0xFE52 /* SMALL FULL STOP */ + || ch == 0xFF0C /* FULLWIDTH COMMA */ + || ch == 0xFF0E /* FULLWIDTH FULL STOP */ + || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ + || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */ + || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */ + || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */ + || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */ + || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */ + || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */ + || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */ + || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */) + attr |= (int64_t) 1 << LBP_CL; /* exclamation/interrogation */ if (ch == 0x0021 /* EXCLAMATION MARK */ - || ch == 0x003F /* QUESTION MARK */ - || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ - || ch == 0x061B /* ARABIC SEMICOLON */ - || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ - || ch == 0x061F /* ARABIC QUESTION MARK */ - || ch == 0x06D4 /* ARABIC FULL STOP */ - || ch == 0x07F9 /* NKO EXCLAMATION MARK */ - || ch == 0x0F0D /* TIBETAN MARK SHAD */ - || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ - || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ - || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ - || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ - || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ - || ch == 0x1802 /* MONGOLIAN COMMA */ - || ch == 0x1803 /* MONGOLIAN FULL STOP */ - || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ - || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ - || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ - || ch == 0x1945 /* LIMBU QUESTION MARK */ - || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */ - || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ - || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ - || ch == 0x2CFE /* COPTIC FULL STOP */ - || ch == 0x2E2E /* REVERSED QUESTION MARK */ -#if REVISION_22 - || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ -#endif - || ch == 0xA60E /* VAI FULL STOP */ - || ch == 0xA876 /* PHAGS-PA MARK SHAD */ - || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ - || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ - || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */ - || ch == 0xFE56 /* SMALL QUESTION MARK */ - || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ - || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ - || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) - attr |= 1 << LBP_EX; + || ch == 0x003F /* QUESTION MARK */ + || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ + || ch == 0x061B /* ARABIC SEMICOLON */ + || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ + || ch == 0x061F /* ARABIC QUESTION MARK */ + || ch == 0x06D4 /* ARABIC FULL STOP */ + || ch == 0x07F9 /* NKO EXCLAMATION MARK */ + || ch == 0x0F0D /* TIBETAN MARK SHAD */ + || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ + || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ + || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ + || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ + || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ + || ch == 0x1802 /* MONGOLIAN COMMA */ + || ch == 0x1803 /* MONGOLIAN FULL STOP */ + || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ + || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ + || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ + || ch == 0x1945 /* LIMBU QUESTION MARK */ + || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */ + || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ + || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ + || ch == 0x2CFE /* COPTIC FULL STOP */ + || ch == 0x2E2E /* REVERSED QUESTION MARK */ + || ch == 0xA60E /* VAI FULL STOP */ + || ch == 0xA876 /* PHAGS-PA MARK SHAD */ + || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ + || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ + || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */ + || ch == 0xFE56 /* SMALL QUESTION MARK */ + || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ + || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ + || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) + attr |= (int64_t) 1 << LBP_EX; /* inseparable */ if (ch == 0x2024 /* ONE DOT LEADER */ - || ch == 0x2025 /* TWO DOT LEADER */ - || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ - || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) - attr |= 1 << LBP_IN; + || ch == 0x2025 /* TWO DOT LEADER */ + || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ + || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) + attr |= (int64_t) 1 << LBP_IN; /* non starter */ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ - || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ - || ch == 0x203D /* INTERROBANG */ - || ch == 0x2047 /* DOUBLE QUESTION MARK */ - || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ - || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ - || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ - || ch == 0x301C /* WAVE DASH */ - || ch == 0x303C /* MASU MARK */ - || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */ - || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ - || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ - || ch == 0x309D /* HIRAGANA ITERATION MARK */ - || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ - || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */ - || ch == 0x30FB /* KATAKANA MIDDLE DOT */ - || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ - || ch == 0x30FD /* KATAKANA ITERATION MARK */ - || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */ - || ch == 0xA015 /* YI SYLLABLE WU */ - || ch == 0xFE54 /* SMALL SEMICOLON */ - || ch == 0xFE55 /* SMALL COLON */ - || ch == 0xFF1A /* FULLWIDTH COLON */ - || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ - || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ - || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ - || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ - || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ - || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL - || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) - attr |= 1 << LBP_NS; + || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ + || ch == 0x203D /* INTERROBANG */ + || ch == 0x2047 /* DOUBLE QUESTION MARK */ + || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ + || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ + || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ + || ch == 0x301C /* WAVE DASH */ + || ch == 0x303C /* MASU MARK */ + || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */ + || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ + || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ + || ch == 0x309D /* HIRAGANA ITERATION MARK */ + || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ + || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */ + || ch == 0x30FB /* KATAKANA MIDDLE DOT */ + || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + || ch == 0x30FD /* KATAKANA ITERATION MARK */ + || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */ + || ch == 0xA015 /* YI SYLLABLE WU */ + || ch == 0xFE54 /* SMALL SEMICOLON */ + || ch == 0xFE55 /* SMALL COLON */ + || ch == 0xFF1A /* FULLWIDTH COLON */ + || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ + || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ + || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ + || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ + || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL + || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) + attr |= (int64_t) 1 << LBP_NS; /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 's') - || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ - || ch == 0x00BF /* INVERTED QUESTION MARK */ - || ch == 0x2E18 /* INVERTED INTERROBANG */) - attr |= 1 << LBP_OP; + && unicode_attributes[ch].category[1] == 's') + || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ + || ch == 0x00BF /* INVERTED QUESTION MARK */ + || ch == 0x2E18 /* INVERTED INTERROBANG */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */ + || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */ + || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */ + || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */ + || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */ + || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */) + attr |= (int64_t) 1 << LBP_OP; /* ambiguous quotation */ if ((unicode_attributes[ch].category[0] == 'P' - && (unicode_attributes[ch].category[1] == 'f' - || unicode_attributes[ch].category[1] == 'i')) - || ch == 0x0022 /* QUOTATION MARK */ - || ch == 0x0027 /* APOSTROPHE */ - || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */ - || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */ - || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */ - || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */ - || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */ - || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */ - || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */ - || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ - || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ - || ch == 0x2E0B /* RAISED SQUARE */) - attr |= 1 << LBP_QU; + && (unicode_attributes[ch].category[1] == 'f' + || unicode_attributes[ch].category[1] == 'i')) + || ch == 0x0022 /* QUOTATION MARK */ + || ch == 0x0027 /* APOSTROPHE */ + || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */ + || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */ + || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */ + || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ + || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ + || ch == 0x2E0B /* RAISED SQUARE */) + attr |= (int64_t) 1 << LBP_QU; /* infix separator (numeric) */ if (ch == 0x002C /* COMMA */ - || ch == 0x002E /* FULL STOP */ - || ch == 0x003A /* COLON */ - || ch == 0x003B /* SEMICOLON */ - || ch == 0x037E /* GREEK QUESTION MARK */ - || ch == 0x0589 /* ARMENIAN FULL STOP */ - || ch == 0x060C /* ARABIC COMMA */ - || ch == 0x060D /* ARABIC DATE SEPARATOR */ - || ch == 0x07F8 /* NKO COMMA */ - || ch == 0x2044 /* FRACTION SLASH */ - || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ - || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ - || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) - attr |= 1 << LBP_IS; + || ch == 0x002E /* FULL STOP */ + || ch == 0x003A /* COLON */ + || ch == 0x003B /* SEMICOLON */ + || ch == 0x037E /* GREEK QUESTION MARK */ + || ch == 0x0589 /* ARMENIAN FULL STOP */ + || ch == 0x060C /* ARABIC COMMA */ + || ch == 0x060D /* ARABIC DATE SEPARATOR */ + || ch == 0x07F8 /* NKO COMMA */ + || ch == 0x2044 /* FRACTION SLASH */ + || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ + || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ + || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) + attr |= (int64_t) 1 << LBP_IS; /* numeric */ if ((unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'd' - && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) - || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ - || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) - attr |= 1 << LBP_NU; + && unicode_attributes[ch].category[1] == 'd' + && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) + || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ + || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) + attr |= (int64_t) 1 << LBP_NU; /* postfix (numeric) */ if (ch == 0x0025 /* PERCENT SIGN */ - || ch == 0x00A2 /* CENT SIGN */ - || ch == 0x00B0 /* DEGREE SIGN */ - || ch == 0x060B /* AFGHANI SIGN */ - || ch == 0x066A /* ARABIC PERCENT SIGN */ - || ch == 0x2030 /* PER MILLE SIGN */ - || ch == 0x2031 /* PER TEN THOUSAND SIGN */ - || ch == 0x2032 /* PRIME */ - || ch == 0x2033 /* DOUBLE PRIME */ - || ch == 0x2034 /* TRIPLE PRIME */ - || ch == 0x2035 /* REVERSED PRIME */ - || ch == 0x2036 /* REVERSED DOUBLE PRIME */ - || ch == 0x2037 /* REVERSED TRIPLE PRIME */ - || ch == 0x20A7 /* PESETA SIGN */ - || ch == 0x2103 /* DEGREE CELSIUS */ - || ch == 0x2109 /* DEGREE FAHRENHEIT */ - || ch == 0xFDFC /* RIAL SIGN */ - || ch == 0xFE6A /* SMALL PERCENT SIGN */ - || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ - || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */ - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ - || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ - || ch == 0x0D79 /* MALAYALAM DATE MARK */) - attr |= 1 << LBP_PO; + || ch == 0x00A2 /* CENT SIGN */ + || ch == 0x00B0 /* DEGREE SIGN */ + || ch == 0x060B /* AFGHANI SIGN */ + || ch == 0x066A /* ARABIC PERCENT SIGN */ + || ch == 0x2030 /* PER MILLE SIGN */ + || ch == 0x2031 /* PER TEN THOUSAND SIGN */ + || ch == 0x2032 /* PRIME */ + || ch == 0x2033 /* DOUBLE PRIME */ + || ch == 0x2034 /* TRIPLE PRIME */ + || ch == 0x2035 /* REVERSED PRIME */ + || ch == 0x2036 /* REVERSED DOUBLE PRIME */ + || ch == 0x2037 /* REVERSED TRIPLE PRIME */ + || ch == 0x20A7 /* PESETA SIGN */ + || ch == 0x2103 /* DEGREE CELSIUS */ + || ch == 0x2109 /* DEGREE FAHRENHEIT */ + || ch == 0xFDFC /* RIAL SIGN */ + || ch == 0xFE6A /* SMALL PERCENT SIGN */ + || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ + || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ + || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ + || ch == 0x09F2 /* BENGALI RUPEE MARK */ + || ch == 0x09F3 /* BENGALI RUPEE SIGN */ + || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */ + || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */ + || ch == 0xA838 /* NORTH INDIC RUPEE MARK */) + attr |= (int64_t) 1 << LBP_PO; /* prefix (numeric) */ if ((unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'c') - || ch == 0x002B /* PLUS SIGN */ - || ch == 0x005C /* REVERSE SOLIDUS */ - || ch == 0x00B1 /* PLUS-MINUS SIGN */ - || ch == 0x2116 /* NUMERO SIGN */ - || ch == 0x2212 /* MINUS SIGN */ - || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) - if (!(attr & (1 << LBP_PO))) - attr |= 1 << LBP_PR; + && unicode_attributes[ch].category[1] == 'c') + || ch == 0x002B /* PLUS SIGN */ + || ch == 0x005C /* REVERSE SOLIDUS */ + || ch == 0x00B1 /* PLUS-MINUS SIGN */ + || ch == 0x2116 /* NUMERO SIGN */ + || ch == 0x2212 /* MINUS SIGN */ + || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) + if (!(attr & ((int64_t) 1 << LBP_PO))) + attr |= (int64_t) 1 << LBP_PR; /* symbols allowing breaks */ if (ch == 0x002F /* SOLIDUS */) - attr |= 1 << LBP_SY; + attr |= (int64_t) 1 << LBP_SY; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0) - attr |= 1 << LBP_H2; + attr |= (int64_t) 1 << LBP_H2; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0) - attr |= 1 << LBP_H3; + attr |= (int64_t) 1 << LBP_H3; - if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F) - attr |= 1 << LBP_JL; + if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C)) + attr |= (int64_t) 1 << LBP_JL; - if (ch >= 0x1160 && ch <= 0x11A2) - attr |= 1 << LBP_JV; + if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6)) + attr |= (int64_t) 1 << LBP_JV; - if (ch >= 0x11A8 && ch <= 0x11F9) - attr |= 1 << LBP_JT; + if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) + attr |= (int64_t) 1 << LBP_JT; /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' - && unicode_attributes[ch].category[1] == 'f') - || (unicode_attributes[ch].category[0] == 'L' - && (unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'o')) - || (unicode_attributes[ch].category[0] == 'M' - && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'n')) - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ - || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ - || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ - || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) - && ((ch >= 0x0E00 && ch <= 0x0EFF) - || (ch >= 0x1000 && ch <= 0x109F) - || (ch >= 0x1780 && ch <= 0x17FF) - || (ch >= 0x1950 && ch <= 0x19DF))) - attr |= 1 << LBP_SA; + && unicode_attributes[ch].category[1] == 'f') + || (unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'M' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'n') + && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */) + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ + || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ + || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */ + || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ + || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */ + || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */ + || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */ + || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */) + && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */ + || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */ + || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */ + || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */ + || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */ + || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */)) + attr |= (int64_t) 1 << LBP_SA; /* attached characters and combining marks */ if ((unicode_attributes[ch].category[0] == 'M' - && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'e' - || unicode_attributes[ch].category[1] == 'n')) - || (unicode_attributes[ch].category[0] == 'C' - && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'f'))) - if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW)))) - attr |= 1 << LBP_CM; + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'e' + || unicode_attributes[ch].category[1] == 'n')) + || (unicode_attributes[ch].category[0] == 'C' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'f') + && ch != 0x110BD /* KAITHI NUMBER SIGN */)) + if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW)))) + attr |= (int64_t) 1 << LBP_CM; /* ideographic */ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ - || ch == 0x3000 /* IDEOGRAPHIC SPACE */ - || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ - || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ - || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ - || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ - || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ - || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ - || ch == 0xFE62 /* SMALL PLUS SIGN */ - || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ - || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ - || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ - || ch == 0xFE66 /* SMALL EQUALS SIGN */ - || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ - || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ - || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ - || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL - || (ch >= 0x3000 && ch <= 0x33FF - && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ - || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ - || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ - || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ - || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ - || ch == 0xFE45 /* SESAME DOT */ - || ch == 0xFE46 /* WHITE SESAME DOT */ - || ch == 0xFE49 /* DASHED OVERLINE */ - || ch == 0xFE4A /* CENTRELINE OVERLINE */ - || ch == 0xFE4B /* WAVY OVERLINE */ - || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ - || ch == 0xFE4D /* DASHED LOW LINE */ - || ch == 0xFE4E /* CENTRELINE LOW LINE */ - || ch == 0xFE4F /* WAVY LOW LINE */ - || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ - || ch == 0xFE58 /* SMALL EM DASH */ - || ch == 0xFE5F /* SMALL NUMBER SIGN */ - || ch == 0xFE60 /* SMALL AMPERSAND */ - || ch == 0xFE61 /* SMALL ASTERISK */ - || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ - || ch == 0xFE6B /* SMALL COMMERCIAL AT */ - || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ - || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ - || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ - || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ - || ch == 0xFF0A /* FULLWIDTH ASTERISK */ - || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ - || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ - || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ - || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ - || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ - || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ - || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ - || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ - || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ - || ch == 0xFF3F /* FULLWIDTH LOW LINE */ - || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ - || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ - || ch == 0xFF5E /* FULLWIDTH TILDE */ - || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ - || ch == 0xFFE3 /* FULLWIDTH MACRON */ - || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */) - if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM)))) - { - /* ambiguous (ideograph) ? */ - if ((unicode_width[ch] != NULL - && unicode_width[ch][0] == 'A' - && ch >= 0x2000) - || ch == 0x24EA /* CIRCLED DIGIT ZERO */ - || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) - attr |= 1 << LBP_AI; - else - attr |= 1 << LBP_ID; - } + || ch == 0x3000 /* IDEOGRAPHIC SPACE */ + || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ + || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */ + || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ + || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ + || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ + || ch == 0xFE62 /* SMALL PLUS SIGN */ + || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ + || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ + || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ + || ch == 0xFE66 /* SMALL EQUALS SIGN */ + || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ + || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ + || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ + || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL + || (ch >= 0x3000 && ch <= 0x33FF + && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP)))) + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ + || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ + || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ + || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ + || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ + || ch == 0xFE45 /* SESAME DOT */ + || ch == 0xFE46 /* WHITE SESAME DOT */ + || ch == 0xFE49 /* DASHED OVERLINE */ + || ch == 0xFE4A /* CENTRELINE OVERLINE */ + || ch == 0xFE4B /* WAVY OVERLINE */ + || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ + || ch == 0xFE4D /* DASHED LOW LINE */ + || ch == 0xFE4E /* CENTRELINE LOW LINE */ + || ch == 0xFE4F /* WAVY LOW LINE */ + || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ + || ch == 0xFE58 /* SMALL EM DASH */ + || ch == 0xFE5F /* SMALL NUMBER SIGN */ + || ch == 0xFE60 /* SMALL AMPERSAND */ + || ch == 0xFE61 /* SMALL ASTERISK */ + || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ + || ch == 0xFE6B /* SMALL COMMERCIAL AT */ + || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ + || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ + || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ + || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ + || ch == 0xFF0A /* FULLWIDTH ASTERISK */ + || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ + || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ + || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ + || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ + || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ + || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ + || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ + || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ + || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ + || ch == 0xFF3F /* FULLWIDTH LOW LINE */ + || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ + || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ + || ch == 0xFF5E /* FULLWIDTH TILDE */ + || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ + || ch == 0xFFE3 /* FULLWIDTH MACRON */ + || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */ + || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */ + || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */ + || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */ + || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */) + if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM)))) + { + /* ambiguous (ideograph) ? */ + if ((unicode_width[ch] != NULL + && unicode_width[ch][0] == 'A' + && ch >= 0x2000) + || ch == 0x24EA /* CIRCLED DIGIT ZERO */ + || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) + attr |= (int64_t) 1 << LBP_AI; + else + attr |= (int64_t) 1 << LBP_ID; + } /* ordinary alphabetic and symbol characters */ if ((unicode_attributes[ch].category[0] == 'L' - && (unicode_attributes[ch].category[1] == 'u' - || unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 't' - || unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'o')) - || (unicode_attributes[ch].category[0] == 'S' - && (unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'k' - || unicode_attributes[ch].category[1] == 'o')) - || (unicode_attributes[ch].category[0] == 'N' - && (unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 'o')) - || (unicode_attributes[ch].category[0] == 'P' - && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'd' - || unicode_attributes[ch].category[1] == 'o')) - || ch == 0x0600 /* ARABIC NUMBER SIGN */ - || ch == 0x0601 /* ARABIC SIGN SANAH */ - || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */ - || ch == 0x0603 /* ARABIC SIGN SAFHA */ - || ch == 0x06DD /* ARABIC END OF AYAH */ - || ch == 0x070F /* SYRIAC ABBREVIATION MARK */ - || ch == 0x2061 /* FUNCTION APPLICATION */ - || ch == 0x2062 /* INVISIBLE TIMES */ - || ch == 0x2063 /* INVISIBLE SEPARATOR */ - || ch == 0x2064 /* INVISIBLE PLUS */) - if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID)))) - { - /* ambiguous (alphabetic) ? */ - if ((unicode_width[ch] != NULL - && unicode_width[ch][0] == 'A' - && ch >= 0x2000 - /* Extra exceptions for compatibility with Unicode LineBreak.txt. */ - && ch != 0x2022 /* BULLET */ - && ch != 0x203E /* OVERLINE */ - && ch != 0x2126 /* OHM SIGN */ - && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */ - && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */ - && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */ - && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */ - && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */ - && ch != 0x21E7 /* UPWARDS WHITE ARROW */ - && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ - && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) -#if !REVISION_22 - || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ - || ch == 0x00A7 /* SECTION SIGN */ - || ch == 0x00A8 /* DIAERESIS */ - || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ - || ch == 0x00B2 /* SUPERSCRIPT TWO */ - || ch == 0x00B3 /* SUPERSCRIPT THREE */ - || ch == 0x00B6 /* PILCROW SIGN */ - || ch == 0x00B7 /* MIDDLE DOT */ - || ch == 0x00B8 /* CEDILLA */ - || ch == 0x00B9 /* SUPERSCRIPT ONE */ - || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */ - || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ - || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ - || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ - || ch == 0x00BF /* INVERTED QUESTION MARK */ - || ch == 0x00D7 /* MULTIPLICATION SIGN */ - || ch == 0x00F7 /* DIVISION SIGN */ - || ch == 0x02C7 /* CARON */ - || ch == 0x02C9 /* MODIFIER LETTER MACRON */ - || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */ - || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */ - || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */ - || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */ - || ch == 0x02D8 /* BREVE */ - || ch == 0x02D9 /* DOT ABOVE */ - || ch == 0x02DA /* RING ABOVE */ - || ch == 0x02DB /* OGONEK */ - || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ -#endif - || ch == 0x24EA /* CIRCLED DIGIT ZERO */ - || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */ - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */ - || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ - || ch == 0x2616 /* WHITE SHOGI PIECE */ - || ch == 0x2617 /* BLACK SHOGI PIECE */) - attr |= 1 << LBP_AI; - else - attr |= 1 << LBP_AL; - attr &= ~(1 << LBP_CM); - } + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't' + || unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'S' + && (unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'k' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'N' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'P' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'd' + || unicode_attributes[ch].category[1] == 'o')) + || ch == 0x0600 /* ARABIC NUMBER SIGN */ + || ch == 0x0601 /* ARABIC SIGN SANAH */ + || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */ + || ch == 0x0603 /* ARABIC SIGN SAFHA */ + || ch == 0x06DD /* ARABIC END OF AYAH */ + || ch == 0x070F /* SYRIAC ABBREVIATION MARK */ + || ch == 0x2061 /* FUNCTION APPLICATION */ + || ch == 0x2062 /* INVISIBLE TIMES */ + || ch == 0x2063 /* INVISIBLE SEPARATOR */ + || ch == 0x2064 /* INVISIBLE PLUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x110BD /* KAITHI NUMBER SIGN */) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) + { + /* ambiguous (alphabetic) ? */ + if ((unicode_width[ch] != NULL + && unicode_width[ch][0] == 'A' + && ch >= 0x2000 + /* Extra exceptions for compatibility with Unicode LineBreak.txt. */ + && ch != 0x2022 /* BULLET */ + && ch != 0x203E /* OVERLINE */ + && ch != 0x2126 /* OHM SIGN */ + && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */ + && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */ + && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */ + && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */ + && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */ + && ch != 0x21E7 /* UPWARDS WHITE ARROW */ + && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ + && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) + || ch == 0x00A7 /* SECTION SIGN */ + || ch == 0x00A8 /* DIAERESIS */ + || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ + || ch == 0x00B2 /* SUPERSCRIPT TWO */ + || ch == 0x00B3 /* SUPERSCRIPT THREE */ + || ch == 0x00B6 /* PILCROW SIGN */ + || ch == 0x00B7 /* MIDDLE DOT */ + || ch == 0x00B8 /* CEDILLA */ + || ch == 0x00B9 /* SUPERSCRIPT ONE */ + || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */ + || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ + || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ + || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ + || ch == 0x00D7 /* MULTIPLICATION SIGN */ + || ch == 0x00F7 /* DIVISION SIGN */ + || ch == 0x02C7 /* CARON */ + || ch == 0x02C9 /* MODIFIER LETTER MACRON */ + || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */ + || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */ + || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */ + || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */ + || ch == 0x02D8 /* BREVE */ + || ch == 0x02D9 /* DOT ABOVE */ + || ch == 0x02DA /* RING ABOVE */ + || ch == 0x02DB /* OGONEK */ + || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ + || ch == 0x24EA /* CIRCLED DIGIT ZERO */ + || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */ + || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ + || ch == 0x2616 /* WHITE SHOGI PIECE */ + || ch == 0x2617 /* BLACK SHOGI PIECE */) + attr |= (int64_t) 1 << LBP_AI; + else + attr |= (int64_t) 1 << LBP_AL; + attr &= ~((int64_t) 1 << LBP_CM); + } + } + else + { + /* Unassigned character. */ + if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ + || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ + || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + attr |= (int64_t) 1 << LBP_ID; } if (attr == 0) /* unknown */ - attr |= 1 << LBP_XX; + attr |= (int64_t) 1 << LBP_XX; return attr; } @@ -5783,47 +6850,48 @@ debug_output_lbp (FILE *stream) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); - if (attr != 1 << LBP_XX) - { - fprintf (stream, "0x%04X", i); + int64_t attr = get_lbp (i); + if (attr != (int64_t) 1 << LBP_XX) + { + fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ - if (attr & (1 << bit)) fprintf (stream, " " #bit); - PRINT_BIT(attr,LBP_BK); - PRINT_BIT(attr,LBP_CM); - PRINT_BIT(attr,LBP_WJ); - PRINT_BIT(attr,LBP_ZW); - PRINT_BIT(attr,LBP_GL); - PRINT_BIT(attr,LBP_SP); - PRINT_BIT(attr,LBP_B2); - PRINT_BIT(attr,LBP_BA); - PRINT_BIT(attr,LBP_BB); - PRINT_BIT(attr,LBP_HY); - PRINT_BIT(attr,LBP_CB); - PRINT_BIT(attr,LBP_CL); - PRINT_BIT(attr,LBP_EX); - PRINT_BIT(attr,LBP_IN); - PRINT_BIT(attr,LBP_NS); - PRINT_BIT(attr,LBP_OP); - PRINT_BIT(attr,LBP_QU); - PRINT_BIT(attr,LBP_IS); - PRINT_BIT(attr,LBP_NU); - PRINT_BIT(attr,LBP_PO); - PRINT_BIT(attr,LBP_PR); - PRINT_BIT(attr,LBP_SY); - PRINT_BIT(attr,LBP_AI); - PRINT_BIT(attr,LBP_AL); - PRINT_BIT(attr,LBP_H2); - PRINT_BIT(attr,LBP_H3); - PRINT_BIT(attr,LBP_ID); - PRINT_BIT(attr,LBP_JL); - PRINT_BIT(attr,LBP_JV); - PRINT_BIT(attr,LBP_JT); - PRINT_BIT(attr,LBP_SA); - PRINT_BIT(attr,LBP_XX); + if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit); + PRINT_BIT(attr,LBP_BK); + PRINT_BIT(attr,LBP_CM); + PRINT_BIT(attr,LBP_WJ); + PRINT_BIT(attr,LBP_ZW); + PRINT_BIT(attr,LBP_GL); + PRINT_BIT(attr,LBP_SP); + PRINT_BIT(attr,LBP_B2); + PRINT_BIT(attr,LBP_BA); + PRINT_BIT(attr,LBP_BB); + PRINT_BIT(attr,LBP_HY); + PRINT_BIT(attr,LBP_CB); + PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); + PRINT_BIT(attr,LBP_EX); + PRINT_BIT(attr,LBP_IN); + PRINT_BIT(attr,LBP_NS); + PRINT_BIT(attr,LBP_OP); + PRINT_BIT(attr,LBP_QU); + PRINT_BIT(attr,LBP_IS); + PRINT_BIT(attr,LBP_NU); + PRINT_BIT(attr,LBP_PO); + PRINT_BIT(attr,LBP_PR); + PRINT_BIT(attr,LBP_SY); + PRINT_BIT(attr,LBP_AI); + PRINT_BIT(attr,LBP_AL); + PRINT_BIT(attr,LBP_H2); + PRINT_BIT(attr,LBP_H3); + PRINT_BIT(attr,LBP_ID); + PRINT_BIT(attr,LBP_JL); + PRINT_BIT(attr,LBP_JV); + PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_SA); + PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT - fprintf (stream, "\n"); - } + fprintf (stream, "\n"); + } } } @@ -5882,24 +6950,24 @@ fill_org_lbp (const char *linebreak_filename) lineno++; c = getc (stream); if (c == EOF) - break; + break; if (c == '#') - { - do c = getc (stream); while (c != EOF && c != '\n'); - continue; - } + { + do c = getc (stream); while (c != EOF && c != '\n'); + continue; + } ungetc (c, stream); n = getfield (stream, field0, ';'); n += getfield (stream, field1, ' '); n += getfield (stream, field2, '\n'); if (n == 0) - break; + break; if (n != 3) - { - fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, - lineno); - exit (1); - } + { + fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, + lineno); + exit (1); + } #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; if (false) {} TRY(LBP_BK) @@ -5914,6 +6982,7 @@ fill_org_lbp (const char *linebreak_filename) TRY(LBP_HY) TRY(LBP_CB) TRY(LBP_CL) + TRY(LBP_CP) TRY(LBP_EX) TRY(LBP_IN) TRY(LBP_NS) @@ -5940,25 +7009,26 @@ fill_org_lbp (const char *linebreak_filename) else if (strcmp (field1, "NL") == 0) value = LBP_BK; else if (strcmp (field1, "SG") == 0) value = LBP_XX; else - { - fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", - field1, linebreak_filename, lineno); - exit (1); - } + { + fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", + field1, linebreak_filename, lineno); + exit (1); + } i = strtoul (field0, NULL, 16); if (strstr (field0, "..") != NULL) - { - /* Deal with a range. */ - j = strtoul (strstr (field0, "..") + 2, NULL, 16); - for (; i <= j; i++) - unicode_org_lbp[i] = value; - } + { + /* Deal with a range. */ + j = strtoul (strstr (field0, "..") + 2, NULL, 16); + for (; i <= j; i++) + unicode_org_lbp[i] = value; + } else - { - /* Single character line. */ - unicode_org_lbp[i] = value; - } + { + /* Single character line. */ + unicode_org_lbp[i] = value; + } } + if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", linebreak_filename); @@ -5976,45 +7046,46 @@ debug_output_org_lbp (FILE *stream) { int attr = unicode_org_lbp[i]; if (attr != LBP_XX) - { - fprintf (stream, "0x%04X", i); + { + fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ if (attr == bit) fprintf (stream, " " #bit); - PRINT_BIT(attr,LBP_BK); - PRINT_BIT(attr,LBP_CM); - PRINT_BIT(attr,LBP_WJ); - PRINT_BIT(attr,LBP_ZW); - PRINT_BIT(attr,LBP_GL); - PRINT_BIT(attr,LBP_SP); - PRINT_BIT(attr,LBP_B2); - PRINT_BIT(attr,LBP_BA); - PRINT_BIT(attr,LBP_BB); - PRINT_BIT(attr,LBP_HY); - PRINT_BIT(attr,LBP_CB); - PRINT_BIT(attr,LBP_CL); - PRINT_BIT(attr,LBP_EX); - PRINT_BIT(attr,LBP_IN); - PRINT_BIT(attr,LBP_NS); - PRINT_BIT(attr,LBP_OP); - PRINT_BIT(attr,LBP_QU); - PRINT_BIT(attr,LBP_IS); - PRINT_BIT(attr,LBP_NU); - PRINT_BIT(attr,LBP_PO); - PRINT_BIT(attr,LBP_PR); - PRINT_BIT(attr,LBP_SY); - PRINT_BIT(attr,LBP_AI); - PRINT_BIT(attr,LBP_AL); - PRINT_BIT(attr,LBP_H2); - PRINT_BIT(attr,LBP_H3); - PRINT_BIT(attr,LBP_ID); - PRINT_BIT(attr,LBP_JL); - PRINT_BIT(attr,LBP_JV); - PRINT_BIT(attr,LBP_JT); - PRINT_BIT(attr,LBP_SA); - PRINT_BIT(attr,LBP_XX); + PRINT_BIT(attr,LBP_BK); + PRINT_BIT(attr,LBP_CM); + PRINT_BIT(attr,LBP_WJ); + PRINT_BIT(attr,LBP_ZW); + PRINT_BIT(attr,LBP_GL); + PRINT_BIT(attr,LBP_SP); + PRINT_BIT(attr,LBP_B2); + PRINT_BIT(attr,LBP_BA); + PRINT_BIT(attr,LBP_BB); + PRINT_BIT(attr,LBP_HY); + PRINT_BIT(attr,LBP_CB); + PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); + PRINT_BIT(attr,LBP_EX); + PRINT_BIT(attr,LBP_IN); + PRINT_BIT(attr,LBP_NS); + PRINT_BIT(attr,LBP_OP); + PRINT_BIT(attr,LBP_QU); + PRINT_BIT(attr,LBP_IS); + PRINT_BIT(attr,LBP_NU); + PRINT_BIT(attr,LBP_PO); + PRINT_BIT(attr,LBP_PR); + PRINT_BIT(attr,LBP_SY); + PRINT_BIT(attr,LBP_AI); + PRINT_BIT(attr,LBP_AL); + PRINT_BIT(attr,LBP_H2); + PRINT_BIT(attr,LBP_H3); + PRINT_BIT(attr,LBP_ID); + PRINT_BIT(attr,LBP_JL); + PRINT_BIT(attr,LBP_JV); + PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_SA); + PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT - fprintf (stream, "\n"); - } + fprintf (stream, "\n"); + } } } @@ -6060,19 +7131,19 @@ output_lbp (FILE *stream1, FILE *stream2) for (i = 0; i < 0x110000; i++) { - int attr = get_lbp (i); + int64_t attr = get_lbp (i); /* Now attr should contain exactly one bit. */ if (attr == 0 || ((attr & (attr - 1)) != 0)) - abort (); + abort (); - if (attr != 1 << LBP_XX) - { - unsigned int log2_attr; - for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + if (attr != (int64_t) 1 << LBP_XX) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); - lbp_table_add (&t, i, log2_attr); - } + lbp_table_add (&t, i, log2_attr); + } } lbp_table_finalize (&t); @@ -6089,7 +7160,7 @@ output_lbp (FILE *stream1, FILE *stream2) for (i = 0; i < 5; i++) fprintf (stream1, "#define lbrkprop_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream1, "\n"); fprintf (stream1, "typedef struct\n"); fprintf (stream1, " {\n"); @@ -6109,15 +7180,15 @@ output_lbp (FILE *stream1, FILE *stream2) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream2, " %5d", -1); + fprintf (stream2, " %5d", -1); else - fprintf (stream2, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream2, ","); + fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); @@ -6129,15 +7200,15 @@ output_lbp (FILE *stream1, FILE *stream2) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream2, " %5d", -1); + fprintf (stream2, " %5d", -1); else - fprintf (stream2, " %5zu", - (offset - level3_offset) / sizeof (unsigned char)); + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); if (i+1 < t.level2_size << t.q) - fprintf (stream2, ","); + fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); @@ -6150,48 +7221,49 @@ output_lbp (FILE *stream1, FILE *stream2) unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; const char *value_string; switch (value) - { + { #define CASE(x) case x: value_string = #x; break; - CASE(LBP_BK); - CASE(LBP_CM); - CASE(LBP_WJ); - CASE(LBP_ZW); - CASE(LBP_GL); - CASE(LBP_SP); - CASE(LBP_B2); - CASE(LBP_BA); - CASE(LBP_BB); - CASE(LBP_HY); - CASE(LBP_CB); - CASE(LBP_CL); - CASE(LBP_EX); - CASE(LBP_IN); - CASE(LBP_NS); - CASE(LBP_OP); - CASE(LBP_QU); - CASE(LBP_IS); - CASE(LBP_NU); - CASE(LBP_PO); - CASE(LBP_PR); - CASE(LBP_SY); - CASE(LBP_AI); - CASE(LBP_AL); - CASE(LBP_H2); - CASE(LBP_H3); - CASE(LBP_ID); - CASE(LBP_JL); - CASE(LBP_JV); - CASE(LBP_JT); - CASE(LBP_SA); - CASE(LBP_XX); + CASE(LBP_BK); + CASE(LBP_CM); + CASE(LBP_WJ); + CASE(LBP_ZW); + CASE(LBP_GL); + CASE(LBP_SP); + CASE(LBP_B2); + CASE(LBP_BA); + CASE(LBP_BB); + CASE(LBP_HY); + CASE(LBP_CB); + CASE(LBP_CL); + CASE(LBP_CP); + CASE(LBP_EX); + CASE(LBP_IN); + CASE(LBP_NS); + CASE(LBP_OP); + CASE(LBP_QU); + CASE(LBP_IS); + CASE(LBP_NU); + CASE(LBP_PO); + CASE(LBP_PR); + CASE(LBP_SY); + CASE(LBP_AI); + CASE(LBP_AL); + CASE(LBP_H2); + CASE(LBP_H3); + CASE(LBP_ID); + CASE(LBP_JL); + CASE(LBP_JV); + CASE(LBP_JT); + CASE(LBP_SA); + CASE(LBP_XX); #undef CASE - default: - abort (); - } + default: + abort (); + } if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); fprintf (stream2, " %s%s", value_string, - (i+1 < t.level3_size << t.p ? "," : "")); + (i+1 < t.level3_size << t.p ? "," : "")); } if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); @@ -6213,10 +7285,10 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve { streams[i] = fopen (filenames[i], "w"); if (streams[i] == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); - exit (1); - } + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } } for (i = 0; i < 2; i++) @@ -6226,12 +7298,12 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n", - version); + version); fprintf (stream, "\n"); /* Put a GPL header on it. The gnulib module is under LGPL (although it - still carries the GPL header), and it's gnulib-tool which replaces the - GPL header with an LGPL header. */ + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); @@ -6254,16 +7326,17 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve for (i = 0; i < 2; i++) { if (ferror (streams[i]) || fclose (streams[i])) - { - fprintf (stderr, "error writing to '%s'\n", filenames[i]); - exit (1); - } + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } } } /* ========================================================================= */ -/* Word break property. */ +/* Word break property. + Updated for Unicode TR #29 revision 17. */ /* Possible values of the Word_Break property. */ enum @@ -6292,64 +7365,62 @@ get_wbp (unsigned int ch) if (unicode_attributes[ch].name != NULL) { if (ch == 0x000D) - attr |= 1 << WBP_CR; + attr |= 1 << WBP_CR; if (ch == 0x000A) - attr |= 1 << WBP_LF; + attr |= 1 << WBP_LF; if (ch == 0x000B || ch == 0x000C - || ch == 0x0085 - || ch == 0x2028 || ch == 0x2029) - attr |= 1 << WBP_NEWLINE; + || ch == 0x0085 + || ch == 0x2028 || ch == 0x2029) + attr |= 1 << WBP_NEWLINE; if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0 - || (unicode_attributes[ch].category != NULL - && strcmp (unicode_attributes[ch].category, "Mc") == 0)) - attr |= 1 << WBP_EXTEND; + || (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Mc") == 0)) + attr |= 1 << WBP_EXTEND; if (unicode_attributes[ch].category != NULL - && strcmp (unicode_attributes[ch].category, "Cf") == 0 - && ch != 0x200C && ch != 0x200D) - attr |= 1 << WBP_FORMAT; + && strcmp (unicode_attributes[ch].category, "Cf") == 0 + && ch != 0x200B && ch != 0x200C && ch != 0x200D) + attr |= 1 << WBP_FORMAT; if ((unicode_scripts[ch] < numscripts - && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) - || (ch >= 0x3031 && ch <= 0x3035) - || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC - || ch == 0xFF70) - attr |= 1 << WBP_KATAKANA; + && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) + || (ch >= 0x3031 && ch <= 0x3035) + || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC + || ch == 0xFF70) + attr |= 1 << WBP_KATAKANA; if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 - || ch == 0x05F3) - && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 - && (attr & (1 << WBP_KATAKANA)) == 0 - && ((get_lbp (ch) >> LBP_SA) & 1) == 0 - && !(unicode_scripts[ch] < numscripts - && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) - && (attr & (1 << WBP_EXTEND)) == 0) - attr |= 1 << WBP_ALETTER; - - if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 - || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E) - attr |= 1 << WBP_MIDNUMLET; - - if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A - || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) - attr |= 1 << WBP_MIDLETTER; + || ch == 0x05F3) + && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 + && (attr & (1 << WBP_KATAKANA)) == 0 + && ((get_lbp (ch) >> LBP_SA) & 1) == 0 + && !(unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) + && (attr & (1 << WBP_EXTEND)) == 0) + attr |= 1 << WBP_ALETTER; + + if (is_WBP_MIDNUMLET (ch)) + attr |= 1 << WBP_MIDNUMLET; + + if (is_WBP_MIDLETTER (ch)) + attr |= 1 << WBP_MIDLETTER; if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 - || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C - || ch == 0xFF1B) - && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) - attr |= 1 << WBP_MIDNUM; + || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C + || ch == 0xFF1B) + && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) + attr |= 1 << WBP_MIDNUM; if (((get_lbp (ch) >> LBP_NU) & 1) != 0 - && ch != 0x066C) - attr |= 1 << WBP_NUMERIC; + && ch != 0x066C) + attr |= 1 << WBP_NUMERIC; if (unicode_attributes[ch].category != NULL - && strcmp (unicode_attributes[ch].category, "Pc") == 0) - attr |= 1 << WBP_EXTENDNUMLET; + && strcmp (unicode_attributes[ch].category, "Pc") == 0) + attr |= 1 << WBP_EXTENDNUMLET; } if (attr == 0) @@ -6369,34 +7440,34 @@ debug_output_wbp (FILE *stream) { int attr = get_wbp (i); if (attr != 1 << WBP_OTHER) - { - fprintf (stream, "0x%04X", i); - if (attr & (1 << WBP_CR)) - fprintf (stream, " CR"); - if (attr & (1 << WBP_LF)) - fprintf (stream, " LF"); - if (attr & (1 << WBP_NEWLINE)) - fprintf (stream, " Newline"); - if (attr & (1 << WBP_EXTEND)) - fprintf (stream, " Extend"); - if (attr & (1 << WBP_FORMAT)) - fprintf (stream, " Format"); - if (attr & (1 << WBP_KATAKANA)) - fprintf (stream, " Katakana"); - if (attr & (1 << WBP_ALETTER)) - fprintf (stream, " ALetter"); - if (attr & (1 << WBP_MIDNUMLET)) - fprintf (stream, " MidNumLet"); - if (attr & (1 << WBP_MIDLETTER)) - fprintf (stream, " MidLetter"); - if (attr & (1 << WBP_MIDNUM)) - fprintf (stream, " MidNum"); - if (attr & (1 << WBP_NUMERIC)) - fprintf (stream, " Numeric"); - if (attr & (1 << WBP_EXTENDNUMLET)) - fprintf (stream, " ExtendNumLet"); - fprintf (stream, "\n"); - } + { + fprintf (stream, "0x%04X", i); + if (attr & (1 << WBP_CR)) + fprintf (stream, " CR"); + if (attr & (1 << WBP_LF)) + fprintf (stream, " LF"); + if (attr & (1 << WBP_NEWLINE)) + fprintf (stream, " Newline"); + if (attr & (1 << WBP_EXTEND)) + fprintf (stream, " Extend"); + if (attr & (1 << WBP_FORMAT)) + fprintf (stream, " Format"); + if (attr & (1 << WBP_KATAKANA)) + fprintf (stream, " Katakana"); + if (attr & (1 << WBP_ALETTER)) + fprintf (stream, " ALetter"); + if (attr & (1 << WBP_MIDNUMLET)) + fprintf (stream, " MidNumLet"); + if (attr & (1 << WBP_MIDLETTER)) + fprintf (stream, " MidLetter"); + if (attr & (1 << WBP_MIDNUM)) + fprintf (stream, " MidNum"); + if (attr & (1 << WBP_NUMERIC)) + fprintf (stream, " Numeric"); + if (attr & (1 << WBP_EXTENDNUMLET)) + fprintf (stream, " ExtendNumLet"); + fprintf (stream, "\n"); + } } } @@ -6451,21 +7522,21 @@ fill_org_wbp (const char *wordbreakproperty_filename) int propvalue; if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; + break; if (buf[0] == '\0' || buf[0] == '#') - continue; + continue; if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) - { - if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) - { - fprintf (stderr, "parse error in '%s'\n", - wordbreakproperty_filename); - exit (1); - } - i2 = i1; - } + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + wordbreakproperty_filename); + exit (1); + } + i2 = i1; + } #define PROP(name,value) \ if (strcmp (propname, name) == 0) propvalue = value; else PROP ("CR", WBP_CR) @@ -6481,16 +7552,16 @@ fill_org_wbp (const char *wordbreakproperty_filename) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) #undef PROP - { - fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, - wordbreakproperty_filename); - exit (1); - } + { + fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, + wordbreakproperty_filename); + exit (1); + } if (!(i1 <= i2 && i2 < 0x110000)) - abort (); + abort (); for (i = i1; i <= i2; i++) - unicode_org_wbp[i] = propvalue; + unicode_org_wbp[i] = propvalue; } if (ferror (stream) || fclose (stream)) @@ -6510,26 +7581,26 @@ debug_output_org_wbp (FILE *stream) { int propvalue = unicode_org_wbp[i]; if (propvalue != WBP_OTHER) - { - fprintf (stream, "0x%04X", i); + { + fprintf (stream, "0x%04X", i); #define PROP(name,value) \ - if (propvalue == value) fprintf (stream, " " name); else - PROP ("CR", WBP_CR) - PROP ("LF", WBP_LF) - PROP ("Newline", WBP_NEWLINE) - PROP ("Extend", WBP_EXTEND) - PROP ("Format", WBP_FORMAT) - PROP ("Katakana", WBP_KATAKANA) - PROP ("ALetter", WBP_ALETTER) - PROP ("MidNumLet", WBP_MIDNUMLET) - PROP ("MidLetter", WBP_MIDLETTER) - PROP ("MidNum", WBP_MIDNUM) - PROP ("Numeric", WBP_NUMERIC) - PROP ("ExtendNumLet", WBP_EXTENDNUMLET) + if (propvalue == value) fprintf (stream, " " name); else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) #undef PROP - fprintf (stream, " ??"); - fprintf (stream, "\n"); - } + fprintf (stream, " ??"); + fprintf (stream, "\n"); + } } } @@ -6569,29 +7640,317 @@ output_wbp (FILE *stream) struct wbp_table t; unsigned int level1_offset, level2_offset, level3_offset; - t.p = 7; - t.q = 9; - wbp_table_init (&t); - - for (i = 0; i < 0x110000; i++) + t.p = 7; + t.q = 9; + wbp_table_init (&t); + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + + /* Now attr should contain exactly one bit. */ + if (attr == 0 || ((attr & (attr - 1)) != 0)) + abort (); + + if (attr != 1 << WBP_OTHER) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + + wbp_table_add (&t, i, log2_attr); + } + } + + wbp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define wbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "\n"); + fprintf (stream, "typedef struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "wbrkprop_t;\n"); + fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; + const char *value_string; + switch (value) + { +#define CASE(x) case x: value_string = #x; break; + CASE(WBP_OTHER); + CASE(WBP_CR); + CASE(WBP_LF); + CASE(WBP_NEWLINE); + CASE(WBP_EXTEND); + CASE(WBP_FORMAT); + CASE(WBP_KATAKANA); + CASE(WBP_ALETTER); + CASE(WBP_MIDNUMLET); + CASE(WBP_MIDLETTER); + CASE(WBP_MIDNUM); + CASE(WBP_NUMERIC); + CASE(WBP_EXTENDNUMLET); +#undef CASE + default: + abort (); + } + if (i > 0 && (i % 4) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %s%s", value_string, + (i+1 < t.level3_size << t.p ? "," : "")); + } + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); +} + +static void +output_wbrk_tables (const char *filename, const char *version) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + output_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Grapheme break property. + Updated for Unicode TR #29 revision 17. */ + +/* Possible values of the Grapheme_Cluster_Break property. */ +enum +{ + GBP_OTHER = 0, + GBP_CR = 1, + GBP_LF = 2, + GBP_CONTROL = 3, + GBP_EXTEND = 4, + GBP_PREPEND = 5, + GBP_SPACINGMARK = 6, + GBP_L = 7, + GBP_V = 8, + GBP_T = 9, + GBP_LV = 10, + GBP_LVT = 11 +}; + +/* Construction of sparse 3-level tables. */ +#define TABLE gbp_table +#define ELEMENT unsigned char +#define DEFAULT GBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* The grapheme break property from the GraphemeBreakProperty.txt file. */ +int unicode_org_gbp[0x110000]; + +/* Output the unit test data for the grapheme break property. */ +static void +output_gbp_test (const char *filename) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode grapheme break property functions.\n"); + fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see . */\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int gbp = unicode_org_gbp[ch]; + const char *gbp_string; + + while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp) + ch++; + + switch (gbp) + { +#define CASE(x) case x: gbp_string = #x; break; + CASE (GBP_OTHER) + CASE (GBP_CR) + CASE (GBP_LF) + CASE (GBP_CONTROL) + CASE (GBP_EXTEND) + CASE (GBP_PREPEND) + CASE (GBP_SPACINGMARK) + CASE (GBP_L) + CASE (GBP_V) + CASE (GBP_T) + CASE (GBP_LV) + CASE (GBP_LVT) +#undef CASE + default: + abort (); + } + + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string); + + need_comma = true; + } + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the per-character grapheme break property table. */ +static void +output_gbp_table (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct gbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) { - int attr = get_wbp (i); + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } - /* Now attr should contain exactly one bit. */ - if (attr == 0 || ((attr & (attr - 1)) != 0)) - abort (); + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Grapheme break property of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); - if (attr != 1 << WBP_OTHER) - { - unsigned int log2_attr; - for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + t.p = 7; + t.q = 9; + gbp_table_init (&t); - wbp_table_add (&t, i, log2_attr); - } - } + for (ch = 0; ch < 0x110000; ch++) + gbp_table_add (&t, ch, unicode_org_gbp[ch]); - wbp_table_finalize (&t); + gbp_table_finalize (&t); + /* Offsets in t.result, in memory of this process. */ level1_offset = 5 * sizeof (uint32_t); level2_offset = @@ -6603,17 +7962,17 @@ output_wbp (FILE *stream) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) - fprintf (stream, "#define wbrkprop_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); - fprintf (stream, "\n"); - fprintf (stream, "typedef struct\n"); + fprintf (stream, "#define gbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); - fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); - fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n", + t.level3_size, t.p); fprintf (stream, " }\n"); - fprintf (stream, "wbrkprop_t;\n"); - fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); + fprintf (stream, "unigbrkprop =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) @@ -6622,15 +7981,15 @@ output_wbp (FILE *stream) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -6642,105 +8001,127 @@ output_wbp (FILE *stream) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (unsigned char)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t) / 2); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); - if (t.level3_size << t.p > 4) + if (t.level3_size << t.p > 8) fprintf (stream, "\n "); - for (i = 0; i < t.level3_size << t.p; i++) + for (i = 0; i < (t.level3_size << t.p) / 2; i++) { - unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; - const char *value_string; - switch (value) - { -#define CASE(x) case x: value_string = #x; break; - CASE(WBP_OTHER); - CASE(WBP_CR); - CASE(WBP_LF); - CASE(WBP_NEWLINE); - CASE(WBP_EXTEND); - CASE(WBP_FORMAT); - CASE(WBP_KATAKANA); - CASE(WBP_ALETTER); - CASE(WBP_MIDNUMLET); - CASE(WBP_MIDLETTER); - CASE(WBP_MIDNUM); - CASE(WBP_NUMERIC); - CASE(WBP_EXTENDNUMLET); -#undef CASE - default: - abort (); - } - if (i > 0 && (i % 4) == 0) - fprintf (stream, "\n "); - fprintf (stream, " %s%s", value_string, - (i+1 < t.level3_size << t.p ? "," : "")); + unsigned char *p = (unsigned char *) (t.result + level3_offset); + unsigned char value0 = p[i * 2]; + unsigned char value1 = p[i * 2 + 1]; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x%s", (value1 << 4) + value0, + (i+1 < (t.level3_size << t.p) / 2 ? "," : "")); } - if (t.level3_size << t.p > 4) + if (t.level3_size << t.p > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } } +/* Stores in unicode_org_gbp[] the grapheme breaking property from the + GraphemeBreakProperty.txt file. */ static void -output_wbrk_tables (const char *filename, const char *version) +fill_org_gbp (const char *graphemebreakproperty_filename) { + unsigned int i; FILE *stream; + int lineno = 0; - stream = fopen (filename, "w"); + for (i = 0; i < 0x110000; i++) + unicode_org_gbp[i] = GBP_OTHER; + + stream = fopen (graphemebreakproperty_filename, "r"); if (stream == NULL) { - fprintf (stderr, "cannot open '%s' for writing\n", filename); + fprintf (stderr, "error during fopen of '%s'\n", + graphemebreakproperty_filename); exit (1); } - fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", - version); - fprintf (stream, "\n"); + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; - /* Put a GPL header on it. The gnulib module is under LGPL (although it - still carries the GPL header), and it's gnulib-tool which replaces the - GPL header with an LGPL header. */ - fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); - fprintf (stream, "\n"); - fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); - fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); - fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); - fprintf (stream, " (at your option) any later version.\n"); - fprintf (stream, "\n"); - fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); - fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); - fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); - fprintf (stream, " GNU General Public License for more details.\n"); - fprintf (stream, "\n"); - fprintf (stream, " You should have received a copy of the GNU General Public License\n"); - fprintf (stream, " along with this program. If not, see . */\n"); - fprintf (stream, "\n"); + lineno++; + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; - output_wbp (stream); + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", GBP_CR) + PROP ("LF", GBP_LF) + PROP ("Control", GBP_CONTROL) + PROP ("Extend", GBP_EXTEND) + PROP ("Prepend", GBP_PREPEND) + PROP ("SpacingMark", GBP_SPACINGMARK) + PROP ("L", GBP_L) + PROP ("V", GBP_V) + PROP ("T", GBP_T) + PROP ("LV", GBP_LV) + PROP ("LVT", GBP_LVT) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, + graphemebreakproperty_filename, lineno); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_gbp[i] = propvalue; + } if (ferror (stream) || fclose (stream)) { - fprintf (stderr, "error writing to '%s'\n", filename); + fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); exit (1); } } /* ========================================================================= */ +/* Composition and decomposition. + Updated for Unicode TR #15 revision 33. */ + /* Maximum number of characters into which a single Unicode character can be decomposed. */ #define MAX_DECOMP_LENGTH 18 @@ -6770,7 +8151,7 @@ enum decompositions). Return the type, or -1 for none. */ static int get_decomposition (unsigned int ch, - unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) + unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) { const char *decomposition = unicode_attributes[ch].decomposition; @@ -6781,55 +8162,55 @@ get_decomposition (unsigned int ch, char *endptr; if (decomposition[0] == '<') - { - const char *rangle; - size_t typelen; - - rangle = strchr (decomposition + 1, '>'); - if (rangle == NULL) - abort (); - typelen = rangle + 1 - decomposition; + { + const char *rangle; + size_t typelen; + + rangle = strchr (decomposition + 1, '>'); + if (rangle == NULL) + abort (); + typelen = rangle + 1 - decomposition; #define TYPE(t1,t2) \ - if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ - type = t2; \ - else - TYPE ("", UC_DECOMP_FONT) - TYPE ("", UC_DECOMP_NOBREAK) - TYPE ("", UC_DECOMP_INITIAL) - TYPE ("", UC_DECOMP_MEDIAL) - TYPE ("", UC_DECOMP_FINAL) - TYPE ("", UC_DECOMP_ISOLATED) - TYPE ("", UC_DECOMP_CIRCLE) - TYPE ("", UC_DECOMP_SUPER) - TYPE ("", UC_DECOMP_SUB) - TYPE ("", UC_DECOMP_VERTICAL) - TYPE ("", UC_DECOMP_WIDE) - TYPE ("", UC_DECOMP_NARROW) - TYPE ("", UC_DECOMP_SMALL) - TYPE ("", UC_DECOMP_SQUARE) - TYPE ("", UC_DECOMP_FRACTION) - TYPE ("", UC_DECOMP_COMPAT) - { - fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); - exit (1); - } + if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ + type = t2; \ + else + TYPE ("", UC_DECOMP_FONT) + TYPE ("", UC_DECOMP_NOBREAK) + TYPE ("", UC_DECOMP_INITIAL) + TYPE ("", UC_DECOMP_MEDIAL) + TYPE ("", UC_DECOMP_FINAL) + TYPE ("", UC_DECOMP_ISOLATED) + TYPE ("", UC_DECOMP_CIRCLE) + TYPE ("", UC_DECOMP_SUPER) + TYPE ("", UC_DECOMP_SUB) + TYPE ("", UC_DECOMP_VERTICAL) + TYPE ("", UC_DECOMP_WIDE) + TYPE ("", UC_DECOMP_NARROW) + TYPE ("", UC_DECOMP_SMALL) + TYPE ("", UC_DECOMP_SQUARE) + TYPE ("", UC_DECOMP_FRACTION) + TYPE ("", UC_DECOMP_COMPAT) + { + fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); + exit (1); + } #undef TYPE - decomposition = rangle + 1; - if (decomposition[0] == ' ') - decomposition++; - } + decomposition = rangle + 1; + if (decomposition[0] == ' ') + decomposition++; + } for (length = 0; length < MAX_DECOMP_LENGTH; length++) - { - decomposed[length] = strtoul (decomposition, &endptr, 16); - if (endptr == decomposition) - break; - decomposition = endptr; - if (decomposition[0] == ' ') - decomposition++; - } + { + decomposed[length] = strtoul (decomposition, &endptr, 16); + if (endptr == decomposition) + break; + decomposition = endptr; + if (decomposition[0] == ' ') + decomposition++; + } if (*decomposition != '\0') - /* MAX_DECOMP_LENGTH is too small. */ - abort (); + /* MAX_DECOMP_LENGTH is too small. */ + abort (); *lengthp = length; return type; @@ -6871,32 +8252,32 @@ output_decomposition (FILE *stream1, FILE *stream2) int type = get_decomposition (ch, &length, decomposed); if (type >= 0) - { - if (!(offset < (1 << 15))) - abort (); - decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); - - /* Produce length 3-bytes entries. */ - if (length == 0) - /* We would need a special representation of zero-length entries. */ - abort (); - for (i = 0; i < length; i++) - { - if (offset > 0) - fprintf (stream2, ","); - if ((offset % 4) == 0) - fprintf (stream2, "\n "); - if (!(decomposed[i] < (1 << 18))) - abort (); - fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", - (((i+1 < length ? (1 << 23) : 0) - | (i == 0 ? (type << 18) : 0) - | decomposed[i]) >> 16) & 0xff, - (decomposed[i] >> 8) & 0xff, - decomposed[i] & 0xff); - offset++; - } - } + { + if (!(offset < (1 << 15))) + abort (); + decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); + + /* Produce length 3-bytes entries. */ + if (length == 0) + /* We would need a special representation of zero-length entries. */ + abort (); + for (i = 0; i < length; i++) + { + if (offset > 0) + fprintf (stream2, ","); + if ((offset % 4) == 0) + fprintf (stream2, "\n "); + if (!(decomposed[i] < (1 << 18))) + abort (); + fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", + (((i+1 < length ? (1 << 23) : 0) + | (i == 0 ? (type << 18) : 0) + | decomposed[i]) >> 16) & 0xff, + (decomposed[i] >> 8) & 0xff, + decomposed[i] & 0xff); + offset++; + } + } } fprintf (stream2, "\n};\n"); @@ -6916,7 +8297,7 @@ output_decomposition (FILE *stream1, FILE *stream2) for (i = 0; i < 5; i++) fprintf (stream1, "#define decomp_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream1, "\n"); fprintf (stream1, "typedef struct\n"); fprintf (stream1, " {\n"); @@ -6935,15 +8316,15 @@ output_decomposition (FILE *stream1, FILE *stream2) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream2, " %5d", -1); + fprintf (stream2, " %5d", -1); else - fprintf (stream2, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream2, ","); + fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); @@ -6955,15 +8336,15 @@ output_decomposition (FILE *stream1, FILE *stream2) { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream2, " %5d", -1); + fprintf (stream2, " %5d", -1); else - fprintf (stream2, " %5zu", - (offset - level3_offset) / sizeof (uint16_t)); + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (uint16_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream2, ","); + fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); @@ -6975,10 +8356,10 @@ output_decomposition (FILE *stream1, FILE *stream2) { uint16_t value = ((uint16_t *) (t.result + level3_offset))[i]; if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); + fprintf (stream2, "\n "); fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value); if (i+1 < t.level3_size << t.p) - fprintf (stream2, ","); + fprintf (stream2, ","); } if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); @@ -7000,10 +8381,10 @@ output_decomposition_tables (const char *filename1, const char *filename2, const { streams[i] = fopen (filenames[i], "w"); if (streams[i] == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); - exit (1); - } + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } } for (i = 0; i < 2; i++) @@ -7013,7 +8394,7 @@ output_decomposition_tables (const char *filename1, const char *filename2, const fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decomposition of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", - version); + version); fprintf (stream, "\n"); } @@ -7022,10 +8403,10 @@ output_decomposition_tables (const char *filename1, const char *filename2, const for (i = 0; i < 2; i++) { if (ferror (streams[i]) || fclose (streams[i])) - { - fprintf (stderr, "error writing to '%s'\n", filenames[i]); - exit (1); - } + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } } } @@ -7054,18 +8435,18 @@ fill_composition_exclusions (const char *compositionexclusions_filename) unsigned int i; if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; + break; if (buf[0] == '\0' || buf[0] == '#') - continue; + continue; if (sscanf (buf, "%X", &i) != 1) - { - fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); - exit (1); - } + { + fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); + exit (1); + } if (!(i < 0x110000)) - abort (); + abort (); unicode_composition_exclusions[i] = 1; } @@ -7097,32 +8478,32 @@ debug_output_composition_tables (const char *filename) int type = get_decomposition (ch, &length, decomposed); if (type == UC_DECOMP_CANONICAL - /* Consider only binary decompositions. - Exclude singleton decompositions. */ - && length == 2) - { - unsigned int code1 = decomposed[0]; - unsigned int code2 = decomposed[1]; - unsigned int combined = ch; - - /* Exclude decompositions where the first part is not a starter, - i.e. is not of canonical combining class 0. */ - if (strcmp (unicode_attributes[code1].combining, "0") == 0 - /* Exclude characters listed in CompositionExclusions.txt. */ - && !unicode_composition_exclusions[combined]) - { - /* The combined character must now also be a starter. - Verify this. */ - if (strcmp (unicode_attributes[combined].combining, "0") != 0) - abort (); - - fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", - code1, - code2, - combined, - unicode_attributes[code2].combining); - } - } + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", + code1, + code2, + combined, + unicode_attributes[code2].combining); + } + } } if (ferror (stream) || fclose (stream)) @@ -7148,7 +8529,7 @@ output_composition_tables (const char *filename, const char *version) fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", - version); + version); fprintf (stream, "\n"); /* Put a GPL header on it. The gnulib module is under LGPL (although it @@ -7189,7 +8570,7 @@ output_composition_tables (const char *filename, const char *version) 1527, which is quite good (60% filled). It requires an auxiliary table lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ - fprintf (stream, "struct composition_rule { char codes[4]; };\n"); + fprintf (stream, "struct composition_rule { char codes[6]; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define slot-name codes\n"); @@ -7208,38 +8589,31 @@ output_composition_tables (const char *filename, const char *version) int type = get_decomposition (ch, &length, decomposed); if (type == UC_DECOMP_CANONICAL - /* Consider only binary decompositions. - Exclude singleton decompositions. */ - && length == 2) - { - unsigned int code1 = decomposed[0]; - unsigned int code2 = decomposed[1]; - unsigned int combined = ch; - - /* Exclude decompositions where the first part is not a starter, - i.e. is not of canonical combining class 0. */ - if (strcmp (unicode_attributes[code1].combining, "0") == 0 - /* Exclude characters listed in CompositionExclusions.txt. */ - && !unicode_composition_exclusions[combined]) - { - /* The combined character must now also be a starter. - Verify this. */ - if (strcmp (unicode_attributes[combined].combining, "0") != 0) - abort (); - - if (!(code1 < 0x10000)) - abort (); - if (!(code2 < 0x10000)) - abort (); - if (!(combined < 0x10000)) - abort (); - - fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", - (code1 >> 8) & 0xff, code1 & 0xff, - (code2 >> 8) & 0xff, code2 & 0xff, - combined); - } - } + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, + combined); + } + } } if (ferror (stream) || fclose (stream)) @@ -7255,9 +8629,9 @@ output_composition_tables (const char *filename, const char *version) static void output_simple_mapping_test (const char *filename, - const char *function_name, - unsigned int (*func) (unsigned int), - const char *version) + const char *function_name, + unsigned int (*func) (unsigned int), + const char *version) { FILE *stream; bool need_comma; @@ -7288,7 +8662,7 @@ output_simple_mapping_test (const char *filename, fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", - version); + version); fprintf (stream, "\n"); fprintf (stream, "#include \"test-mapping-part1.h\"\n"); fprintf (stream, "\n"); @@ -7299,12 +8673,12 @@ output_simple_mapping_test (const char *filename, unsigned int value = func (ch); if (value != ch) - { - if (need_comma) - fprintf (stream, ",\n"); - fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); - need_comma = true; - } + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); + need_comma = true; + } } if (need_comma) fprintf (stream, "\n"); @@ -7332,8 +8706,8 @@ output_simple_mapping_test (const char *filename, static void output_simple_mapping (const char *filename, - unsigned int (*func) (unsigned int), - const char *version) + unsigned int (*func) (unsigned int), + const char *version) { FILE *stream; unsigned int ch, i; @@ -7350,7 +8724,7 @@ output_simple_mapping (const char *filename, fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", - version); + version); t.p = 7; t.q = 9; @@ -7378,7 +8752,7 @@ output_simple_mapping (const char *filename, for (i = 0; i < 5; i++) fprintf (stream, "#define mapping_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); + ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); @@ -7395,15 +8769,15 @@ output_simple_mapping (const char *filename, { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); @@ -7415,15 +8789,15 @@ output_simple_mapping (const char *filename, { uint32_t offset; if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1); else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (int32_t)); + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); @@ -7434,10 +8808,10 @@ output_simple_mapping (const char *filename, for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); + fprintf (stream, "\n "); fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) - fprintf (stream, ","); + fprintf (stream, ","); } if (t.level3_size << t.p > 8) fprintf (stream, "\n "); @@ -7453,55 +8827,881 @@ output_simple_mapping (const char *filename, /* ========================================================================= */ +/* A special casing context. + A context is negated through x -> -x. */ +enum +{ + SCC_ALWAYS = 0, + SCC_FINAL_SIGMA, + SCC_AFTER_SOFT_DOTTED, + SCC_MORE_ABOVE, + SCC_BEFORE_DOT, + SCC_AFTER_I +}; + +/* A special casing rule. */ +struct special_casing_rule +{ + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + unsigned int casefold_mapping[3]; + const char *language; + int context; +}; + +/* The special casing rules. */ +struct special_casing_rule **casing_rules; +unsigned int num_casing_rules; +unsigned int allocated_casing_rules; + +static void +add_casing_rule (struct special_casing_rule *new_rule) +{ + if (num_casing_rules == allocated_casing_rules) + { + allocated_casing_rules = 2 * allocated_casing_rules; + if (allocated_casing_rules < 16) + allocated_casing_rules = 16; + casing_rules = + (struct special_casing_rule **) + realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *)); + } + casing_rules[num_casing_rules++] = new_rule; +} + +/* Stores in casing_rules the special casing rules found in + specialcasing_filename. */ +static void +fill_casing_rules (const char *specialcasing_filename) +{ + FILE *stream; + + stream = fopen (specialcasing_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename); + exit (1); + } + + casing_rules = NULL; + num_casing_rules = 0; + allocated_casing_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + char *language; + int context; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan lower mapping. */ + for (i = 0; i < 3; i++) + lower_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + lower_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan title mapping. */ + for (i = 0; i < 3; i++) + title_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + title_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan upper mapping. */ + for (i = 0; i < 3; i++) + upper_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + upper_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan language and context. */ + language = NULL; + context = SCC_ALWAYS; + while (*scanptr == ' ') + scanptr++; + if (*scanptr != '\0' && *scanptr != '#') + { + const char *word_begin = scanptr; + const char *word_end; + + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + + while (*scanptr == ' ') + scanptr++; + + if (word_end - word_begin == 2) + { + language = (char *) malloc ((word_end - word_begin) + 1); + memcpy (language, word_begin, 2); + language[word_end - word_begin] = '\0'; + word_begin = word_end = NULL; + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + word_begin = scanptr; + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + } + } + + if (word_end > word_begin) + { + bool negate = false; + + if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0) + { + word_begin += 4; + negate = true; + } + if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0) + context = SCC_FINAL_SIGMA; + else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0) + context = SCC_AFTER_SOFT_DOTTED; + else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0) + context = SCC_MORE_ABOVE; + else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0) + context = SCC_BEFORE_DOT; + else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0) + context = SCC_AFTER_I; + else + { + fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename); + exit (1); + } + if (negate) + context = - context; + } + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + } + + /* Store the rule. */ + { + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + new_rule->code = code; + new_rule->language = language; + new_rule->context = context; + memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping)); + + add_casing_rule (new_rule); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", specialcasing_filename); + exit (1); + } +} + +/* A casefolding rule. */ +struct casefold_rule +{ + unsigned int code; + unsigned int mapping[3]; + const char *language; +}; + +/* The casefolding rules. */ +struct casefold_rule **casefolding_rules; +unsigned int num_casefolding_rules; +unsigned int allocated_casefolding_rules; + +/* Stores in casefolding_rules the case folding rules found in + casefolding_filename. */ +static void +fill_casefolding_rules (const char *casefolding_filename) +{ + FILE *stream; + + stream = fopen (casefolding_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename); + exit (1); + } + + casefolding_rules = NULL; + num_casefolding_rules = 0; + allocated_casefolding_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + char type; + unsigned int mapping[3]; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan type. */ + while (*scanptr == ' ') + scanptr++; + + switch (*scanptr) + { + case 'C': case 'F': case 'S': case 'T': + type = *scanptr; + break; + default: + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan casefold mapping. */ + for (i = 0; i < 3; i++) + mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */ + if (type != 'S') + { + const char * const *languages; + unsigned int languages_count; + + /* Type 'T' indicates that the rule is applicable to Turkish + languages only. */ + if (type == 'T') + { + static const char * const turkish_languages[] = { "tr", "az" }; + languages = turkish_languages; + languages_count = 2; + } + else + { + static const char * const all_languages[] = { NULL }; + languages = all_languages; + languages_count = 1; + } + + for (i = 0; i < languages_count; i++) + { + /* Store a new rule. */ + struct casefold_rule *new_rule = + (struct casefold_rule *) malloc (sizeof (struct casefold_rule)); + new_rule->code = code; + memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping)); + new_rule->language = languages[i]; + + if (num_casefolding_rules == allocated_casefolding_rules) + { + allocated_casefolding_rules = 2 * allocated_casefolding_rules; + if (allocated_casefolding_rules < 16) + allocated_casefolding_rules = 16; + casefolding_rules = + (struct casefold_rule **) + realloc (casefolding_rules, + allocated_casefolding_rules * sizeof (struct casefold_rule *)); + } + casefolding_rules[num_casefolding_rules++] = new_rule; + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", casefolding_filename); + exit (1); + } +} + +/* Casefold mapping, when it maps to a single character. */ +unsigned int unicode_casefold[0x110000]; + +static unsigned int +to_casefold (unsigned int ch) +{ + return unicode_casefold[ch]; +} + +/* Redistribute the casefolding_rules: + - Rules that map to a single character, language independently, are stored + in unicode_casefold. + - Other rules are merged into casing_rules. */ +static void +redistribute_casefolding_rules (void) +{ + unsigned int ch, i, j; + + /* Fill unicode_casefold[]. */ + for (ch = 0; ch < 0x110000; ch++) + unicode_casefold[ch] = ch; + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (cfrule->language == NULL && cfrule->mapping[1] == 0) + { + ch = cfrule->code; + if (!(ch < 0x110000)) + abort (); + unicode_casefold[ch] = cfrule->mapping[0]; + } + } + + /* Extend the special casing rules by filling in their casefold_mapping[] + field. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + unsigned int k; + + rule->casefold_mapping[0] = to_casefold (rule->code); + for (k = 1; k < 3; k++) + rule->casefold_mapping[k] = 0; + } + + /* Now merge the other casefolding rules into casing_rules. */ + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (!(cfrule->language == NULL && cfrule->mapping[1] == 0)) + { + /* Find a rule that applies to the same code, same language, and it + has context SCC_ALWAYS. At the same time, update all rules that + have the same code and same or more specific language. */ + struct special_casing_rule *found_rule = NULL; + + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && (cfrule->language == NULL + || (rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0))) + { + memcpy (rule->casefold_mapping, cfrule->mapping, + sizeof (rule->casefold_mapping)); + + if ((cfrule->language == NULL + ? rule->language == NULL + : rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0) + && rule->context == SCC_ALWAYS) + { + /* Found it. */ + found_rule = rule; + } + } + } + + if (found_rule == NULL) + { + /* Create a new rule. */ + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + + /* Try to find a rule that applies to the same code, no language + restriction, and with context SCC_ALWAYS. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && rule->context == SCC_ALWAYS + && rule->language == NULL) + { + /* Found it. */ + found_rule = rule; + break; + } + } + + new_rule->code = cfrule->code; + new_rule->language = cfrule->language; + new_rule->context = SCC_ALWAYS; + if (found_rule != NULL) + { + memcpy (new_rule->lower_mapping, found_rule->lower_mapping, + sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, found_rule->title_mapping, + sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, found_rule->upper_mapping, + sizeof (new_rule->upper_mapping)); + } + else + { + unsigned int k; + + new_rule->lower_mapping[0] = to_lower (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->lower_mapping[k] = 0; + new_rule->title_mapping[0] = to_title (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->title_mapping[k] = 0; + new_rule->upper_mapping[0] = to_upper (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->upper_mapping[k] = 0; + } + memcpy (new_rule->casefold_mapping, cfrule->mapping, + sizeof (new_rule->casefold_mapping)); + + add_casing_rule (new_rule); + } + } + } +} + +static int +compare_casing_rules (const void *a, const void *b) +{ + struct special_casing_rule *a_rule = *(struct special_casing_rule **) a; + struct special_casing_rule *b_rule = *(struct special_casing_rule **) b; + unsigned int a_code = a_rule->code; + unsigned int b_code = b_rule->code; + + if (a_code < b_code) + return -1; + if (a_code > b_code) + return 1; + + /* Sort the more specific rules before the more general ones. */ + return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0)) + + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0))); +} + +static void +sort_casing_rules (void) +{ + /* Sort the rules 1. by code, 2. by specificity. */ + if (num_casing_rules > 1) + qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *), + compare_casing_rules); +} + +/* Output the special casing rules. */ +static void +output_casing_rules (const char *filename, const char *version) +{ + FILE *stream; + unsigned int i, j; + unsigned int minor; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Special casing rules of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "struct special_casing_rule { char code[3]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name code\n"); + fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + minor = 0; + for (i = 0; i < num_casing_rules; i++) + { + struct special_casing_rule *rule = casing_rules[i]; + int context; + + if (i > 0 && rule->code == casing_rules[i - 1]->code) + minor += 1; + else + minor = 0; + + if (!(rule->code < 0x10000)) + { + fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code); + exit (1); + } + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ", + (rule->code >> 8) & 0xff, rule->code & 0xff, minor); + + fprintf (stream, "%d, ", + i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0); + + context = rule->context; + if (context < 0) + { + fprintf (stream, "-"); + context = - context; + } + else + fprintf (stream, " "); + switch (context) + { + case SCC_ALWAYS: + fprintf (stream, "SCC_ALWAYS "); + break; + case SCC_FINAL_SIGMA: + fprintf (stream, "SCC_FINAL_SIGMA "); + break; + case SCC_AFTER_SOFT_DOTTED: + fprintf (stream, "SCC_AFTER_SOFT_DOTTED"); + break; + case SCC_MORE_ABOVE: + fprintf (stream, "SCC_MORE_ABOVE "); + break; + case SCC_BEFORE_DOT: + fprintf (stream, "SCC_BEFORE_DOT "); + break; + case SCC_AFTER_I: + fprintf (stream, "SCC_AFTER_I "); + break; + default: + abort (); + } + fprintf (stream, ", "); + + if (rule->language != NULL) + { + if (strlen (rule->language) != 2) + abort (); + fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]); + } + else + fprintf (stream, "{ '\\0', '\\0' }, "); + + fprintf (stream, "{ "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->upper_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->upper_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->upper_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->lower_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->lower_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->lower_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->title_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->title_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->title_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->casefold_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->casefold_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->casefold_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }\n"); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Quoting the Unicode standard: + Definition: A character is defined to be "cased" if it has the Lowercase + or Uppercase property or has a General_Category value of + Titlecase_Letter. */ +static bool +is_cased (unsigned int ch) +{ + return (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); +} + +/* Quoting the Unicode standard: + Definition: A character is defined to be "case-ignorable" if it has the + value MidLetter {or the value MidNumLet} for the Word_Break property or + its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), + Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). + The text marked in braces was added in Unicode 5.1.0, see + section "Update of + Definition of case-ignorable". */ +/* Since this predicate is only used for the "Before C" and "After C" + conditions of FINAL_SIGMA, we exclude the "cased" characters here. + This simplifies the evaluation of the regular expressions + \p{cased} (\p{case-ignorable})* C + and + C (\p{case-ignorable})* \p{cased} + */ +static bool +is_case_ignorable (unsigned int ch) +{ + return (unicode_org_wbp[ch] == WBP_MIDLETTER + || unicode_org_wbp[ch] == WBP_MIDNUMLET + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)) + && !is_cased (ch); +} + +/* ------------------------------------------------------------------------- */ + +/* Output all case related properties. */ +static void +output_casing_properties (const char *version) +{ +#define PROPERTY(FN,P) \ + debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \ + output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \ + output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version); + PROPERTY(cased, cased) + PROPERTY(ignorable, case_ignorable) +#undef PROPERTY +} + +/* ========================================================================= */ + int main (int argc, char * argv[]) { const char *unicodedata_filename; const char *proplist_filename; const char *derivedproplist_filename; + const char *arabicshaping_filename; const char *scripts_filename; const char *blocks_filename; const char *proplist30_filename; const char *eastasianwidth_filename; const char *linebreak_filename; const char *wordbreakproperty_filename; + const char *graphemebreakproperty_filename; const char *compositionexclusions_filename; + const char *specialcasing_filename; + const char *casefolding_filename; const char *version; - if (argc != 12) + if (argc != 16) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt version\n", - argv[0]); + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + argv[0]); exit (1); } unicodedata_filename = argv[1]; proplist_filename = argv[2]; derivedproplist_filename = argv[3]; - scripts_filename = argv[4]; - blocks_filename = argv[5]; - proplist30_filename = argv[6]; - eastasianwidth_filename = argv[7]; - linebreak_filename = argv[8]; - wordbreakproperty_filename = argv[9]; - compositionexclusions_filename = argv[10]; - version = argv[11]; + arabicshaping_filename = argv[4]; + scripts_filename = argv[5]; + blocks_filename = argv[6]; + proplist30_filename = argv[7]; + eastasianwidth_filename = argv[8]; + linebreak_filename = argv[9]; + wordbreakproperty_filename = argv[10]; + graphemebreakproperty_filename = argv[11]; + compositionexclusions_filename = argv[12]; + specialcasing_filename = argv[13]; + casefolding_filename = argv[14]; + version = argv[15]; fill_attributes (unicodedata_filename); clear_properties (); fill_properties (proplist_filename); fill_properties (derivedproplist_filename); fill_properties30 (proplist30_filename); + fill_arabicshaping (arabicshaping_filename); fill_scripts (scripts_filename); fill_blocks (blocks_filename); fill_width (eastasianwidth_filename); fill_org_lbp (linebreak_filename); fill_org_wbp (wordbreakproperty_filename); + fill_org_gbp (graphemebreakproperty_filename); fill_composition_exclusions (compositionexclusions_filename); + fill_casing_rules (specialcasing_filename); + fill_casefolding_rules (casefolding_filename); + redistribute_casefolding_rules (); + sort_casing_rules (); output_categories (version); output_category ("unictype/categ_of.h", version); - output_combclass ("unictype/combining.h", version); + output_combclass ("unictype/combiningclass.h", version); output_bidi_category ("unictype/bidi_of.h", version); output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); output_decimal_digit ("unictype/decdigit.h", version); @@ -7511,10 +9711,17 @@ main (int argc, char * argv[]) output_numeric ("unictype/numeric.h", version); output_mirror ("unictype/mirror.h", version); output_properties (version); + output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version); + output_joining_type ("unictype/joiningtype_of.h", version); + output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version); + output_joining_group ("unictype/joininggroup_of.h", version); + output_scripts (version); output_scripts_byname (version); output_blocks (version); output_ident_properties (version); + output_nonspacing_property ("uniwidth/width.c.part"); + output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part"); output_old_ctype (version); debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); @@ -7525,6 +9732,9 @@ main (int argc, char * argv[]) debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h"); + output_gbp_table ("unigbrk/gbrkprop.h", version); + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); debug_output_composition_tables ("uninorm/composition.txt"); output_composition_tables ("uninorm/composition-table.gperf", version); @@ -7535,6 +9745,9 @@ main (int argc, char * argv[]) output_simple_mapping ("unicase/toupper.h", to_upper, version); output_simple_mapping ("unicase/tolower.h", to_lower, version); output_simple_mapping ("unicase/totitle.h", to_title, version); + output_simple_mapping ("unicase/tocasefold.h", to_casefold, version); + output_casing_rules ("unicase/special-casing-table.gperf", version); + output_casing_properties (version); return 0; } @@ -7545,17 +9758,23 @@ main (int argc, char * argv[]) * compile-command: " gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \ ./gen-uni-tables \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ - 5.1.0 + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \ + 6.0.0 \ + && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \ + && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt " * End: */