+/* Tells whether CH is white space in the sense of the Java Language
+   Specification, 3rd edition, §3.6: space, horizontal tab, form feed,
+   line feed or carriage return.
+   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */
+static bool
+is_java_whitespace (unsigned int ch)
+{
+  switch (ch)
+    {
+    case ' ':
+    case '\t':
+    case '\f':
+    case '\n':
+    case '\r':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Classifies CH for use in Java identifiers and returns one of
+   UC_IDENTIFIER_START, UC_IDENTIFIER_VALID, UC_IDENTIFIER_IGNORABLE or
+   UC_IDENTIFIER_INVALID.
+   See the Java Language Specification, 3rd edition, §3.8,
+   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
+   and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart.  */
+static int
+java_ident_category (unsigned int ch)
+{
+  /* FIXME: Check this against Sun's JDK implementation.  */
+
+  /* Characters that may begin an identifier.  */
+  bool may_start =
+    (is_category_L (ch)      /* = Character.isLetter(ch) */
+     || is_category_Nl (ch)  /* = Character.getType(ch)==LETTER_NUMBER */
+     || is_category_Sc (ch)  /* currency symbol */
+     || is_category_Pc (ch)  /* connector punctuation */);
+  if (may_start)
+    return UC_IDENTIFIER_START;
+
+  /* Characters that may only continue an identifier.  */
+  bool may_continue =
+    (is_category_Nd (ch)     /* digit */
+     || is_category_Mc (ch)  /* combining mark */
+     || is_category_Mn (ch)  /* non-spacing mark */);
+  if (may_continue)
+    return UC_IDENTIFIER_VALID;
+
+  /* Characters that are ignorable inside an identifier.  CH is unsigned,
+     so the first range needs no lower bound.  */
+  bool ignorable =
+    (ch <= 0x0008
+     || (ch >= 0x000E && ch <= 0x001B)
+     || (ch >= 0x007F && ch <= 0x009F)
+     || is_category_Cf (ch)  /* = Character.getType(ch)==FORMAT */);
+  if (ignorable)
+    return UC_IDENTIFIER_IGNORABLE;
+
+  return UC_IDENTIFIER_INVALID;
+}
+
+/* Construction of sparse 3-level tables.
+ The macros below parameterize the inclusion of "3level.h"; the code
+ further down then uses 'struct identsyntax_table' and the functions
+ identsyntax_table_init, identsyntax_table_add, identsyntax_table_finalize.
+ NOTE(review): presumably ELEMENT is the type of a stored value, DEFAULT the
+ value of entries never added, and xmalloc/xrealloc the allocators used by
+ 3level.h (mapped to plain malloc/realloc here) - confirm in 3level.h. */
+#define TABLE identsyntax_table
+#define ELEMENT uint8_t
+#define DEFAULT UC_IDENTIFIER_INVALID
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+/* Output an identifier syntax categorization in a three-level bitmap.
+   FILENAME is the file to create.  PREDICATE maps a code point to one of
+   the UC_IDENTIFIER_* codes.  NAME is the identifier given to the generated
+   table object.  VERSION is the Unicode version named in the generated
+   header comment.
+   Exits with status 1 if FILENAME cannot be opened or written.  */
+static void
+output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
+{
+  FILE *stream;
+  unsigned int ch, i;
+  struct identsyntax_table t;
+  unsigned int level1_offset, level2_offset, level3_offset;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  t.p = 7; /* or 8 */
+  t.q = 5; /* or 4 */
+  identsyntax_table_init (&t);
+
+  /* Only code points whose category differs from the table's DEFAULT
+     (UC_IDENTIFIER_INVALID) are added.  */
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      int syntaxcode = predicate (ch);
+      if (syntaxcode != UC_IDENTIFIER_INVALID)
+        identsyntax_table_add (&t, ch, syntaxcode);
+    }
+
+  identsyntax_table_finalize (&t);
+
+  /* Offsets in t.result, in memory of this process.  */
+  level1_offset =
+    5 * sizeof (uint32_t);
+  level2_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t);
+  level3_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t)
+    + (t.level2_size << t.q) * sizeof (uint32_t);
+
+  /* The five header words at the start of t.result.  Both the index and
+     the words are unsigned, hence %u.  */
+  for (i = 0; i < 5; i++)
+    fprintf (stream, "#define identsyntax_header_%u %u\n", i,
+             (unsigned int) ((uint32_t *) t.result)[i]);
+  fprintf (stream, "static const\n");
+  fprintf (stream, "struct\n");
+  fprintf (stream, " {\n");
+  fprintf (stream, " int level1[%zu];\n", t.level1_size);
+  fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+  fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
+           (1 << t.p) * 2 / 16);
+  fprintf (stream, " }\n");
+  fprintf (stream, "%s =\n", name);
+  fprintf (stream, "{\n");
+
+  /* Level 1: each stored byte offset is converted to an index into
+     level2[]; a stored offset of 0 is written as -1.  Eight values per
+     output line.  */
+  fprintf (stream, " {");
+  if (t.level1_size > 8)
+    fprintf (stream, "\n ");
+  for (i = 0; i < t.level1_size; i++)
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n ");
+      offset = ((uint32_t *) (t.result + level1_offset))[i];
+      if (offset == 0)
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level2_offset) / sizeof (uint32_t));
+      if (i+1 < t.level1_size)
+        fprintf (stream, ",");
+    }
+  if (t.level1_size > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+
+  /* Level 2: likewise, each stored byte offset is converted to an index
+     into the unpacked level 3 entries; 0 is written as -1.  */
+  fprintf (stream, " {");
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n ");
+  for (i = 0; i < t.level2_size << t.q; i++)
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n ");
+      offset = ((uint32_t *) (t.result + level2_offset))[i];
+      if (offset == 0)
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level3_offset) / sizeof (uint8_t));
+      if (i+1 < t.level2_size << t.q)
+        fprintf (stream, ",");
+    }
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+
+  /* Pack the level3 array.  Each entry needs 2 bits only, so eight
+     consecutive entries are combined into one 16-bit word, with the first
+     entry in the lowest two bits.  */
+  fprintf (stream, " {");
+  if ((t.level3_size << t.p) * 2 / 16 > 8)
+    fprintf (stream, "\n ");
+  for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
+    {
+      const uint8_t *entries =
+        (const uint8_t *) (t.result + level3_offset) + 8 * i;
+      unsigned int packed = 0;
+      unsigned int k;
+
+      /* Accumulate in an unsigned value, matching the %x conversion.  */
+      for (k = 0; k < 8; k++)
+        packed |= (unsigned int) entries[k] << (2 * k);
+
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n ");
+      fprintf (stream, " 0x%04x", packed);
+      if (i+1 < (t.level3_size << t.p) * 2 / 16)
+        fprintf (stream, ",");
+    }
+  if ((t.level3_size << t.p) * 2 / 16 > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " }\n");
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Emits the output files for the programming language syntax properties.
+   For each of is_c_whitespace and is_java_whitespace it calls
+   debug_output_predicate, output_predicate_test and output_predicate; for
+   the C and Java identifier categorizations it calls
+   output_ident_category.  VERSION is passed through to those routines.  */
+static void
+output_ident_properties (const char *version)
+{
+#define PROPERTY(P) \
+  do \
+    { \
+      debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
+      output_predicate_test ("../tests/unictype/test-sy_" #P ".c", \
+                             is_ ## P, "uc_is_" #P " (c)"); \
+      output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, \
+                        "Language syntax properties", version); \
+    } \
+  while (0)
+
+  PROPERTY (c_whitespace);
+  PROPERTY (java_whitespace);
+
+#undef PROPERTY
+
+  output_ident_category ("unictype/sy_c_ident.h", c_ident_category,
+                         "u_c_ident", version);
+  output_ident_category ("unictype/sy_java_ident.h", java_ident_category,
+                         "u_java_ident", version);
+}
+
+/* ========================================================================= */
+
+/* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
+ glibc/localedata/locales/i18n file, generated by
+ glibc/localedata/gen-unicode-ctype.c. */
+
+/* Character mappings. */
+
+/* Returns the 'upper' mapping recorded for CH in unicode_attributes[], or
+   CH itself when CH has no name entry or no such mapping (NONE).  */
+static unsigned int
+to_upper (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].upper == NONE)
+    return ch;
+  return unicode_attributes[ch].upper;
+}
+
+/* Returns the 'lower' mapping recorded for CH in unicode_attributes[], or
+   CH itself when CH has no name entry or no such mapping (NONE).  */
+static unsigned int
+to_lower (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].lower == NONE)
+    return ch;
+  return unicode_attributes[ch].lower;
+}
+
+/* Returns the 'title' mapping recorded for CH in unicode_attributes[], or
+   CH itself when CH has no name entry or no such mapping (NONE).  */
+static unsigned int
+to_title (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].title == NONE)
+    return ch;
+  return unicode_attributes[ch].title;
+}
+
+/* Character class properties. */
+
+/* CH counts as uppercase when to_lower maps it to a different character.  */
+static bool
+is_upper (unsigned int ch)
+{
+  unsigned int lowered = to_lower (ch);
+
+  return lowered != ch;
+}
+
+/* CH counts as lowercase when to_upper maps it to a different character,
+   or when it is U+00DF.  */
+static bool
+is_lower (unsigned int ch)
+{
+  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
+  if (ch == 0x00DF)
+    return true;
+  return to_upper (ch) != ch;
+}
+
+/* Tells whether CH belongs to the "alpha" class.  Only characters with a
+   name entry in unicode_attributes[] qualify; the individual rules are
+   explained below.  */
+static bool
+is_alpha (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+  const char *category;
+
+  if (name == NULL)
+    return false;
+  category = unicode_attributes[ch].category;
+
+  /* Letters, except that Theppitak Karoonboonyanan
+     <thep@links.nectec.or.th> says <U0E2F>, <U0E46> should belong to
+     is_punct.  */
+  if (category[0] == 'L' && ch != 0x0E2F && ch != 0x0E46)
+    return true;
+
+  /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
+     <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
+  if (ch == 0x0E31
+      || (ch >= 0x0E34 && ch <= 0x0E3A)
+      || (ch >= 0x0E47 && ch <= 0x0E4E))
+    return true;
+
+  /* Avoid warning for <U0345>.  */
+  if (ch == 0x0345)
+    return true;
+
+  /* Avoid warnings for <U2160>..<U217F>.  */
+  if (category[0] == 'N' && category[1] == 'l')
+    return true;
+
+  /* Avoid warnings for <U24B6>..<U24E9>.  */
+  if (category[0] == 'S' && category[1] == 'o'
+      && strstr (name, " LETTER ") != NULL)
+    return true;
+
+  /* Consider all the non-ASCII digits as alphabetic.
+     ISO C 99 forbids us to have them in category "digit",
+     but we want iswalnum to return true on them.  */
+  return (category[0] == 'N' && category[1] == 'd'
+          && (ch < 0x0030 || ch > 0x0039));
+}
+
+/* Tells whether CH is one of the ten ASCII digits U+0030..U+0039.  */
+static bool
+is_digit (unsigned int ch)
+{
+#if 0
+  return (unicode_attributes[ch].name != NULL
+          && unicode_attributes[ch].category[0] == 'N'
+          && unicode_attributes[ch].category[1] == 'd');
+  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
+     a zero.  Must add <0> in front of them by hand.  */
+#else
+  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+     takes it away:
+     7.25.2.1.5:
+        The iswdigit function tests for any wide character that corresponds
+        to a decimal-digit character (as defined in 5.2.1).
+     5.2.1:
+        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+   */
+  if (ch < 0x0030)
+    return false;
+  return ch <= 0x0039;
+#endif
+}
+
+/* Tells whether CH is one of the ten ASCII digits U+0030..U+0039.  */
+static bool
+is_outdigit (unsigned int ch)
+{
+  if (ch < 0x0030)
+    return false;
+  return ch <= 0x0039;
+}
+
+/* Tells whether CH is in the "alpha" class or the "digit" class.  */
+static bool
+is_alnum (unsigned int ch)
+{
+  if (is_alpha (ch))
+    return true;
+  return is_digit (ch);
+}
+
+/* Tells whether CH is a blank: the tab character, or a named character of
+   category Zs whose decomposition does not mention "<noBreak>".  */
+static bool
+is_blank (unsigned int ch)
+{
+  if (ch == 0x0009) /* '\t' */
+    return true;
+
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+
+  /* Category Zs without mention of "<noBreak>" */
+  return (unicode_attributes[ch].category[0] == 'Z'
+          && unicode_attributes[ch].category[1] == 's'
+          && strstr (unicode_attributes[ch].decomposition, "<noBreak>")
+             == NULL);
+}
+
+/* Tells whether CH is a space character: one of the six ASCII white space
+   characters, or a named character of category Zl, Zp, or Zs without
+   "<noBreak>" in its decomposition.  */
+static bool
+is_space (unsigned int ch)
+{
+  /* Don't make U+00A0 a space.  Non-breaking space means that all programs
+     should treat it like a punctuation character, not like a space.  */
+  switch (ch)
+    {
+    case 0x0020: /* ' ' */
+    case 0x000C: /* '\f' */
+    case 0x000A: /* '\n' */
+    case 0x000D: /* '\r' */
+    case 0x0009: /* '\t' */
+    case 0x000B: /* '\v' */
+      return true;
+    default:
+      break;
+    }
+
+  /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
+  if (unicode_attributes[ch].name == NULL
+      || unicode_attributes[ch].category[0] != 'Z')
+    return false;
+
+  switch (unicode_attributes[ch].category[1])
+    {
+    case 'l':
+    case 'p':
+      return true;
+    case 's':
+      return strstr (unicode_attributes[ch].decomposition, "<noBreak>")
+             == NULL;
+    default:
+      return false;
+    }
+}
+
+/* Tells whether CH is a control character: a character named "<control>"
+   or a named character of category Zl or Zp.  */
+static bool
+is_cntrl (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+
+  if (name == NULL)
+    return false;
+  if (strcmp (name, "<control>") == 0)
+    return true;
+
+  /* Categories Zl and Zp */
+  if (unicode_attributes[ch].category[0] != 'Z')
+    return false;
+  return (unicode_attributes[ch].category[1] == 'l'
+          || unicode_attributes[ch].category[1] == 'p');
+}
+
+/* Tells whether CH is an ASCII hexadecimal digit: 0-9, A-F or a-f.  */
+static bool
+is_xdigit (unsigned int ch)
+{
+#if 0
+  return is_digit (ch)
+         || (ch >= 0x0041 && ch <= 0x0046)
+         || (ch >= 0x0061 && ch <= 0x0066);
+#else
+  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+     takes it away:
+     7.25.2.1.12:
+        The iswxdigit function tests for any wide character that corresponds
+        to a hexadecimal-digit character (as defined in 6.4.4.1).
+     6.4.4.1:
+        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+   */
+  if (ch >= 0x0030 && ch <= 0x0039) /* 0-9 */
+    return true;
+  if (ch >= 0x0041 && ch <= 0x0046) /* A-F */
+    return true;
+  return ch >= 0x0061 && ch <= 0x0066; /* a-f */
+#endif
+}
+
+/* Tells whether CH is a graphic character: a named character that is
+   neither named "<control>" nor a space according to is_space.  */
+static bool
+is_graph (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+
+  if (name == NULL)
+    return false;
+  if (strcmp (name, "<control>") == 0)
+    return false;
+  return !is_space (ch);
+}
+
+/* Tells whether CH is printable: a named character that is neither named
+   "<control>" nor of category Zl or Zp.  */
+static bool
+is_print (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+
+  if (name == NULL)
+    return false;
+  if (strcmp (name, "<control>") == 0)
+    return false;
+
+  /* Categories Zl and Zp */
+  if (unicode_attributes[ch].category[0] == 'Z'
+      && (unicode_attributes[ch].category[1] == 'l'
+          || unicode_attributes[ch].category[1] == 'p'))
+    return false;
+
+  return true;
+}
+
+/* Tells whether CH is punctuation: graphic according to is_graph, but
+   neither is_alpha nor is_digit.  */
+static bool
+is_punct (unsigned int ch)
+{
+#if 0
+  return (unicode_attributes[ch].name != NULL
+          && unicode_attributes[ch].category[0] == 'P');
+#else
+  /* The traditional POSIX definition of punctuation is every graphic,
+     non-alphanumeric character.  */
+  if (!is_graph (ch))
+    return false;
+  if (is_alpha (ch))
+    return false;
+  return !is_digit (ch);
+#endif
+}
+
+/* Output all <ctype.h>-like properties.  For each class predicate is_XXX
+   this calls debug_output_predicate, output_predicate_test and
+   output_predicate with file names derived from XXX.  VERSION is passed
+   through to output_predicate.  */
+static void
+output_old_ctype (const char *version)
+{
+#define PROPERTY(P) \
+  do \
+    { \
+      debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
+      output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", \
+                             is_ ## P, "uc_is_" #P " (c)"); \
+      output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, \
+                        "ISO C <ctype.h> like properties", version); \
+    } \
+  while (0)
+
+  PROPERTY (alnum);
+  PROPERTY (alpha);
+  PROPERTY (cntrl);
+  PROPERTY (digit);
+  PROPERTY (graph);
+  PROPERTY (lower);
+  PROPERTY (print);
+  PROPERTY (punct);
+  PROPERTY (space);
+  PROPERTY (upper);
+  PROPERTY (xdigit);
+  PROPERTY (blank);
+
+#undef PROPERTY
+}
+
+#if 0
+
+/* Tells whether CH is a combining character, i.e. a named character of
+   general category Mn, Mc or Me.  */
+static bool
+is_combining (unsigned int ch)
+{
+  /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+     file.  In 3.0.1 it was identical to the union of the general categories
+     "Mn", "Mc", "Me".  In Unicode 3.1 this property has been dropped from the
+     PropList.txt file, so we take the latter definition.  */
+  if (unicode_attributes[ch].name == NULL
+      || unicode_attributes[ch].category[0] != 'M')
+    return false;
+
+  switch (unicode_attributes[ch].category[1])
+    {
+    case 'n':
+    case 'c':
+    case 'e':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Tells whether CH is a combining character (see is_combining) whose
+   'combining' attribute is empty, starts with '0', or denotes a number
+   below 200.  */
+static bool
+is_combining_level3 (unsigned int ch)
+{
+  const char *combining;
+
+  if (!is_combining (ch))
+    return false;
+
+  combining = unicode_attributes[ch].combining;
+  if (combining[0] == '\0' || combining[0] == '0')
+    return true;
+  return strtoul (combining, NULL, 10) < 200;
+}
+
+/* Return the UCS symbol string for a Unicode character: "<UXXXX>" with
+   four hexadecimal digits for I below 0x10000, "<UXXXXXXXX>" with eight
+   digits otherwise.  The result lives in a static buffer that the next call
+   overwrites.  */
+static const char *
+ucs_symbol (unsigned int i)
+{
+  static char buf[11+1];
+
+  if (i < 0x10000)
+    sprintf (buf, "<U%04X>", i);
+  else
+    sprintf (buf, "<U%08X>", i);
+  return buf;
+}
+
+/* Return the UCS symbol range string for a Unicode characters interval:
+   the symbol of LOW, "..", and the symbol of HIGH.  The result lives in a
+   static buffer that the next call overwrites.  */
+static const char *
+ucs_symbol_range (unsigned int low, unsigned int high)
+{
+  static char buf[24+1];
+
+  /* ucs_symbol hands out a static buffer, so the first symbol is copied
+     into BUF before the second one is requested.  */
+  sprintf (buf, "%s..", ucs_symbol (low));
+  strcpy (buf + strlen (buf), ucs_symbol (high));
+  return buf;
+}
+
+/* Output a character class (= property) table.
+   Writes CLASSNAME followed by the maximal runs of code points for which
+   FUNC returns true, as UCS symbols or symbol ranges separated by ';'.
+   A continuation line is started whenever the next item would go past
+   column 75.  */
+
+static void
+output_charclass (FILE *stream, const char *classname,
+                  bool (*func) (unsigned int))
+{
+  char table[0x110000];
+  const int max_column = 75;
+  bool need_semicolon = false;
+  int column = 1000; /* large, so that the first item starts a new line */
+  unsigned int i;
+
+  /* Evaluate FUNC once per code point, up front.  */
+  for (i = 0; i < 0x110000; i++)
+    table[i] = (int) func (i);
+
+  fprintf (stream, "%s ", classname);
+
+  i = 0;
+  while (i < 0x110000)
+    {
+      unsigned int low, high;
+      char buf[25];
+
+      if (!table[i])
+        {
+          i++;
+          continue;
+        }
+
+      /* Find the maximal run [LOW, HIGH] of members starting at I.  */
+      low = i;
+      for (i++; i < 0x110000 && table[i]; i++)
+        ;
+      high = i - 1;
+
+      strcpy (buf,
+              low == high ? ucs_symbol (low) : ucs_symbol_range (low, high));
+
+      if (need_semicolon)
+        {
+          fprintf (stream, ";");
+          column++;
+        }
+
+      if (column + strlen (buf) > max_column)
+        {
+          fprintf (stream, "/\n ");
+          column = 3;
+        }
+
+      fprintf (stream, "%s", buf);
+      column += strlen (buf);
+      need_semicolon = true;
+    }
+  fprintf (stream, "\n");
+}
+
+/* Output a character mapping table.
+   Writes MAPNAME followed by one "(<Ufrom>,<Uto>)" pair for every code
+   point that FUNC maps to a different code point, separated by ';'.
+   A continuation line is started whenever the next pair would go past
+   column 75.  */
+
+static void
+output_charmap (FILE *stream, const char *mapname,
+                unsigned int (*func) (unsigned int))
+{
+  char table[0x110000];
+  const int max_column = 75;
+  bool need_semicolon = false;
+  int column = 1000; /* large, so that the first pair starts a new line */
+  unsigned int i;
+
+  /* Remember which code points have a non-identity mapping.  */
+  for (i = 0; i < 0x110000; i++)
+    table[i] = (func (i) != i);
+
+  fprintf (stream, "%s ", mapname);
+
+  for (i = 0; i < 0x110000; i++)
+    {
+      char buf[25+1];
+
+      if (!table[i])
+        continue;
+
+      /* ucs_symbol hands out a static buffer, so each symbol is copied
+         into BUF before the next one is requested.  */
+      sprintf (buf, "(%s,", ucs_symbol (i));
+      strcat (buf, ucs_symbol (func (i)));
+      strcat (buf, ")");
+
+      if (need_semicolon)
+        {
+          fprintf (stream, ";");
+          column++;
+        }
+
+      if (column + strlen (buf) > max_column)
+        {
+          fprintf (stream, "/\n ");
+          column = 3;
+        }
+
+      fprintf (stream, "%s", buf);
+      column += strlen (buf);
+      need_semicolon = true;
+    }
+  fprintf (stream, "\n");
+}
+
+/* Output the width table.
+ Currently a stub with an empty body: nothing is written to STREAM. */
+
+static void
+output_widthmap (FILE *stream)
+{
+}
+
+/* Output the tables to the given file.
+ FILENAME is created and receives an LC_IDENTIFICATION section followed by
+ an LC_CTYPE section with the character classes and case mappings.
+ VERSION is the Unicode version named in the header, title and revision.
+ Before the LC_CTYPE section is written, the class predicates are checked
+ against each other for every code point; each violation is reported on
+ stderr but does not stop the output.
+ Exits with status 1 if FILENAME cannot be opened or written. */
+
+static void
+output_tables (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "escape_char /\n");
+ fprintf (stream, "comment_char %%\n");
+ fprintf (stream, "\n");
+ fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
+ version);
+ fprintf (stream, "\n");
+
+ fprintf (stream, "LC_IDENTIFICATION\n");
+ fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
+ fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
+ fprintf (stream, "address \"\"\n");
+ fprintf (stream, "contact \"\"\n");
+ fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
+ fprintf (stream, "tel \"\"\n");
+ fprintf (stream, "fax \"\"\n");
+ fprintf (stream, "language \"\"\n");
+ fprintf (stream, "territory \"Earth\"\n");
+ fprintf (stream, "revision \"%s\"\n", version);
+ {
+ /* The date field is the current UTC date as YYYY-MM-DD, which needs
+ 10 characters plus the terminating NUL. */
+ time_t now;
+ char date[11];
+ now = time (NULL);
+ strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
+ fprintf (stream, "date \"%s\"\n", date);
+ }
+ fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
+ fprintf (stream, "END LC_IDENTIFICATION\n");
+ fprintf (stream, "\n");
+
+ /* Verification. */
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ /* toupper restriction: "Only characters specified for the keywords
+ lower and upper shall be specified." */
+ if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
+ fprintf (stderr,
+ "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
+ ucs_symbol (ch), ch, to_upper (ch));
+
+ /* tolower restriction: "Only characters specified for the keywords
+ lower and upper shall be specified." */
+ if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
+ fprintf (stderr,
+ "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
+ ucs_symbol (ch), ch, to_lower (ch));
+
+ /* alpha restriction: "Characters classified as either upper or lower
+ shall automatically belong to this class." */
+ if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
+ fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
+
+ /* alpha restriction: "No character specified for the keywords cntrl,
+ digit, punct or space shall be specified." */
+ if (is_alpha (ch) && is_cntrl (ch))
+ fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_digit (ch))
+ fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_punct (ch))
+ fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_space (ch))
+ fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
+
+ /* space restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, graph or xdigit shall be specified."
+ upper, lower, alpha already checked above. */
+ if (is_space (ch) && is_digit (ch))
+ fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
+ if (is_space (ch) && is_graph (ch))
+ fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
+ if (is_space (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
+
+ /* cntrl restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, punct, graph, print or xdigit shall be
+ specified." upper, lower, alpha already checked above. */
+ if (is_cntrl (ch) && is_digit (ch))
+ fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_punct (ch))
+ fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_graph (ch))
+ fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_print (ch))
+ fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
+
+ /* punct restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+ be specified." upper, lower, alpha, cntrl already checked above. */
+ if (is_punct (ch) && is_digit (ch))
+ fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
+ if (is_punct (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
+ if (is_punct (ch) && (ch == 0x0020))
+ fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
+
+ /* graph restriction: "No character specified for the keyword cntrl
+ shall be specified." Already checked above. */
+
+ /* print restriction: "No character specified for the keyword cntrl
+ shall be specified." Already checked above. */
+
+ /* graph - print relation: differ only in the <space> character.
+ How is this possible if there are more than one space character?!
+ I think susv2/xbd/locale.html should speak of "space characters",
+ not "space character". */
+ if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
+ fprintf (stderr,
+ "%s is print but not graph|<space>\n", ucs_symbol (ch));
+ if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
+ fprintf (stderr,
+ "%s is graph|<space> but not print\n", ucs_symbol (ch));
+ }
+
+ /* The LC_CTYPE section: character classes first, then the mappings. */
+ fprintf (stream, "LC_CTYPE\n");
+ output_charclass (stream, "upper", is_upper);
+ output_charclass (stream, "lower", is_lower);
+ output_charclass (stream, "alpha", is_alpha);
+ output_charclass (stream, "digit", is_digit);
+ output_charclass (stream, "outdigit", is_outdigit);
+ output_charclass (stream, "blank", is_blank);
+ output_charclass (stream, "space", is_space);
+ output_charclass (stream, "cntrl", is_cntrl);
+ output_charclass (stream, "punct", is_punct);
+ output_charclass (stream, "xdigit", is_xdigit);
+ output_charclass (stream, "graph", is_graph);
+ output_charclass (stream, "print", is_print);
+ output_charclass (stream, "class \"combining\";", is_combining);
+ output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
+ output_charmap (stream, "toupper", to_upper);
+ output_charmap (stream, "tolower", to_lower);
+ output_charmap (stream, "map \"totitle\";", to_title);
+ output_widthmap (stream);
+ fprintf (stream, "END LC_CTYPE\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+#endif
+
+/* ========================================================================= */
+
+/* The width property from the EastAsianWidth.txt file, indexed by code
+ point. Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
+ Filled in by fill_width (). */
+const char * unicode_width[0x110000];
+
+/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
+   file WIDTH_FILENAME.
+   Every code point with a name entry in unicode_attributes[] starts out as
+   "N", every other one as NULL.  Then each data line is applied: its first
+   field (read up to ';') is a hexadecimal code point or a range "A..B", its
+   second field (read up to the next ' ') is the width value.  Lines that
+   start with '#' are skipped.
+   Exits with status 1 if the file cannot be read, if a line has too few
+   fields, if a line names a code point above 0x10FFFF, or if memory is
+   exhausted.  */
+static void
+fill_width (const char *width_filename)
+{
+  unsigned int i;
+  FILE *stream;
+  char field0[FIELDLEN];
+  char field1[FIELDLEN];
+  char field2[FIELDLEN];
+  int lineno = 0;
+
+  for (i = 0; i < 0x110000; i++)
+    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
+
+  stream = fopen (width_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
+      exit (1);
+    }
+
+  for (;;)
+    {
+      int n;
+      int c;
+      const char *dots;
+      unsigned long first, last;
+
+      lineno++;
+      c = getc (stream);
+      if (c == EOF)
+        break;
+      if (c == '#')
+        {
+          do c = getc (stream); while (c != EOF && c != '\n');
+          continue;
+        }
+      ungetc (c, stream);
+      n = getfield (stream, field0, ';');
+      n += getfield (stream, field1, ' ');
+      n += getfield (stream, field2, '\n');
+      if (n == 0)
+        break;
+      if (n != 3)
+        {
+          fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
+          exit (1);
+        }
+
+      /* Parse into unsigned long, so that an oversized value is seen as
+         such instead of being truncated to unsigned int.  A single
+         character line is treated as a range of length one.  */
+      first = strtoul (field0, NULL, 16);
+      dots = strstr (field0, "..");
+      last = (dots != NULL ? strtoul (dots + 2, NULL, 16) : first);
+
+      /* Never index unicode_width[] outside 0..0x10FFFF.  */
+      if (first >= 0x110000 || last >= 0x110000)
+        {
+          fprintf (stderr, "code point out of range in '%s':%d\n",
+                   width_filename, lineno);
+          exit (1);
+        }
+
+      /* As before, every code point gets its own copy of the value, and a
+         range whose end lies before its start assigns nothing.  */
+      for (i = first; i <= last; i++)
+        {
+          char *width = strdup (field1);
+          if (width == NULL)
+            {
+              fprintf (stderr, "out of memory while reading '%s'\n",
+                       width_filename);
+              exit (1);
+            }
+          unicode_width[i] = width;
+        }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", width_filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Non-spacing attribute and width. */
+
+/* The non-spacing attribute table consists of:
+ - Non-spacing characters; generated from PropList.txt or
+ "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
+ - Control characters (general category Cc)
+ - Format control characters; generated from
+ "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+ - Zero width characters; generated from
+ "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+ */
+
+/* Tells whether CH goes into the non-spacing table: a named character
+   that has bidi category NSM, or general category Cc or Cf, or a name
+   starting with "ZERO WIDTH ".  */
+static bool
+is_nonspacing (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+
+  if (name == NULL)
+    return false;
+  if (get_bidi_category (ch) == UC_BIDI_NSM)
+    return true;
+  if (is_category_Cc (ch) || is_category_Cf (ch))
+    return true;
+  return strncmp (name, "ZERO WIDTH ", 11) == 0;
+}
+
+/* Writes the non-spacing property tables to FILENAME, as two C arrays:
+   - nonspacing_table_data holds one 64-byte bitmap (one bit per code
+     point, lowest bit first) for each block of 0x200 code points that
+     contains at least one character satisfying is_nonspacing;
+   - nonspacing_table_ind maps a block number to the index of its bitmap,
+     or to -1 if the block has none.  Its length is rounded up to a
+     multiple of 8 and covers all blocks up to the last one with a bitmap.
+   The block at 0xe0000 never gets a bitmap, since it is handled by code.
+   Exits with status 1 if FILENAME cannot be opened or written.  */
+static void
+output_nonspacing_property (const char *filename)
+{
+  FILE *stream;
+  int ind[0x110000 / 0x200];
+  unsigned int i;
+  unsigned int i_max;
+  int next_ind;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  /* First pass: assign consecutive bitmap indices to the nontrivial
+     blocks.  */
+  next_ind = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = false;
+      unsigned int ch;
+
+      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
+        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
+          if (is_nonspacing (ch))
+            {
+              nontrivial = true;
+              break;
+            }
+      if (nontrivial)
+        ind[i] = next_ind++;
+      else
+        ind[i] = -1;
+    }
+
+  /* Second pass: write the bitmaps, 8 rows of 8 bytes per block.  */
+  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
+           next_ind);
+  i_max = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = (ind[i] >= 0);
+
+      if (nontrivial)
+        {
+          unsigned int j;
+
+          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
+          for (j = 0; j < 8; j++)
+            {
+              unsigned int k;
+
+              fprintf (stream, " ");
+              for (k = 0; k < 8; k++)
+                {
+                  unsigned int l;
+                  unsigned char bits = 0;
+
+                  for (l = 0; l < 8; l++)
+                    {
+                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
+
+                      if (is_nonspacing (ch))
+                        bits |= 1 << l;
+                    }
+                  /* No comma after the very last byte of the array.  */
+                  fprintf (stream, " 0x%02x%c", bits,
+                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
+                }
+              fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
+            }
+          /* I_MAX is the number of index entries needed so far, i.e. one
+             more than the last nontrivial block number.  Using I itself
+             would drop this block's entry from the index table whenever I
+             is a multiple of 8.  */
+          i_max = i + 1;
+        }
+    }
+  fprintf (stream, "};\n");
+
+  /* Round the index table length up to a multiple of 8, the number of
+     entries per output row.  */
+  i_max = ((i_max + 8 - 1) / 8) * 8;
+  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
+           i_max);
+  {
+    unsigned int j;
+
+    for (j = 0; j < i_max / 8; j++)
+      {
+        unsigned int k;
+
+        fprintf (stream, " ");
+        for (k = 0; k < 8; k++)
+          {
+            i = j * 8 + k;
+            fprintf (stream, " %2d%c", ind[i],
+                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
+          }
+        fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
+      }
+  }
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Returns the width of ch as one of 0, '0', '1', '2', 'A'.
+   0 is returned for Cc control characters below U+00A0 and for unassigned
+   code points outside the ranges listed below; callers skip these.  */
+static char
+symbolic_width (unsigned int ch)
+{
+  const char *width;
+
+  /* Test for unassigned character.  */
+  if (is_property_unassigned_code_value (ch))
+    {
+      /* Unicode TR#11 section "Unassigned and Private-Use Characters".  */
+      if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
+        return 'A';
+      if (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
+        return '2';
+      if (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
+        return '2';
+      if (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
+        return '2';
+      if (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
+        return '2';
+      if (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */
+        return '2';
+      return 0;
+    }
+
+  /* Test for non-spacing or control character.  */
+  if (is_category_Cc (ch) && ch < 0x00A0)
+    return 0;
+  if (is_nonspacing (ch))
+    return '0';
+
+  width = unicode_width[ch];
+  if (width != NULL)
+    {
+      /* Test for double-width character.  */
+      if (strcmp (width, "W") == 0 || strcmp (width, "F") == 0)
+        return '2';
+      /* Test for half-width character.  */
+      if (strcmp (width, "H") == 0)
+        return '1';
+    }
+
+  /* In ancient CJK encodings, Cyrillic and most other characters are
+     double-width as well.  */
+  if (ch >= 0x00A1 && ch < 0x10000)
+    return 'A';
+  return '1';
+}
+
+/* Writes to FILENAME one line per maximal interval of code points that
+   share the same symbolic_width value, as "XXXX<TAB><TAB>V" for a single
+   code point or "XXXX..YYYY<TAB>V" for a range.  Code points for which
+   symbolic_width returns 0 are skipped and do not end an interval.
+   Exits with status 1 if FILENAME cannot be opened or written.  */
+static void
+output_width_property_test (const char *filename)
+{
+  FILE *stream;
+  unsigned int interval_start = 0; /* avoid GCC warning */
+  unsigned int interval_end = 0;   /* avoid GCC warning */
+  char interval_value = 0;         /* 0 means: no interval open yet */
+  unsigned int ch;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      char value = symbolic_width (ch);
+
+      /* Skip Cc control characters and unassigned characters.  */
+      if (value == 0)
+        continue;
+
+      if (value == interval_value)
+        {
+          /* Extend the interval.  */
+          interval_end = ch;
+          continue;
+        }
+
+      /* Terminate the interval.  */
+      if (interval_value != 0)
+        {
+          if (interval_end == interval_start)
+            fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+          else
+            fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+        }
+
+      /* Start a new interval.  */
+      interval_start = ch;
+      interval_end = ch;
+      interval_value = value;
+    }
+
+  /* Terminate the last interval.  */
+  if (interval_value != 0)
+    {
+      if (interval_end == interval_start)
+        fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+      else
+        fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Line breaking classification.
+   Updated for Unicode TR #14 revision 26.
+   The enumerators are listed by numeric value.  Four classes have no
+   enumerator here:
+     CR (carriage return) - not used here because it's a DOSism
+     LF (line feed)       - not used here because it's a DOSism
+     NL (next line)       - not used here because it's equivalent to LBP_BK
+     SG (surrogates)      - not used here because they are not characters  */
+
+enum
+{
+  LBP_WJ =  0, /* word joiner */
+  LBP_GL =  1, /* non-breaking (glue) */
+  LBP_B2 =  2, /* break opportunity before and after */
+  LBP_BA =  3, /* break opportunity after */
+  LBP_BB =  4, /* break opportunity before */
+  LBP_HY =  5, /* hyphen */
+  LBP_CL =  6, /* closing punctuation */
+  LBP_CP =  7, /* closing parenthesis */
+  LBP_EX =  8, /* exclamation/interrogation */
+  LBP_IN =  9, /* inseparable */
+  LBP_NS = 10, /* non starter */
+  LBP_OP = 11, /* opening punctuation */
+  LBP_QU = 12, /* ambiguous quotation */
+  LBP_IS = 13, /* infix separator (numeric) */
+  LBP_NU = 14, /* numeric */
+  LBP_PO = 15, /* postfix (numeric) */
+  LBP_PR = 16, /* prefix (numeric) */
+  LBP_SY = 17, /* symbols allowing breaks */
+  LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+  LBP_H2 = 19, /* Hangul LV syllable */
+  LBP_H3 = 20, /* Hangul LVT syllable */
+  LBP_ID = 21, /* ideographic */
+  LBP_JL = 22, /* Hangul L Jamo */
+  LBP_JV = 23, /* Hangul V Jamo */
+  LBP_JT = 24, /* Hangul T Jamo */
+
+  /* Values >= 25 are resolved at run time. */
+  LBP_BK = 25, /* mandatory break */
+  LBP_CM = 26, /* attached characters and combining marks */
+  LBP_ZW = 27, /* zero width space */
+  LBP_SP = 28, /* space */
+  LBP_CB = 29, /* contingent break opportunity */
+  LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+  LBP_SA = 31, /* complex context (South East Asian) */
+  LBP_XX = 32  /* unknown */
+};
+
+/* Returns the line breaking classification for ch, as a bit mask:
+ bit LBP_xx is set when ch is assigned to class xx.
+ The tests below run in a fixed order, and several of the later ones
+ consult bits that were set by earlier ones, so they must not be reordered.
+ A character that has no name in unicode_attributes[] is treated as
+ unassigned; it gets LBP_ID if it lies in one of the ideographic ranges
+ listed at the end. If no class matches at all, the result is the single
+ bit LBP_XX.
+ NOTE(review): ch is used directly as an index into unicode_attributes[]
+ and unicode_width[]; presumably it must be < 0x110000 - confirm against
+ the array declarations. */
+static int64_t
+get_lbp (unsigned int ch)
+{
+ int64_t attr = 0;
+
+ /* Assigned characters are those that have a name. */
+ if (unicode_attributes[ch].name != NULL)
+ {
+ /* mandatory break */
+ if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
+ || ch == 0x000C /* form feed */
+ || ch == 0x000B /* line tabulation */
+ || ch == 0x2028 /* LINE SEPARATOR */
+ || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
+ attr |= (int64_t) 1 << LBP_BK;
+
+ /* word joiner */
+ if (ch == 0x2060 /* WORD JOINER */
+ || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
+ attr |= (int64_t) 1 << LBP_WJ;
+
+ /* zero width space */
+ if (ch == 0x200B /* ZERO WIDTH SPACE */)
+ attr |= (int64_t) 1 << LBP_ZW;
+
+ /* non-breaking (glue) */
+ if (ch == 0x00A0 /* NO-BREAK SPACE */
+ || ch == 0x202F /* NARROW NO-BREAK SPACE */
+ || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
+ || ch == 0x034F /* COMBINING GRAPHEME JOINER */
+ || ch == 0x2007 /* FIGURE SPACE */
+ || ch == 0x2011 /* NON-BREAKING HYPHEN */
+ || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
+ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
+ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
+ || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
+ || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
+ attr |= (int64_t) 1 << LBP_GL;
+
+ /* space */
+ if (ch == 0x0020 /* SPACE */)
+ attr |= (int64_t) 1 << LBP_SP;
+
+ /* break opportunity before and after */
+ if (ch == 0x2014 /* EM DASH */)
+ attr |= (int64_t) 1 << LBP_B2;
+
+ /* break opportunity after */
+ if (/* Breaking Spaces */
+ ch == 0x1680 /* OGHAM SPACE MARK */
+ || ch == 0x2000 /* EN QUAD */
+ || ch == 0x2001 /* EM QUAD */
+ || ch == 0x2002 /* EN SPACE */
+ || ch == 0x2003 /* EM SPACE */
+ || ch == 0x2004 /* THREE-PER-EM SPACE */
+ || ch == 0x2005 /* FOUR-PER-EM SPACE */
+ || ch == 0x2006 /* SIX-PER-EM SPACE */
+ || ch == 0x2008 /* PUNCTUATION SPACE */
+ || ch == 0x2009 /* THIN SPACE */
+ || ch == 0x200A /* HAIR SPACE */
+ || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
+ /* Tabs */
+ || ch == 0x0009 /* tab */
+ /* Conditional Hyphens */
+ || ch == 0x00AD /* SOFT HYPHEN */
+ /* Breaking Hyphens */
+ || ch == 0x058A /* ARMENIAN HYPHEN */
+ || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
+ || ch == 0x2010 /* HYPHEN */
+ || ch == 0x2012 /* FIGURE DASH */
+ || ch == 0x2013 /* EN DASH */
+ /* Visible Word Dividers */
+ || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
+ || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
+ || ch == 0x1361 /* ETHIOPIC WORDSPACE */
+ || ch == 0x17D8 /* KHMER SIGN BEYYAL */
+ || ch == 0x17DA /* KHMER SIGN KOOMUUT */
+ || ch == 0x2027 /* HYPHENATION POINT */
+ || ch == 0x007C /* VERTICAL LINE */
+ /* Historic Word Separators */
+ || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
+ || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
+ || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
+ || ch == 0x2056 /* THREE DOT PUNCTUATION */
+ || ch == 0x2058 /* FOUR DOT PUNCTUATION */
+ || ch == 0x2059 /* FIVE DOT PUNCTUATION */
+ || ch == 0x205A /* TWO DOT PUNCTUATION */
+ || ch == 0x205B /* FOUR DOT MARK */
+ || ch == 0x205D /* TRICOLON */
+ || ch == 0x205E /* VERTICAL FOUR DOTS */
+ || ch == 0x2E19 /* PALM BRANCH */
+ || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
+ || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
+ || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
+ || ch == 0x2E30 /* RING POINT */
+ || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
+ || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
+ || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
+ || ch == 0x10102 /* AEGEAN CHECK MARK */
+ || ch == 0x1039F /* UGARITIC WORD DIVIDER */
+ || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
+ || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
+ || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
+ /* Dandas */
+ || ch == 0x0964 /* DEVANAGARI DANDA */
+ || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
+ || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
+ || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
+ || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
+ || ch == 0x104B /* MYANMAR SIGN SECTION */
+ || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
+ || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
+ || ch == 0x17D4 /* KHMER SIGN KHAN */
+ || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
+ || ch == 0x1B5E /* BALINESE CARIK SIKI */
+ || ch == 0x1B5F /* BALINESE CARIK PAREREN */
+ || ch == 0xA8CE /* SAURASHTRA DANDA */
+ || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
+ || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
+ || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
+ || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
+ || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
+ || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
+ /* Tibetan */
+ || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
+ || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
+ || ch == 0x0F85 /* TIBETAN MARK PALUTA */
+ || ch == 0x0FBE /* TIBETAN KU RU KHA */
+ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
+ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
+ /* Other Terminating Punctuation */
+ || ch == 0x1804 /* MONGOLIAN COLON */
+ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
+ || ch == 0x1B5A /* BALINESE PANTI */
+ || ch == 0x1B5B /* BALINESE PAMADA */
+ || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
+ || ch == 0x1B60 /* BALINESE PAMENENG */
+ || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
+ || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
+ || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
+ || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
+ || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
+ || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
+ || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
+ || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
+ || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
+ || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
+ || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
+ || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
+ || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
+ || ch == 0xA60D /* VAI COMMA */
+ || ch == 0xA60F /* VAI QUESTION MARK */
+ || ch == 0xA92E /* KAYAH LI SIGN CWI */
+ || ch == 0xA92F /* KAYAH LI SIGN SHYA */
+ || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
+ || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
+ || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
+ || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
+ || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
+ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
+ || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
+ || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
+ || ch == 0xA6F3 /* BAMUM FULL STOP */
+ || ch == 0xA6F4 /* BAMUM COLON */
+ || ch == 0xA6F5 /* BAMUM COMMA */
+ || ch == 0xA6F6 /* BAMUM SEMICOLON */
+ || ch == 0xA6F7 /* BAMUM QUESTION MARK */
+ || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
+ || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
+ || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+ || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
+ || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
+ || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
+ || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
+ || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
+ || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
+ || ch == 0x11047 /* BRAHMI DANDA */
+ || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
+ || ch == 0x110BE /* KAITHI SECTION MARK */
+ || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
+ || ch == 0x110C0 /* KAITHI DANDA */
+ || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
+ || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
+ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
+ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
+ attr |= (int64_t) 1 << LBP_BA;
+
+ /* break opportunity before */
+ if (ch == 0x00B4 /* ACUTE ACCENT */
+ || ch == 0x1FFD /* GREEK OXIA */
+ || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
+ || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
+ || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
+ || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
+ || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
+ || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
+ || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
+ || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
+ || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
+ || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
+ || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
+ || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
+ || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
+ || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
+ || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
+ || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
+ || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
+ attr |= (int64_t) 1 << LBP_BB;
+
+ /* hyphen */
+ if (ch == 0x002D /* HYPHEN-MINUS */)
+ attr |= (int64_t) 1 << LBP_HY;
+
+ /* contingent break opportunity */
+ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
+ attr |= (int64_t) 1 << LBP_CB;
+
+ /* closing parenthesis */
+ if (ch == 0x0029 /* RIGHT PARENTHESIS */
+ || ch == 0x005D /* RIGHT SQUARE BRACKET */)
+ attr |= (int64_t) 1 << LBP_CP;
+
+ /* closing punctuation
+ (category Pe characters already classified as CP above are excluded) */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'e'
+ && !(attr & ((int64_t) 1 << LBP_CP)))
+ || ch == 0x3001 /* IDEOGRAPHIC COMMA */
+ || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
+ || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
+ || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
+ || ch == 0xFE50 /* SMALL COMMA */
+ || ch == 0xFE52 /* SMALL FULL STOP */
+ || ch == 0xFF0C /* FULLWIDTH COMMA */
+ || ch == 0xFF0E /* FULLWIDTH FULL STOP */
+ || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
+ || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
+ || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
+ || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
+ || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
+ || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
+ || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
+ || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
+ || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
+ attr |= (int64_t) 1 << LBP_CL;
+
+ /* exclamation/interrogation */
+ if (ch == 0x0021 /* EXCLAMATION MARK */
+ || ch == 0x003F /* QUESTION MARK */
+ || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
+ || ch == 0x061B /* ARABIC SEMICOLON */
+ || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
+ || ch == 0x061F /* ARABIC QUESTION MARK */
+ || ch == 0x06D4 /* ARABIC FULL STOP */
+ || ch == 0x07F9 /* NKO EXCLAMATION MARK */
+ || ch == 0x0F0D /* TIBETAN MARK SHAD */
+ || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
+ || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
+ || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
+ || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
+ || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
+ || ch == 0x1802 /* MONGOLIAN COMMA */
+ || ch == 0x1803 /* MONGOLIAN FULL STOP */
+ || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
+ || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
+ || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
+ || ch == 0x1945 /* LIMBU QUESTION MARK */
+ || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
+ || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
+ || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
+ || ch == 0x2CFE /* COPTIC FULL STOP */
+ || ch == 0x2E2E /* REVERSED QUESTION MARK */
+ || ch == 0xA60E /* VAI FULL STOP */
+ || ch == 0xA876 /* PHAGS-PA MARK SHAD */
+ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
+ || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
+ || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
+ || ch == 0xFE56 /* SMALL QUESTION MARK */
+ || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
+ || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
+ || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
+ attr |= (int64_t) 1 << LBP_EX;
+
+ /* inseparable */
+ if (ch == 0x2024 /* ONE DOT LEADER */
+ || ch == 0x2025 /* TWO DOT LEADER */
+ || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
+ || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
+ attr |= (int64_t) 1 << LBP_IN;
+
+ /* non starter */
+ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
+ || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
+ || ch == 0x203D /* INTERROBANG */
+ || ch == 0x2047 /* DOUBLE QUESTION MARK */
+ || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
+ || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
+ || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
+ || ch == 0x301C /* WAVE DASH */
+ || ch == 0x303C /* MASU MARK */
+ || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
+ || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
+ || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
+ || ch == 0x309D /* HIRAGANA ITERATION MARK */
+ || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
+ || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
+ || ch == 0x30FB /* KATAKANA MIDDLE DOT */
+ || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+ || ch == 0x30FD /* KATAKANA ITERATION MARK */
+ || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
+ || ch == 0xA015 /* YI SYLLABLE WU */
+ || ch == 0xFE54 /* SMALL SEMICOLON */
+ || ch == 0xFE55 /* SMALL COLON */
+ || ch == 0xFF1A /* FULLWIDTH COLON */
+ || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
+ || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
+ || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+ || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
+ || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
+ || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
+ || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
+ attr |= (int64_t) 1 << LBP_NS;
+
+ /* opening punctuation */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 's')
+ || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
+ || ch == 0x00BF /* INVERTED QUESTION MARK */
+ || ch == 0x2E18 /* INVERTED INTERROBANG */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
+ || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
+ || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
+ || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
+ || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
+ || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
+ attr |= (int64_t) 1 << LBP_OP;
+
+ /* ambiguous quotation */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && (unicode_attributes[ch].category[1] == 'f'
+ || unicode_attributes[ch].category[1] == 'i'))
+ || ch == 0x0022 /* QUOTATION MARK */
+ || ch == 0x0027 /* APOSTROPHE */
+ || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
+ || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
+ || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
+ || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
+ || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
+ || ch == 0x2E0B /* RAISED SQUARE */)
+ attr |= (int64_t) 1 << LBP_QU;
+
+ /* infix separator (numeric) */
+ if (ch == 0x002C /* COMMA */
+ || ch == 0x002E /* FULL STOP */
+ || ch == 0x003A /* COLON */
+ || ch == 0x003B /* SEMICOLON */
+ || ch == 0x037E /* GREEK QUESTION MARK */
+ || ch == 0x0589 /* ARMENIAN FULL STOP */
+ || ch == 0x060C /* ARABIC COMMA */
+ || ch == 0x060D /* ARABIC DATE SEPARATOR */
+ || ch == 0x07F8 /* NKO COMMA */
+ || ch == 0x2044 /* FRACTION SLASH */
+ || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
+ || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
+ || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
+ attr |= (int64_t) 1 << LBP_IS;
+
+ /* numeric */
+ if ((unicode_attributes[ch].category[0] == 'N'
+ && unicode_attributes[ch].category[1] == 'd'
+ && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
+ || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
+ || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
+ attr |= (int64_t) 1 << LBP_NU;
+
+ /* postfix (numeric) */
+ if (ch == 0x0025 /* PERCENT SIGN */
+ || ch == 0x00A2 /* CENT SIGN */
+ || ch == 0x00B0 /* DEGREE SIGN */
+ || ch == 0x060B /* AFGHANI SIGN */
+ || ch == 0x066A /* ARABIC PERCENT SIGN */
+ || ch == 0x2030 /* PER MILLE SIGN */
+ || ch == 0x2031 /* PER TEN THOUSAND SIGN */
+ || ch == 0x2032 /* PRIME */
+ || ch == 0x2033 /* DOUBLE PRIME */
+ || ch == 0x2034 /* TRIPLE PRIME */
+ || ch == 0x2035 /* REVERSED PRIME */
+ || ch == 0x2036 /* REVERSED DOUBLE PRIME */
+ || ch == 0x2037 /* REVERSED TRIPLE PRIME */
+ || ch == 0x20A7 /* PESETA SIGN */
+ || ch == 0x2103 /* DEGREE CELSIUS */
+ || ch == 0x2109 /* DEGREE FAHRENHEIT */
+ || ch == 0xFDFC /* RIAL SIGN */
+ || ch == 0xFE6A /* SMALL PERCENT SIGN */
+ || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
+ || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
+ || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
+ || ch == 0x09F2 /* BENGALI RUPEE MARK */
+ || ch == 0x09F3 /* BENGALI RUPEE SIGN */
+ || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
+ || ch == 0x0D79 /* MALAYALAM DATE MARK */
+ || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
+ || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
+ attr |= (int64_t) 1 << LBP_PO;
+
+ /* prefix (numeric)
+ (skipped for characters already classified as PO above) */
+ if ((unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'c')
+ || ch == 0x002B /* PLUS SIGN */
+ || ch == 0x005C /* REVERSE SOLIDUS */
+ || ch == 0x00B1 /* PLUS-MINUS SIGN */
+ || ch == 0x2116 /* NUMERO SIGN */
+ || ch == 0x2212 /* MINUS SIGN */
+ || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
+ if (!(attr & ((int64_t) 1 << LBP_PO)))
+ attr |= (int64_t) 1 << LBP_PR;
+
+ /* symbols allowing breaks */
+ if (ch == 0x002F /* SOLIDUS */)
+ attr |= (int64_t) 1 << LBP_SY;
+
+ /* Hangul LV syllable: every 28th code point of the range */
+ if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
+ attr |= (int64_t) 1 << LBP_H2;
+
+ /* Hangul LVT syllable: all other code points of the same range */
+ if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
+ attr |= (int64_t) 1 << LBP_H3;
+
+ /* Hangul L Jamo */
+ if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
+ attr |= (int64_t) 1 << LBP_JL;
+
+ /* Hangul V Jamo */
+ if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
+ attr |= (int64_t) 1 << LBP_JV;
+
+ /* Hangul T Jamo */
+ if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
+ attr |= (int64_t) 1 << LBP_JT;
+
+ /* complex context (South East Asian) */
+ if (((unicode_attributes[ch].category[0] == 'C'
+ && unicode_attributes[ch].category[1] == 'f')
+ || (unicode_attributes[ch].category[0] == 'L'
+ && (unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'M'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'n')
+ && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
+ || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
+ || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
+ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
+ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
+ || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
+ || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
+ || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
+ && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
+ || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
+ || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
+ || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
+ || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
+ || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
+ attr |= (int64_t) 1 << LBP_SA;
+
+ /* attached characters and combining marks
+ (skipped for characters already classified as BK, BA, GL, SA, WJ or ZW
+ above) */
+ if ((unicode_attributes[ch].category[0] == 'M'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'e'
+ || unicode_attributes[ch].category[1] == 'n'))
+ || (unicode_attributes[ch].category[0] == 'C'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'f')
+ && ch != 0x110BD /* KAITHI NUMBER SIGN */))
+ if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+ attr |= (int64_t) 1 << LBP_CM;
+
+ /* ideographic
+ (skipped for characters already classified as NS or CM above; the
+ result is either AI or ID) */
+ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
+ || ch == 0x3000 /* IDEOGRAPHIC SPACE */
+ || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
+ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
+ || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
+ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
+ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
+ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
+ || ch == 0xFE62 /* SMALL PLUS SIGN */
+ || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
+ || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
+ || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
+ || ch == 0xFE66 /* SMALL EQUALS SIGN */
+ || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
+ || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
+ || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
+ || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
+ || (ch >= 0x3000 && ch <= 0x33FF
+ && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
+ || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
+ || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
+ || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
+ || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
+ || ch == 0xFE45 /* SESAME DOT */
+ || ch == 0xFE46 /* WHITE SESAME DOT */
+ || ch == 0xFE49 /* DASHED OVERLINE */
+ || ch == 0xFE4A /* CENTRELINE OVERLINE */
+ || ch == 0xFE4B /* WAVY OVERLINE */
+ || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
+ || ch == 0xFE4D /* DASHED LOW LINE */
+ || ch == 0xFE4E /* CENTRELINE LOW LINE */
+ || ch == 0xFE4F /* WAVY LOW LINE */
+ || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
+ || ch == 0xFE58 /* SMALL EM DASH */
+ || ch == 0xFE5F /* SMALL NUMBER SIGN */
+ || ch == 0xFE60 /* SMALL AMPERSAND */
+ || ch == 0xFE61 /* SMALL ASTERISK */
+ || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
+ || ch == 0xFE6B /* SMALL COMMERCIAL AT */
+ || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
+ || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
+ || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
+ || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
+ || ch == 0xFF0A /* FULLWIDTH ASTERISK */
+ || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
+ || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
+ || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
+ || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
+ || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
+ || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
+ || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
+ || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
+ || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
+ || ch == 0xFF3F /* FULLWIDTH LOW LINE */
+ || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
+ || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
+ || ch == 0xFF5E /* FULLWIDTH TILDE */
+ || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
+ || ch == 0xFFE3 /* FULLWIDTH MACRON */
+ || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
+ || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
+ || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
+ if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
+ {
+ /* ambiguous (ideograph) ? */
+ if ((unicode_width[ch] != NULL
+ && unicode_width[ch][0] == 'A'
+ && ch >= 0x2000)
+ || ch == 0x24EA /* CIRCLED DIGIT ZERO */
+ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
+ attr |= (int64_t) 1 << LBP_AI;
+ else
+ attr |= (int64_t) 1 << LBP_ID;
+ }
+
+ /* ordinary alphabetic and symbol characters
+ (a fallback: skipped when any of the classes listed in the mask below
+ has already been assigned; the result is either AI or AL) */
+ if ((unicode_attributes[ch].category[0] == 'L'
+ && (unicode_attributes[ch].category[1] == 'u'
+ || unicode_attributes[ch].category[1] == 'l'
+ || unicode_attributes[ch].category[1] == 't'
+ || unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'S'
+ && (unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'k'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'N'
+ && (unicode_attributes[ch].category[1] == 'l'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'P'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'd'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || ch == 0x0600 /* ARABIC NUMBER SIGN */
+ || ch == 0x0601 /* ARABIC SIGN SANAH */
+ || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
+ || ch == 0x0603 /* ARABIC SIGN SAFHA */
+ || ch == 0x06DD /* ARABIC END OF AYAH */
+ || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
+ || ch == 0x2061 /* FUNCTION APPLICATION */
+ || ch == 0x2062 /* INVISIBLE TIMES */
+ || ch == 0x2063 /* INVISIBLE SEPARATOR */
+ || ch == 0x2064 /* INVISIBLE PLUS */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x110BD /* KAITHI NUMBER SIGN */)
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+ {
+ /* ambiguous (alphabetic) ? */
+ if ((unicode_width[ch] != NULL
+ && unicode_width[ch][0] == 'A'
+ && ch >= 0x2000
+ /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
+ && ch != 0x2022 /* BULLET */
+ && ch != 0x203E /* OVERLINE */
+ && ch != 0x2126 /* OHM SIGN */
+ && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
+ && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
+ && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
+ && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
+ && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
+ && ch != 0x21E7 /* UPWARDS WHITE ARROW */
+ && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
+ && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
+ || ch == 0x00A7 /* SECTION SIGN */
+ || ch == 0x00A8 /* DIAERESIS */
+ || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
+ || ch == 0x00B2 /* SUPERSCRIPT TWO */
+ || ch == 0x00B3 /* SUPERSCRIPT THREE */
+ || ch == 0x00B6 /* PILCROW SIGN */
+ || ch == 0x00B7 /* MIDDLE DOT */
+ || ch == 0x00B8 /* CEDILLA */
+ || ch == 0x00B9 /* SUPERSCRIPT ONE */
+ || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
+ || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
+ || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
+ || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
+ || ch == 0x00D7 /* MULTIPLICATION SIGN */
+ || ch == 0x00F7 /* DIVISION SIGN */
+ || ch == 0x02C7 /* CARON */
+ || ch == 0x02C9 /* MODIFIER LETTER MACRON */
+ || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
+ || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
+ || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
+ || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
+ || ch == 0x02D8 /* BREVE */
+ || ch == 0x02D9 /* DOT ABOVE */
+ || ch == 0x02DA /* RING ABOVE */
+ || ch == 0x02DB /* OGONEK */
+ || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
+ || ch == 0x24EA /* CIRCLED DIGIT ZERO */
+ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
+ || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
+ || ch == 0x2616 /* WHITE SHOGI PIECE */
+ || ch == 0x2617 /* BLACK SHOGI PIECE */)
+ attr |= (int64_t) 1 << LBP_AI;
+ else
+ attr |= (int64_t) 1 << LBP_AL;
+ /* An AI/AL classification replaces a CM bit set earlier. */
+ attr &= ~((int64_t) 1 << LBP_CM);
+ }
+ }
+ else
+ {
+ /* Unassigned character. */
+ if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
+ || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+ || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
+ || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
+ attr |= (int64_t) 1 << LBP_ID;
+ }
+
+ if (attr == 0)
+ /* unknown */
+ attr |= (int64_t) 1 << LBP_XX;
+
+ return attr;
+}
+
+/* Dump the line breaking classes of all code points below 0x110000 to
+ STREAM in a human readable format: one line per code point, made of the
+ code point in hexadecimal followed by the name of every class whose bit is
+ set in its get_lbp mask. Code points whose mask is exactly LBP_XX are
+ omitted. */
+static void
+debug_output_lbp (FILE *stream)
+{
+ /* The classes, in the order in which their names are printed. */
+#define CLASS_ENTRY(bit) { bit, " " #bit }
+ static const struct { int bit; const char *label; } classes[] =
+ {
+ CLASS_ENTRY (LBP_BK),
+ CLASS_ENTRY (LBP_CM),
+ CLASS_ENTRY (LBP_WJ),
+ CLASS_ENTRY (LBP_ZW),
+ CLASS_ENTRY (LBP_GL),
+ CLASS_ENTRY (LBP_SP),
+ CLASS_ENTRY (LBP_B2),
+ CLASS_ENTRY (LBP_BA),
+ CLASS_ENTRY (LBP_BB),
+ CLASS_ENTRY (LBP_HY),
+ CLASS_ENTRY (LBP_CB),
+ CLASS_ENTRY (LBP_CL),
+ CLASS_ENTRY (LBP_CP),
+ CLASS_ENTRY (LBP_EX),
+ CLASS_ENTRY (LBP_IN),
+ CLASS_ENTRY (LBP_NS),
+ CLASS_ENTRY (LBP_OP),
+ CLASS_ENTRY (LBP_QU),
+ CLASS_ENTRY (LBP_IS),
+ CLASS_ENTRY (LBP_NU),
+ CLASS_ENTRY (LBP_PO),
+ CLASS_ENTRY (LBP_PR),
+ CLASS_ENTRY (LBP_SY),
+ CLASS_ENTRY (LBP_AI),
+ CLASS_ENTRY (LBP_AL),
+ CLASS_ENTRY (LBP_H2),
+ CLASS_ENTRY (LBP_H3),
+ CLASS_ENTRY (LBP_ID),
+ CLASS_ENTRY (LBP_JL),
+ CLASS_ENTRY (LBP_JV),
+ CLASS_ENTRY (LBP_JT),
+ CLASS_ENTRY (LBP_SA),
+ CLASS_ENTRY (LBP_XX)
+ };
+#undef CLASS_ENTRY
+ const int64_t only_unknown = (int64_t) 1 << LBP_XX;
+ unsigned int ch;
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ const int64_t mask = get_lbp (ch);
+ unsigned int k;
+
+ /* Nothing to report for characters without a classification. */
+ if (mask == only_unknown)
+ continue;
+
+ fprintf (stream, "0x%04X", ch);
+ for (k = 0; k < sizeof (classes) / sizeof (classes[0]); k++)
+ if (mask & ((int64_t) 1 << classes[k].bit))
+ fputs (classes[k].label, stream);
+ fprintf (stream, "\n");
+ }
+}
+
+/* Write the human readable line breaking classification dump produced by
+ debug_output_lbp to the file FILENAME. Prints a message to stderr and
+ exits with status 1 if the file cannot be opened or written. */
+static void
+debug_output_lbrk_tables (const char *filename)
+{
+ FILE *out = fopen (filename, "w");
+ int failed;
+
+ if (!out)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ debug_output_lbp (out);
+
+ /* Close the stream only if no write error was recorded on it. */
+ failed = (ferror (out) != 0);
+ if (!failed)
+ failed = (fclose (out) != 0);
+
+ if (failed)
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* The line breaking property from the LineBreak.txt file.
+   Indexed by Unicode code point (0 .. 0x10FFFF); each element holds a
+   single LBP_* value.  fill_org_lbp() presets every element to LBP_XX
+   and then overwrites the entries listed in the file.  */
+int unicode_org_lbp[0x110000];
+
+/* Stores in unicode_org_lbp[] the line breaking property from the
+   LineBreak.txt file.
+
+   LINEBREAK_FILENAME is the name of the file to read.  Every element of
+   unicode_org_lbp[] is first set to LBP_XX.  Then each data line of the
+   form
+       <code point>;<property> <rest of line>
+   or
+       <first>..<last>;<property> <rest of line>
+   assigns the property to the given code point or inclusive range.
+   Lines starting with '#' and empty lines are skipped.  The property
+   names are the LBP_* names without their "LBP_" prefix; additionally
+   "LF", "CR" and "NL" are stored as LBP_BK, and "SG" as LBP_XX.
+
+   On any error (file cannot be opened, incomplete line, unknown property
+   name, unparsable or out-of-range code point, read error) a message is
+   printed to stderr and the program exits with status 1.  */
+static void
+fill_org_lbp (const char *linebreak_filename)
+{
+  unsigned int i;
+  FILE *stream;
+  char field0[FIELDLEN];
+  char field1[FIELDLEN];
+  char field2[FIELDLEN];  /* Receives the rest of the line; not examined.  */
+  int lineno = 0;
+
+  for (i = 0; i < 0x110000; i++)
+    unicode_org_lbp[i] = LBP_XX;
+
+  stream = fopen (linebreak_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
+      exit (1);
+    }
+
+  for (;;)
+    {
+      int n;
+      int c;
+      int value;
+      char *dots;
+      char *end;
+      unsigned long first, last;
+      bool valid;
+
+      lineno++;
+      c = getc (stream);
+      if (c == EOF)
+        break;
+      /* Skip empty lines, so that they are not merged into the first
+         field of the following line and LINENO stays accurate.  */
+      if (c == '\n')
+        continue;
+      if (c == '#')
+        {
+          /* Comment line: discard everything up to the end of the line.  */
+          do c = getc (stream); while (c != EOF && c != '\n');
+          continue;
+        }
+      ungetc (c, stream);
+      /* N counts the fields that were read; presumably getfield (defined
+         elsewhere) returns 1 on success and 0 at end of input.  */
+      n = getfield (stream, field0, ';');
+      n += getfield (stream, field1, ' ');
+      n += getfield (stream, field2, '\n');
+      if (n == 0)
+        break;
+      if (n != 3)
+        {
+          fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
+                   lineno);
+          exit (1);
+        }
+      /* Map the property name to its LBP_* value.  #bit + 4 is the name of
+         the constant with the 4 characters of the "LBP_" prefix skipped.  */
+#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
+      if (false) {}
+      TRY(LBP_BK)
+      TRY(LBP_CM)
+      TRY(LBP_WJ)
+      TRY(LBP_ZW)
+      TRY(LBP_GL)
+      TRY(LBP_SP)
+      TRY(LBP_B2)
+      TRY(LBP_BA)
+      TRY(LBP_BB)
+      TRY(LBP_HY)
+      TRY(LBP_CB)
+      TRY(LBP_CL)
+      TRY(LBP_CP)
+      TRY(LBP_EX)
+      TRY(LBP_IN)
+      TRY(LBP_NS)
+      TRY(LBP_OP)
+      TRY(LBP_QU)
+      TRY(LBP_IS)
+      TRY(LBP_NU)
+      TRY(LBP_PO)
+      TRY(LBP_PR)
+      TRY(LBP_SY)
+      TRY(LBP_AI)
+      TRY(LBP_AL)
+      TRY(LBP_H2)
+      TRY(LBP_H3)
+      TRY(LBP_ID)
+      TRY(LBP_JL)
+      TRY(LBP_JV)
+      TRY(LBP_JT)
+      TRY(LBP_SA)
+      TRY(LBP_XX)
+#undef TRY
+      else if (strcmp (field1, "LF") == 0) value = LBP_BK;
+      else if (strcmp (field1, "CR") == 0) value = LBP_BK;
+      else if (strcmp (field1, "NL") == 0) value = LBP_BK;
+      else if (strcmp (field1, "SG") == 0) value = LBP_XX;
+      else
+        {
+          fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
+                   field1, linebreak_filename, lineno);
+          exit (1);
+        }
+
+      /* Parse the code point or the range FIRST..LAST.  The numbers are
+         kept as 'unsigned long' until they have been validated, so that
+         an oversized value cannot wrap around when narrowed and cannot
+         index outside of unicode_org_lbp[].  */
+      dots = strstr (field0, "..");
+      first = strtoul (field0, &end, 16);
+      valid = (end != field0);          /* At least one hex digit seen?  */
+      last = first;
+      if (valid && dots != NULL)
+        {
+          /* Deal with a range.  */
+          last = strtoul (dots + 2, &end, 16);
+          valid = (end != dots + 2);
+        }
+      if (!valid || first > last || last >= 0x110000)
+        {
+          fprintf (stderr, "invalid code point range \"%s\" in '%s':%d\n",
+                   field0, linebreak_filename, lineno);
+          exit (1);
+        }
+      /* A single character line is handled as the range FIRST..FIRST.  */
+      for (i = (unsigned int) first; i <= (unsigned int) last; i++)
+        unicode_org_lbp[i] = value;
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
+      exit (1);
+    }
+}
+
+/* Output the line breaking properties in a human readable format. */
+static void
+debug_output_org_lbp (FILE *stream)