/* Generate Unicode conforming character classification tables and
line break properties tables and word break property tables and
- case mapping tables from a UnicodeData file.
- Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
+ decomposition/composition and case mapping tables from a UnicodeData file.
+ Copyright (C) 2000-2002, 2004, 2007-2013 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
This program is free software: you can redistribute it and/or modify
$ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
/usr/local/share/Unidata/PropList.txt \
/usr/local/share/Unidata/DerivedCoreProperties.txt \
+ /usr/local/share/Unidata/ArabicShaping.txt \
/usr/local/share/Unidata/Scripts.txt \
/usr/local/share/Unidata/Blocks.txt \
/usr/local/share/Unidata/PropList-3.0.1.txt \
/usr/local/share/Unidata/EastAsianWidth.txt \
/usr/local/share/Unidata/LineBreak.txt \
/usr/local/share/Unidata/WordBreakProperty.txt \
- 5.1.0
+ /usr/local/share/Unidata/GraphemeBreakProperty.txt \
+ /usr/local/share/Unidata/CompositionExclusions.txt \
+ /usr/local/share/Unidata/SpecialCasing.txt \
+ /usr/local/share/Unidata/CaseFolding.txt \
+ 6.0.0
*/
#include <stdbool.h>
/* Stores in unicode_attributes[i] the values from the given fields. */
static void
fill_attribute (unsigned int i,
- const char *field1, const char *field2,
- const char *field3, const char *field4,
- const char *field5, const char *field6,
- const char *field7, const char *field8,
- const char *field9, const char *field10,
- const char *field11, const char *field12,
- const char *field13, const char *field14)
+ const char *field1, const char *field2,
+ const char *field3, const char *field4,
+ const char *field5, const char *field6,
+ const char *field7, const char *field8,
+ const char *field9, const char *field10,
+ const char *field11, const char *field12,
+ const char *field13, const char *field14)
{
struct unicode_attribute * uni;
exit (1);
}
if (strcmp (field2, "Cs") == 0)
- /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
+ /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
return;
uni = &unicode_attributes[i];
/* Copy the strings. */
for (; (c = getc (stream)), (c != EOF && c != delim); )
{
/* The original unicode.org UnicodeData.txt file happens to have
- CR/LF line terminators. Silently convert to LF. */
+ CR/LF line terminators. Silently convert to LF. */
if (c == '\r')
- continue;
+ continue;
/* Put c into the buffer. */
if (++count >= FIELDLEN - 1)
- {
- fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
- exit (1);
- }
+ {
+ fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
+ exit (1);
+ }
*buffer++ = c;
}
n += getfield (stream, field13, ';');
n += getfield (stream, field14, '\n');
if (n == 0)
- break;
+ break;
if (n != 15)
- {
- fprintf (stderr, "short line in '%s':%d\n",
- unicodedata_filename, lineno);
- exit (1);
- }
+ {
+ fprintf (stderr, "short line in '%s':%d\n",
+ unicodedata_filename, lineno);
+ exit (1);
+ }
i = strtoul (field0, NULL, 16);
if (field1[0] == '<'
- && strlen (field1) >= 9
- && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
- {
- /* Deal with a range. */
- lineno++;
- n = getfield (stream, field0, ';');
- n += getfield (stream, field1, ';');
- n += getfield (stream, field2, ';');
- n += getfield (stream, field3, ';');
- n += getfield (stream, field4, ';');
- n += getfield (stream, field5, ';');
- n += getfield (stream, field6, ';');
- n += getfield (stream, field7, ';');
- n += getfield (stream, field8, ';');
- n += getfield (stream, field9, ';');
- n += getfield (stream, field10, ';');
- n += getfield (stream, field11, ';');
- n += getfield (stream, field12, ';');
- n += getfield (stream, field13, ';');
- n += getfield (stream, field14, '\n');
- if (n != 15)
- {
- fprintf (stderr, "missing end range in '%s':%d\n",
- unicodedata_filename, lineno);
- exit (1);
- }
- if (!(field1[0] == '<'
- && strlen (field1) >= 8
- && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
- {
- fprintf (stderr, "missing end range in '%s':%d\n",
- unicodedata_filename, lineno);
- exit (1);
- }
- field1[strlen (field1) - 7] = '\0';
- j = strtoul (field0, NULL, 16);
- for (; i <= j; i++)
- fill_attribute (i, field1+1, field2, field3, field4, field5,
- field6, field7, field8, field9, field10,
- field11, field12, field13, field14);
- }
+ && strlen (field1) >= 9
+ && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
+ {
+ /* Deal with a range. */
+ lineno++;
+ n = getfield (stream, field0, ';');
+ n += getfield (stream, field1, ';');
+ n += getfield (stream, field2, ';');
+ n += getfield (stream, field3, ';');
+ n += getfield (stream, field4, ';');
+ n += getfield (stream, field5, ';');
+ n += getfield (stream, field6, ';');
+ n += getfield (stream, field7, ';');
+ n += getfield (stream, field8, ';');
+ n += getfield (stream, field9, ';');
+ n += getfield (stream, field10, ';');
+ n += getfield (stream, field11, ';');
+ n += getfield (stream, field12, ';');
+ n += getfield (stream, field13, ';');
+ n += getfield (stream, field14, '\n');
+ if (n != 15)
+ {
+ fprintf (stderr, "missing end range in '%s':%d\n",
+ unicodedata_filename, lineno);
+ exit (1);
+ }
+ if (!(field1[0] == '<'
+ && strlen (field1) >= 8
+ && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
+ {
+ fprintf (stderr, "missing end range in '%s':%d\n",
+ unicodedata_filename, lineno);
+ exit (1);
+ }
+ field1[strlen (field1) - 7] = '\0';
+ j = strtoul (field0, NULL, 16);
+ for (; i <= j; i++)
+ fill_attribute (i, field1+1, field2, field3, field4, field5,
+ field6, field7, field8, field9, field10,
+ field11, field12, field13, field14);
+ }
else
- {
- /* Single character line */
- fill_attribute (i, field1, field2, field3, field4, field5,
- field6, field7, field8, field9, field10,
- field11, field12, field13, field14);
- }
+ {
+ /* Single character line */
+ fill_attribute (i, field1, field2, field3, field4, field5,
+ field6, field7, field8, field9, field10,
+ field11, field12, field13, field14);
+ }
}
+
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
is_category_L (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L');
+ && unicode_attributes[ch].category[0] == 'L');
+}
+
+static bool
+is_category_LC (unsigned int ch)
+{
+ /* LC ("Cased_Letter") is the union of Lu, Ll and Lt; see PropertyValueAliases.txt. */
+ return (unicode_attributes[ch].name != NULL
+ && unicode_attributes[ch].category[0] == 'L'
+ && (unicode_attributes[ch].category[1] == 'u'
+ || unicode_attributes[ch].category[1] == 'l'
+ || unicode_attributes[ch].category[1] == 't'));
}
static bool
is_category_Lu (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L'
- && unicode_attributes[ch].category[1] == 'u');
+ && unicode_attributes[ch].category[0] == 'L'
+ && unicode_attributes[ch].category[1] == 'u');
}
static bool
is_category_Ll (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L'
- && unicode_attributes[ch].category[1] == 'l');
+ && unicode_attributes[ch].category[0] == 'L'
+ && unicode_attributes[ch].category[1] == 'l');
}
static bool
is_category_Lt (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L'
- && unicode_attributes[ch].category[1] == 't');
+ && unicode_attributes[ch].category[0] == 'L'
+ && unicode_attributes[ch].category[1] == 't');
}
static bool
is_category_Lm (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L'
- && unicode_attributes[ch].category[1] == 'm');
+ && unicode_attributes[ch].category[0] == 'L'
+ && unicode_attributes[ch].category[1] == 'm');
}
static bool
is_category_Lo (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'L'
- && unicode_attributes[ch].category[1] == 'o');
+ && unicode_attributes[ch].category[0] == 'L'
+ && unicode_attributes[ch].category[1] == 'o');
}
static bool
is_category_M (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'M');
+ && unicode_attributes[ch].category[0] == 'M');
}
static bool
is_category_Mn (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'M'
- && unicode_attributes[ch].category[1] == 'n');
+ && unicode_attributes[ch].category[0] == 'M'
+ && unicode_attributes[ch].category[1] == 'n');
}
static bool
is_category_Mc (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'M'
- && unicode_attributes[ch].category[1] == 'c');
+ && unicode_attributes[ch].category[0] == 'M'
+ && unicode_attributes[ch].category[1] == 'c');
}
static bool
is_category_Me (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'M'
- && unicode_attributes[ch].category[1] == 'e');
+ && unicode_attributes[ch].category[0] == 'M'
+ && unicode_attributes[ch].category[1] == 'e');
}
static bool
is_category_N (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'N');
+ && unicode_attributes[ch].category[0] == 'N');
}
static bool
is_category_Nd (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'd');
+ && unicode_attributes[ch].category[0] == 'N'
+ && unicode_attributes[ch].category[1] == 'd');
}
static bool
is_category_Nl (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'l');
+ && unicode_attributes[ch].category[0] == 'N'
+ && unicode_attributes[ch].category[1] == 'l');
}
static bool
is_category_No (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'o');
+ && unicode_attributes[ch].category[0] == 'N'
+ && unicode_attributes[ch].category[1] == 'o');
}
static bool
is_category_P (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P');
+ && unicode_attributes[ch].category[0] == 'P');
}
static bool
is_category_Pc (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'c');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'c');
}
static bool
is_category_Pd (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'd');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'd');
}
static bool
is_category_Ps (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 's');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 's');
}
static bool
is_category_Pe (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'e');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'e');
}
static bool
is_category_Pi (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'i');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'i');
}
static bool
is_category_Pf (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'f');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'f');
}
static bool
is_category_Po (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'o');
+ && unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'o');
}
static bool
is_category_S (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'S');
+ && unicode_attributes[ch].category[0] == 'S');
}
static bool
is_category_Sm (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'm');
+ && unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'm');
}
static bool
is_category_Sc (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'c');
+ && unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'c');
}
static bool
is_category_Sk (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'k');
+ && unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'k');
}
static bool
is_category_So (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'o');
+ && unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'o');
}
static bool
is_category_Z (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z');
+ && unicode_attributes[ch].category[0] == 'Z');
}
static bool
is_category_Zs (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && unicode_attributes[ch].category[1] == 's');
+ && unicode_attributes[ch].category[0] == 'Z'
+ && unicode_attributes[ch].category[1] == 's');
}
static bool
is_category_Zl (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && unicode_attributes[ch].category[1] == 'l');
+ && unicode_attributes[ch].category[0] == 'Z'
+ && unicode_attributes[ch].category[1] == 'l');
}
static bool
is_category_Zp (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && unicode_attributes[ch].category[1] == 'p');
+ && unicode_attributes[ch].category[0] == 'Z'
+ && unicode_attributes[ch].category[1] == 'p');
}
static bool
is_category_C (unsigned int ch)
{
return (unicode_attributes[ch].name == NULL
- || unicode_attributes[ch].category[0] == 'C');
+ || unicode_attributes[ch].category[0] == 'C');
}
static bool
is_category_Cc (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'C'
- && unicode_attributes[ch].category[1] == 'c');
+ && unicode_attributes[ch].category[0] == 'C'
+ && unicode_attributes[ch].category[1] == 'c');
}
static bool
is_category_Cf (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'C'
- && unicode_attributes[ch].category[1] == 'f');
+ && unicode_attributes[ch].category[0] == 'C'
+ && unicode_attributes[ch].category[1] == 'f');
}
static bool
is_category_Co (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'C'
- && unicode_attributes[ch].category[1] == 'o');
+ && unicode_attributes[ch].category[0] == 'C'
+ && unicode_attributes[ch].category[1] == 'o');
}
static bool
is_category_Cn (unsigned int ch)
{
return (unicode_attributes[ch].name == NULL
- && !(ch >= 0xd800 && ch < 0xe000));
+ && !(ch >= 0xd800 && ch < 0xe000));
}
/* Output a boolean property in a human readable format. */
for (ch = 0; ch < 0x110000; ch++)
if (predicate (ch))
{
- fprintf (stream, "0x%04X\n", ch);
+ fprintf (stream, "0x%04X\n", ch);
}
#else
for (ch = 0; ch < 0x110000; ch++)
if (predicate (ch))
{
- unsigned int first = ch;
- unsigned int last;
-
- while (ch + 1 < 0x110000 && predicate (ch + 1))
- ch++;
- last = ch;
- if (first < last)
- fprintf (stream, "0x%04X..0x%04X\n", first, last);
- else
- fprintf (stream, "0x%04X\n", ch);
+ unsigned int first = ch;
+ unsigned int last;
+
+ while (ch + 1 < 0x110000 && predicate (ch + 1))
+ ch++;
+ last = ch;
+ if (first < last)
+ fprintf (stream, "0x%04X..0x%04X\n", first, last);
+ else
+ fprintf (stream, "0x%04X\n", ch);
}
#endif
for (ch = 0; ch < 0x110000; ch++)
if (predicate (ch))
{
- unsigned int first = ch;
- unsigned int last;
-
- while (ch + 1 < 0x110000 && predicate (ch + 1))
- ch++;
- last = ch;
- if (need_comma)
- fprintf (stream, ",\n");
- fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
- need_comma = true;
+ unsigned int first = ch;
+ unsigned int last;
+
+ while (ch + 1 < 0x110000 && predicate (ch + 1))
+ ch++;
+ last = ch;
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
+ need_comma = true;
}
if (need_comma)
fprintf (stream, "\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* %s of Unicode characters. */\n", comment);
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 4; /* or: 5 */
t.q = 7; /* or: 6 */
for (i = 0; i < 5; i++)
if (i != 1)
fprintf (stream, "#define header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
{
uint32_t offset;
if (i > 0 && (i % 1) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
- 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
+ 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
}
{
uint32_t offset;
if (i > 0 && (i % 1) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
- 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
+ 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 1)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << t.p; i++)
{
if (i > 0 && (i % 4) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%08X",
- ((uint32_t *) (t.result + level3_offset))[i]);
+ ((uint32_t *) (t.result + level3_offset))[i]);
if (i+1 < t.level3_size << t.p)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << t.p > 4)
fprintf (stream, "\n ");
output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
CATEGORY (L)
+ CATEGORY (LC)
CATEGORY (Lu)
CATEGORY (Ll)
CATEGORY (Lt)
enum
{
UC_CATEGORY_MASK_L = 0x0000001f,
+ UC_CATEGORY_MASK_LC = 0x00000007,
UC_CATEGORY_MASK_Lu = 0x00000001,
UC_CATEGORY_MASK_Ll = 0x00000002,
UC_CATEGORY_MASK_Lt = 0x00000004,
switch (category_name[0])
{
case 'L':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_L;
- case 'u': return UC_CATEGORY_MASK_Lu;
- case 'l': return UC_CATEGORY_MASK_Ll;
- case 't': return UC_CATEGORY_MASK_Lt;
- case 'm': return UC_CATEGORY_MASK_Lm;
- case 'o': return UC_CATEGORY_MASK_Lo;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_L;
+ case 'C': return UC_CATEGORY_MASK_LC;
+ case 'u': return UC_CATEGORY_MASK_Lu;
+ case 'l': return UC_CATEGORY_MASK_Ll;
+ case 't': return UC_CATEGORY_MASK_Lt;
+ case 'm': return UC_CATEGORY_MASK_Lm;
+ case 'o': return UC_CATEGORY_MASK_Lo;
+ }
+ break;
case 'M':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_M;
- case 'n': return UC_CATEGORY_MASK_Mn;
- case 'c': return UC_CATEGORY_MASK_Mc;
- case 'e': return UC_CATEGORY_MASK_Me;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_M;
+ case 'n': return UC_CATEGORY_MASK_Mn;
+ case 'c': return UC_CATEGORY_MASK_Mc;
+ case 'e': return UC_CATEGORY_MASK_Me;
+ }
+ break;
case 'N':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_N;
- case 'd': return UC_CATEGORY_MASK_Nd;
- case 'l': return UC_CATEGORY_MASK_Nl;
- case 'o': return UC_CATEGORY_MASK_No;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_N;
+ case 'd': return UC_CATEGORY_MASK_Nd;
+ case 'l': return UC_CATEGORY_MASK_Nl;
+ case 'o': return UC_CATEGORY_MASK_No;
+ }
+ break;
case 'P':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_P;
- case 'c': return UC_CATEGORY_MASK_Pc;
- case 'd': return UC_CATEGORY_MASK_Pd;
- case 's': return UC_CATEGORY_MASK_Ps;
- case 'e': return UC_CATEGORY_MASK_Pe;
- case 'i': return UC_CATEGORY_MASK_Pi;
- case 'f': return UC_CATEGORY_MASK_Pf;
- case 'o': return UC_CATEGORY_MASK_Po;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_P;
+ case 'c': return UC_CATEGORY_MASK_Pc;
+ case 'd': return UC_CATEGORY_MASK_Pd;
+ case 's': return UC_CATEGORY_MASK_Ps;
+ case 'e': return UC_CATEGORY_MASK_Pe;
+ case 'i': return UC_CATEGORY_MASK_Pi;
+ case 'f': return UC_CATEGORY_MASK_Pf;
+ case 'o': return UC_CATEGORY_MASK_Po;
+ }
+ break;
case 'S':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_S;
- case 'm': return UC_CATEGORY_MASK_Sm;
- case 'c': return UC_CATEGORY_MASK_Sc;
- case 'k': return UC_CATEGORY_MASK_Sk;
- case 'o': return UC_CATEGORY_MASK_So;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_S;
+ case 'm': return UC_CATEGORY_MASK_Sm;
+ case 'c': return UC_CATEGORY_MASK_Sc;
+ case 'k': return UC_CATEGORY_MASK_Sk;
+ case 'o': return UC_CATEGORY_MASK_So;
+ }
+ break;
case 'Z':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_Z;
- case 's': return UC_CATEGORY_MASK_Zs;
- case 'l': return UC_CATEGORY_MASK_Zl;
- case 'p': return UC_CATEGORY_MASK_Zp;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_Z;
+ case 's': return UC_CATEGORY_MASK_Zs;
+ case 'l': return UC_CATEGORY_MASK_Zl;
+ case 'p': return UC_CATEGORY_MASK_Zp;
+ }
+ break;
case 'C':
- switch (category_name[1])
- {
- case '\0': return UC_CATEGORY_MASK_C;
- case 'c': return UC_CATEGORY_MASK_Cc;
- case 'f': return UC_CATEGORY_MASK_Cf;
- case 's': return UC_CATEGORY_MASK_Cs;
- case 'o': return UC_CATEGORY_MASK_Co;
- case 'n': return UC_CATEGORY_MASK_Cn;
- }
- break;
+ switch (category_name[1])
+ {
+ case '\0': return UC_CATEGORY_MASK_C;
+ case 'c': return UC_CATEGORY_MASK_Cc;
+ case 'f': return UC_CATEGORY_MASK_Cf;
+ case 's': return UC_CATEGORY_MASK_Cs;
+ case 'o': return UC_CATEGORY_MASK_Co;
+ case 'n': return UC_CATEGORY_MASK_Cn;
+ }
+ break;
}
/* Invalid category name. */
abort ();
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Categories of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
unsigned int log2_value;
if (is_category_Cs (ch))
- value = UC_CATEGORY_MASK_Cs;
+ value = UC_CATEGORY_MASK_Cs;
else if (unicode_attributes[ch].name != NULL)
- value = general_category_byname (unicode_attributes[ch].category);
+ value = general_category_byname (unicode_attributes[ch].category);
else
- continue;
+ continue;
/* Now value should contain exactly one bit. */
if (value == 0 || ((value & (value - 1)) != 0))
- abort ();
+ abort ();
for (log2_value = 0; value > 1; value >>= 1, log2_value++);
for (i = 0; i < 5; i++)
fprintf (stream, "#define category_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
- (1 << t.p) * 5 / 16);
+ (1 << t.p) * 5 / 16);
fprintf (stream, " }\n");
fprintf (stream, "u_category =\n");
fprintf (stream, "{\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%04x", level3_packed[i]);
if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
fprintf (stream, "\n ");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Combining class of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
for (ch = 0; ch < 0x110000; ch++)
if (unicode_attributes[ch].name != NULL)
{
- int value = atoi (unicode_attributes[ch].combining);
+ int value = atoi (unicode_attributes[ch].combining);
if (!(value >= 0 && value <= 255))
- abort ();
- combclass_table_add (&t, ch, value);
+ abort ();
+ combclass_table_add (&t, ch, value);
}
combclass_table_finalize (&t);
for (i = 0; i < 5; i++)
fprintf (stream, "#define combclass_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << t.p; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
if (i+1 < t.level3_size << t.p)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << t.p > 8)
fprintf (stream, "\n ");
{
case 'A':
switch (category_name[1])
- {
- case 'L':
- if (category_name[2] == '\0')
- return UC_BIDI_AL;
- break;
- case 'N':
- if (category_name[2] == '\0')
- return UC_BIDI_AN;
- break;
- }
+ {
+ case 'L':
+ if (category_name[2] == '\0')
+ return UC_BIDI_AL;
+ break;
+ case 'N':
+ if (category_name[2] == '\0')
+ return UC_BIDI_AN;
+ break;
+ }
break;
case 'B':
switch (category_name[1])
- {
- case '\0':
- return UC_BIDI_B;
- case 'N':
- if (category_name[2] == '\0')
- return UC_BIDI_BN;
- break;
- }
+ {
+ case '\0':
+ return UC_BIDI_B;
+ case 'N':
+ if (category_name[2] == '\0')
+ return UC_BIDI_BN;
+ break;
+ }
break;
case 'C':
switch (category_name[1])
- {
- case 'S':
- if (category_name[2] == '\0')
- return UC_BIDI_CS;
- break;
- }
+ {
+ case 'S':
+ if (category_name[2] == '\0')
+ return UC_BIDI_CS;
+ break;
+ }
break;
case 'E':
switch (category_name[1])
- {
- case 'N':
- if (category_name[2] == '\0')
- return UC_BIDI_EN;
- break;
- case 'S':
- if (category_name[2] == '\0')
- return UC_BIDI_ES;
- break;
- case 'T':
- if (category_name[2] == '\0')
- return UC_BIDI_ET;
- break;
- }
+ {
+ case 'N':
+ if (category_name[2] == '\0')
+ return UC_BIDI_EN;
+ break;
+ case 'S':
+ if (category_name[2] == '\0')
+ return UC_BIDI_ES;
+ break;
+ case 'T':
+ if (category_name[2] == '\0')
+ return UC_BIDI_ET;
+ break;
+ }
break;
case 'L':
switch (category_name[1])
- {
- case '\0':
- return UC_BIDI_L;
- case 'R':
- switch (category_name[2])
- {
- case 'E':
- if (category_name[3] == '\0')
- return UC_BIDI_LRE;
- break;
- case 'O':
- if (category_name[3] == '\0')
- return UC_BIDI_LRO;
- break;
- }
- break;
- }
+ {
+ case '\0':
+ return UC_BIDI_L;
+ case 'R':
+ switch (category_name[2])
+ {
+ case 'E':
+ if (category_name[3] == '\0')
+ return UC_BIDI_LRE;
+ break;
+ case 'O':
+ if (category_name[3] == '\0')
+ return UC_BIDI_LRO;
+ break;
+ }
+ break;
+ }
break;
case 'N':
switch (category_name[1])
- {
- case 'S':
- switch (category_name[2])
- {
- case 'M':
- if (category_name[3] == '\0')
- return UC_BIDI_NSM;
- break;
- }
- break;
- }
+ {
+ case 'S':
+ switch (category_name[2])
+ {
+ case 'M':
+ if (category_name[3] == '\0')
+ return UC_BIDI_NSM;
+ break;
+ }
+ break;
+ }
break;
case 'O':
switch (category_name[1])
- {
- case 'N':
- if (category_name[2] == '\0')
- return UC_BIDI_ON;
- break;
- }
+ {
+ case 'N':
+ if (category_name[2] == '\0')
+ return UC_BIDI_ON;
+ break;
+ }
break;
case 'P':
switch (category_name[1])
- {
- case 'D':
- switch (category_name[2])
- {
- case 'F':
- if (category_name[3] == '\0')
- return UC_BIDI_PDF;
- break;
- }
- break;
- }
+ {
+ case 'D':
+ switch (category_name[2])
+ {
+ case 'F':
+ if (category_name[3] == '\0')
+ return UC_BIDI_PDF;
+ break;
+ }
+ break;
+ }
break;
case 'R':
switch (category_name[1])
- {
- case '\0':
- return UC_BIDI_R;
- case 'L':
- switch (category_name[2])
- {
- case 'E':
- if (category_name[3] == '\0')
- return UC_BIDI_RLE;
- break;
- case 'O':
- if (category_name[3] == '\0')
- return UC_BIDI_RLO;
- break;
- }
- break;
- }
+ {
+ case '\0':
+ return UC_BIDI_R;
+ case 'L':
+ switch (category_name[2])
+ {
+ case 'E':
+ if (category_name[3] == '\0')
+ return UC_BIDI_RLE;
+ break;
+ case 'O':
+ if (category_name[3] == '\0')
+ return UC_BIDI_RLO;
+ break;
+ }
+ break;
+ }
break;
case 'S':
if (category_name[1] == '\0')
- return UC_BIDI_S;
+ return UC_BIDI_S;
break;
case 'W':
switch (category_name[1])
- {
- case 'S':
- if (category_name[2] == '\0')
- return UC_BIDI_WS;
- break;
- }
+ {
+ case 'S':
+ if (category_name[2] == '\0')
+ return UC_BIDI_WS;
+ break;
+ }
break;
}
/* Invalid bidi category name. */
else
{
/* The bidi category of unassigned characters depends on the range.
- See UTR #9 and DerivedBidiClass.txt. */
+ See UTR #9 and DerivedBidiClass.txt. */
if ((ch >= 0x0590 && ch <= 0x05FF)
- || (ch >= 0x07FB && ch <= 0x08FF)
- || (ch >= 0xFB37 && ch <= 0xFB45)
- || (ch >= 0x10800 && ch <= 0x10FFF))
- return UC_BIDI_R;
+ || (ch >= 0x07FB && ch <= 0x08FF)
+ || (ch >= 0xFB37 && ch <= 0xFB45)
+ || (ch >= 0x10800 && ch <= 0x10FFF))
+ return UC_BIDI_R;
else if ((ch >= 0x0600 && ch <= 0x07BF)
- || (ch >= 0x2064 && ch <= 0x2069)
- || (ch >= 0xFBB2 && ch <= 0xFDCF)
- || (ch >= 0xFDFE && ch <= 0xFEFE))
- return UC_BIDI_AL;
+ || (ch >= 0x2064 && ch <= 0x2069)
+ || (ch >= 0xFBB2 && ch <= 0xFDCF)
+ || (ch >= 0xFDFE && ch <= 0xFEFE))
+ return UC_BIDI_AL;
else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
- || (ch >= 0xFFF0 && ch <= 0xFFFF)
- || (ch & 0xFFFF) == 0xFFFE
- || (ch & 0xFFFF) == 0xFFFF
- || (ch >= 0xE0000 && ch <= 0xE0FFF))
- return UC_BIDI_BN;
+ || (ch >= 0xFFF0 && ch <= 0xFFFF)
+ || (ch & 0xFFFF) == 0xFFFE
+ || (ch & 0xFFFF) == 0xFFFF
+ || (ch >= 0xE0000 && ch <= 0xE0FFF))
+ return UC_BIDI_BN;
else
- return UC_BIDI_L;
+ return UC_BIDI_L;
}
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
for (i = 0; i < 5; i++)
fprintf (stream, "#define bidi_category_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
- (1 << t.p) * 5 / 16);
+ (1 << t.p) * 5 / 16);
fprintf (stream, " }\n");
fprintf (stream, "u_bidi_category =\n");
fprintf (stream, "{\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%04x", level3_packed[i]);
if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
fprintf (stream, "\n ");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
need_comma = false;
for (ch = 0; ch < 0x110000; ch++)
int value = get_decdigit_value (ch);
if (!(value >= -1 && value < 10))
- abort ();
+ abort ();
if (value >= 0)
- {
- if (need_comma)
- fprintf (stream, ",\n");
- fprintf (stream, " { 0x%04X, %d }", ch, value);
- need_comma = true;
- }
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %d }", ch, value);
+ need_comma = true;
+ }
}
if (need_comma)
fprintf (stream, "\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
int value = 1 + get_decdigit_value (ch);
if (!(value >= 0 && value <= 10))
- abort ();
+ abort ();
decdigit_table_add (&t, ch, value);
}
for (i = 0; i < 5; i++)
fprintf (stream, "#define decdigit_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
- t.p - 1);
+ t.p - 1);
fprintf (stream, " }\n");
fprintf (stream, "u_decdigit =\n");
fprintf (stream, "{\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << (t.p - 1); i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%02x",
- ((uint8_t *) (t.result + level3_offset))[2*i]
- + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
+ ((uint8_t *) (t.result + level3_offset))[2*i]
+ + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
if (i+1 < t.level3_size << (t.p - 1))
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << (t.p - 1) > 8)
fprintf (stream, "\n ");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
need_comma = false;
for (ch = 0; ch < 0x110000; ch++)
int value = get_digit_value (ch);
if (!(value >= -1 && value < 10))
- abort ();
+ abort ();
if (value >= 0)
- {
- if (need_comma)
- fprintf (stream, ",\n");
- fprintf (stream, " { 0x%04X, %d }", ch, value);
- need_comma = true;
- }
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %d }", ch, value);
+ need_comma = true;
+ }
}
if (need_comma)
fprintf (stream, "\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Digit values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
int value = 1 + get_digit_value (ch);
if (!(value >= 0 && value <= 10))
- abort ();
+ abort ();
decdigit_table_add (&t, ch, value);
}
for (i = 0; i < 5; i++)
fprintf (stream, "#define digit_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
- t.p - 1);
+ t.p - 1);
fprintf (stream, " }\n");
fprintf (stream, "u_digit =\n");
fprintf (stream, "{\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << (t.p - 1); i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%02x",
- ((uint8_t *) (t.result + level3_offset))[2*i]
- + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
+ ((uint8_t *) (t.result + level3_offset))[2*i]
+ + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
if (i+1 < t.level3_size << (t.p - 1))
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << (t.p - 1) > 8)
fprintf (stream, "\n ");
/* str is of the form "integer" or "integer/posinteger". */
value.numerator = atoi (str);
if (strchr (str, '/') != NULL)
- value.denominator = atoi (strchr (str, '/') + 1);
+ value.denominator = atoi (strchr (str, '/') + 1);
else
- value.denominator = 1;
+ value.denominator = 1;
}
else
{
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Numeric values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
need_comma = false;
for (ch = 0; ch < 0x110000; ch++)
uc_fraction_t value = get_numeric_value (ch);
if (value.numerator != 0 || value.denominator != 0)
- {
- if (need_comma)
- fprintf (stream, ",\n");
- fprintf (stream, " { 0x%04X, %d, %d }",
- ch, value.numerator, value.denominator);
- need_comma = true;
- }
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %d, %d }",
+ ch, value.numerator, value.denominator);
+ need_comma = true;
+ }
}
if (need_comma)
fprintf (stream, "\n");
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Numeric values of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
/* Create table of occurring fractions. */
nfractions = 0;
uc_fraction_t value = get_numeric_value (ch);
for (i = 0; i < nfractions; i++)
- if (value.numerator == fractions[i].numerator
- && value.denominator == fractions[i].denominator)
- break;
+ if (value.numerator == fractions[i].numerator
+ && value.denominator == fractions[i].denominator)
+ break;
if (i == nfractions)
- {
- if (nfractions == 128)
- abort ();
- for (i = 0; i < nfractions; i++)
- if (value.denominator < fractions[i].denominator
- || (value.denominator == fractions[i].denominator
- && value.numerator < fractions[i].numerator))
- break;
- for (j = nfractions; j > i; j--)
- fractions[j] = fractions[j - 1];
- fractions[i] = value;
- nfractions++;
- }
+ {
+ if (nfractions == 128)
+ abort ();
+ for (i = 0; i < nfractions; i++)
+ if (value.denominator < fractions[i].denominator
+ || (value.denominator == fractions[i].denominator
+ && value.numerator < fractions[i].numerator))
+ break;
+ for (j = nfractions; j > i; j--)
+ fractions[j] = fractions[j - 1];
+ fractions[i] = value;
+ nfractions++;
+ }
}
fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
- nfractions);
+ nfractions);
fprintf (stream, "{\n");
for (i = 0; i < nfractions; i++)
{
fprintf (stream, " { %d, %d }", fractions[i].numerator,
- fractions[i].denominator);
+ fractions[i].denominator);
if (i+1 < nfractions)
- fprintf (stream, ",");
+ fprintf (stream, ",");
fprintf (stream, "\n");
}
fprintf (stream, "};\n");
uc_fraction_t value = get_numeric_value (ch);
for (i = 0; i < nfractions; i++)
- if (value.numerator == fractions[i].numerator
- && value.denominator == fractions[i].denominator)
- break;
+ if (value.numerator == fractions[i].numerator
+ && value.denominator == fractions[i].denominator)
+ break;
if (i == nfractions)
- abort ();
+ abort ();
numeric_table_add (&t, ch, i);
}
for (i = 0; i < 5; i++)
fprintf (stream, "#define numeric_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
- (1 << t.p) * 7 / 16);
+ (1 << t.p) * 7 / 16);
fprintf (stream, " }\n");
fprintf (stream, "u_numeric =\n");
fprintf (stream, "{\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " 0x%04x", level3_packed[i]);
if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
fprintf (stream, "\n ");
unsigned int i;
mirrored = (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].mirrored);
+ && unicode_attributes[ch].mirrored);
mirror_char = 0xfffd;
for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
if (ch == mirror_pairs[i][0])
{
- mirror_char = mirror_pairs[i][1];
- break;
+ mirror_char = mirror_pairs[i][1];
+ break;
}
else if (ch == mirror_pairs[i][1])
{
- mirror_char = mirror_pairs[i][0];
- break;
+ mirror_char = mirror_pairs[i][0];
+ break;
}
if (mirrored)
return (int) mirror_char - (int) ch;
else
{
if (mirror_char != 0xfffd)
- abort ();
+ abort ();
return 0;
}
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Mirrored Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
for (i = 0; i < 5; i++)
fprintf (stream, "#define mirror_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (int32_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (int32_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << t.p; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
if (i+1 < t.level3_size << t.p)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << t.p > 8)
fprintf (stream, "\n ");
/* ========================================================================= */
+/* Particular values of the word break property.
+   The predicates in this section compare CH against a fixed, hard-coded
+   list of code points; they do not consult any table read from a data file.
+   is_WBP_MIDNUMLET returns true iff CH equals one of the eight code points
+   enumerated in its body.
+   NOTE(review): presumably these lists mirror the corresponding values in
+   WordBreakProperty.txt for the targeted Unicode version -- confirm when
+   updating to a new Unicode version.  */
+
+static bool
+is_WBP_MIDNUMLET (unsigned int ch)
+{
+ return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
+ || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
+}
+
+static bool
+is_WBP_MIDLETTER (unsigned int ch)
+{ /* Returns true iff CH equals one of the eight hard-coded code points
+     below; no data table is consulted.
+     NOTE(review): presumably mirrors the MidLetter value of the word break
+     property for the targeted Unicode version -- confirm when updating.  */
+ return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
+ || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
+}
+
+/* ========================================================================= */
+
/* Properties. */
/* Reading PropList.txt and DerivedCoreProperties.txt. */
PROP_ALPHABETIC,
PROP_LOWERCASE,
PROP_UPPERCASE,
+ PROP_CASED,
+ PROP_CASE_IGNORABLE,
+ PROP_CHANGES_WHEN_LOWERCASED,
+ PROP_CHANGES_WHEN_UPPERCASED,
+ PROP_CHANGES_WHEN_TITLECASED,
+ PROP_CHANGES_WHEN_CASEFOLDED,
+ PROP_CHANGES_WHEN_CASEMAPPED,
PROP_ID_START,
PROP_ID_CONTINUE,
PROP_XID_START,
unsigned int propvalue;
if (fscanf (stream, "%200[^\n]\n", buf) < 1)
- break;
+ break;
if (buf[0] == '\0' || buf[0] == '#')
- continue;
+ continue;
if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
- {
- if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
- {
- fprintf (stderr, "parse error in '%s'\n", proplist_filename);
- exit (1);
- }
- i2 = i1;
- }
+ {
+ if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
+ {
+ fprintf (stderr, "parse error in '%s'\n", proplist_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
#define PROP(name,value) \
if (strcmp (propname, name) == 0) propvalue = value; else
/* PropList.txt */
PROP ("Alphabetic", PROP_ALPHABETIC)
PROP ("Lowercase", PROP_LOWERCASE)
PROP ("Uppercase", PROP_UPPERCASE)
+ PROP ("Cased", PROP_CASED)
+ PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
+ PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
+ PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
+ PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
+ PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
+ PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
PROP ("ID_Start", PROP_ID_START)
PROP ("ID_Continue", PROP_ID_CONTINUE)
PROP ("XID_Start", PROP_XID_START)
PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
#undef PROP
- {
- fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
- proplist_filename);
- exit (1);
- }
+ {
+ fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
+ proplist_filename);
+ exit (1);
+ }
if (!(i1 <= i2 && i2 < 0x110000))
- abort ();
+ abort ();
for (i = i1; i <= i2; i++)
- unicode_properties[i] |= 1ULL << propvalue;
+ unicode_properties[i] |= 1ULL << propvalue;
}
if (ferror (stream) || fclose (stream))
do
{
if (fscanf (stream, "%100[^\n]\n", buf) < 1)
- {
- fprintf (stderr, "no property found in '%s'\n", proplist_filename);
- exit (1);
- }
+ {
+ fprintf (stderr, "no property found in '%s'\n", proplist_filename);
+ exit (1);
+ }
}
while (strstr (buf, property_name) == NULL);
unsigned int i1, i2;
if (fscanf (stream, "%100[^\n]\n", buf) < 1)
- break;
+ break;
if (buf[0] == '*')
- break;
+ break;
if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
- {
- if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
- {
- fprintf (stderr, "parse error in property in '%s'\n",
- proplist_filename);
- exit (1);
- }
- }
+ {
+ if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
+ {
+ fprintf (stderr, "parse error in property in '%s'\n",
+ proplist_filename);
+ exit (1);
+ }
+ }
else if (strlen (buf) >= 4)
- {
- if (sscanf (buf, "%4X", &i1) < 1)
- {
- fprintf (stderr, "parse error in property in '%s'\n",
- proplist_filename);
- exit (1);
- }
- i2 = i1;
- }
+ {
+ if (sscanf (buf, "%4X", &i1) < 1)
+ {
+ fprintf (stderr, "parse error in property in '%s'\n",
+ proplist_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
else
- {
- fprintf (stderr, "parse error in property in '%s'\n",
- proplist_filename);
- exit (1);
- }
+ {
+ fprintf (stderr, "parse error in property in '%s'\n",
+ proplist_filename);
+ exit (1);
+ }
if (!(i1 <= i2 && i2 < 0x110000))
- abort ();
+ abort ();
for (i = i1; i <= i2; i++)
- array[i] = 1;
+ array[i] = 1;
}
+
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", proplist_filename);
|| (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
|| (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
|| (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
+ || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
|| (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
|| (ch == 0x10341) /* GOTHIC LETTER NINETY */
|| (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
bool result1 =
(is_category_Cf (ch)
&& !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
- && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
+ && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
+ /* For some reason, the following are not listed as having property
+ Default_Ignorable_Code_Point. */
+ && !(ch == 0x110BD))
|| ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
|| ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
bool result2 =
{
/* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */
return (ch >= 0xE000 && ch <= 0xF8FF)
- || (ch >= 0xF0000 && ch <= 0xFFFFD)
- || (ch >= 0x100000 && ch <= 0x10FFFD);
+ || (ch >= 0xF0000 && ch <= 0xFFFFD)
+ || (ch >= 0x100000 && ch <= 0x10FFFD);
}
/* See PropList-3.0.1.txt. */
return is_category_Lt (ch);
}
+/* See DerivedCoreProperties.txt.
+   Returns whether CH is "cased".  The answer is computed in two independent
+   ways: RESULT1 is derived from is_property_lowercase, is_property_uppercase
+   and is_category_Lt; RESULT2 is the PROP_CASED bit of unicode_properties[ch].
+   If the two disagree, this is treated as an internal inconsistency and the
+   program aborts; otherwise the common value is returned.  */
+static bool
+is_property_cased (unsigned int ch)
+{
+ bool result1 = (is_property_lowercase (ch)
+ || is_property_uppercase (ch)
+ || is_category_Lt (ch));
+ bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
+
+ if (result1 != result2)
+ abort ();
+ return result1;
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns whether CH is "case ignorable".  RESULT1 is derived locally: CH is
+   accepted by is_WBP_MIDLETTER or is_WBP_MIDNUMLET, or belongs to one of the
+   general categories Mn, Me, Cf, Lm, Sk.  RESULT2 is the PROP_CASE_IGNORABLE
+   bit of unicode_properties[ch].  The two must agree; a mismatch aborts the
+   program.  */
+static bool
+is_property_case_ignorable (unsigned int ch)
+{
+ bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+ || is_category_Mn (ch)
+ || is_category_Me (ch)
+ || is_category_Cf (ch)
+ || is_category_Lm (ch)
+ || is_category_Sk (ch));
+ bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
+
+ if (result1 != result2)
+ abort ();
+ return result1;
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns whether CH has the PROP_CHANGES_WHEN_LOWERCASED bit set in
+   unicode_properties[ch] (RESULT1).  As a consistency check, RESULT2 is
+   computed from unicode_attributes[ch]: the entry must have a name, and its
+   'lower' field must be neither NONE nor CH itself.  If the two results
+   differ, the program aborts.  */
+static bool
+is_property_changes_when_lowercased (unsigned int ch)
+{
+ bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
+ bool result2 = (unicode_attributes[ch].name != NULL
+ && unicode_attributes[ch].lower != NONE
+ && unicode_attributes[ch].lower != ch);
+
+ if (result1 != result2)
+ abort ();
+ return result1;
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns true iff the PROP_CHANGES_WHEN_UPPERCASED bit is set in
+   unicode_properties[ch].  The value is taken from that bit alone; no
+   cross-check against unicode_attributes is performed here.  */
+static bool
+is_property_changes_when_uppercased (unsigned int ch)
+{
+ return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns true iff the PROP_CHANGES_WHEN_TITLECASED bit is set in
+   unicode_properties[ch].  The value is taken from that bit alone; no
+   cross-check against unicode_attributes is performed here.  */
+static bool
+is_property_changes_when_titlecased (unsigned int ch)
+{
+ return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns true iff the PROP_CHANGES_WHEN_CASEFOLDED bit is set in
+   unicode_properties[ch].  The value is taken from that bit alone; no
+   independent derivation or consistency check is performed here.  */
+static bool
+is_property_changes_when_casefolded (unsigned int ch)
+{
+ return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.
+   Returns true iff the PROP_CHANGES_WHEN_CASEMAPPED bit is set in
+   unicode_properties[ch].  The value is taken from that bit alone; no
+   independent derivation or consistency check is performed here.  */
+static bool
+is_property_changes_when_casemapped (unsigned int ch)
+{
+ return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
+}
+
/* See PropList.txt, UCD.html. */
static bool
is_property_soft_dotted (unsigned int ch)
{
int category = get_bidi_category (ch);
return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
- || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
+ || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
}
/* See PropList-3.0.1.txt. */
is_property_zero_width (unsigned int ch)
{
return is_category_Cf (ch)
- || (unicode_attributes[ch].name != NULL
- && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
+ || (unicode_attributes[ch].name != NULL
+ && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
}
/* See PropList-3.0.1.txt. */
/* This is exactly the set of characters having line breaking
property GL. */
return (ch == 0x00A0 /* NO-BREAK SPACE */
- || ch == 0x034F /* COMBINING GRAPHEME JOINER */
- || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
- || ch == 0x035D /* COMBINING DOUBLE BREVE */
- || ch == 0x035E /* COMBINING DOUBLE MACRON */
- || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
- || ch == 0x0360 /* COMBINING DOUBLE TILDE */
- || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
- || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
- || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
- || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
- || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
- || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
- || ch == 0x2007 /* FIGURE SPACE */
- || ch == 0x2011 /* NON-BREAKING HYPHEN */
- || ch == 0x202F /* NARROW NO-BREAK SPACE */);
+ || ch == 0x034F /* COMBINING GRAPHEME JOINER */
+ || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
+ || ch == 0x035D /* COMBINING DOUBLE BREVE */
+ || ch == 0x035E /* COMBINING DOUBLE MACRON */
+ || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
+ || ch == 0x0360 /* COMBINING DOUBLE TILDE */
+ || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
+ || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
+ || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
+ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
+ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
+ || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
+ || ch == 0x2007 /* FIGURE SPACE */
+ || ch == 0x2011 /* NON-BREAKING HYPHEN */
+ || ch == 0x202F /* NARROW NO-BREAK SPACE */);
}
/* See PropList-3.0.1.txt. */
is_property_format_control (unsigned int ch)
{
return (is_category_Cf (ch)
- && get_bidi_category (ch) == UC_BIDI_BN
- && !is_property_join_control (ch)
- && ch != 0xFEFF);
+ && get_bidi_category (ch) == UC_BIDI_BN
+ && !is_property_join_control (ch)
+ && ch != 0xFEFF);
}
/* See PropList.txt, UCD.html. */
is_property_combining (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && (strcmp (unicode_attributes[ch].combining, "0") != 0
- || is_category_Mc (ch)
- || is_category_Me (ch)
- || is_category_Mn (ch)));
+ && (strcmp (unicode_attributes[ch].combining, "0") != 0
+ || is_category_Mc (ch)
+ || is_category_Me (ch)
+ || is_category_Mn (ch)));
}
#if 0 /* same as is_property_bidi_non_spacing_mark */
is_property_non_spacing (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
- && get_bidi_category (ch) == UC_BIDI_NSM);
+ && get_bidi_category (ch) == UC_BIDI_NSM);
}
#endif
&& unicode_attributes[ch].decomposition != NULL)
{
/* Test whether the decomposition contains more than one character,
- and the first is not a space. */
+ and the first is not a space. */
const char *decomp = unicode_attributes[ch].decomposition;
if (decomp[0] == '<')
- {
- decomp = strchr (decomp, '>') + 1;
- if (decomp[0] == ' ')
- decomp++;
- }
+ {
+ decomp = strchr (decomp, '>') + 1;
+ if (decomp[0] == ' ')
+ decomp++;
+ }
return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
}
return false;
is_property_numeric (unsigned int ch)
{
return ((get_numeric_value (ch)).denominator > 0)
- || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
- || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
+ || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
+ || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
}
/* See PropList.txt, UCD.html. */
is_property_ignorable_control (unsigned int ch)
{
return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
- || is_category_Cf (ch))
- && ch != 0x0000;
+ || is_category_Cf (ch))
+ && ch != 0x0000;
}
/* ------------------------------------------------------------------------- */
PROPERTY(lowercase)
PROPERTY(other_lowercase)
PROPERTY(titlecase)
+ PROPERTY(cased)
+ PROPERTY(case_ignorable)
+ PROPERTY(changes_when_lowercased)
+ PROPERTY(changes_when_uppercased)
+ PROPERTY(changes_when_titlecased)
+ PROPERTY(changes_when_casefolded)
+ PROPERTY(changes_when_casemapped)
PROPERTY(soft_dotted)
PROPERTY(id_start)
PROPERTY(other_id_start)
/* ========================================================================= */
-/* Scripts. */
+/* Arabic Shaping. */
-static const char *scripts[256];
-static unsigned int numscripts;
+enum /* Joining type codes; each member's comment gives the long name.
+        Members are numbered consecutively from 0, so none of them collides
+        with the (uint8_t)~(uint8_t)0 filler that fill_arabicshaping writes
+        into unicode_joining_type[] before parsing.
+        NOTE(review): presumably these are the values stored in
+        unicode_joining_type[] -- confirm against fill_arabicshaping.  */
+{
+ UC_JOINING_TYPE_U, /* Non_Joining */
+ UC_JOINING_TYPE_T, /* Transparent */
+ UC_JOINING_TYPE_C, /* Join_Causing */
+ UC_JOINING_TYPE_L, /* Left_Joining */
+ UC_JOINING_TYPE_R, /* Right_Joining */
+ UC_JOINING_TYPE_D /* Dual_Joining */
+};
-static uint8_t unicode_scripts[0x110000];
+static uint8_t unicode_joining_type[0x110000];
+
+enum /* Joining group codes; each member's comment gives the long name.
+        UC_JOINING_GROUP_NONE is the first member (value 0) and is the value
+        fill_arabicshaping assigns to every entry of unicode_joining_group[]
+        before parsing.  The enum has fewer than 256 members, so every value
+        fits in the uint8_t element type of that array.  */
+{
+ UC_JOINING_GROUP_NONE, /* No_Joining_Group */
+ UC_JOINING_GROUP_AIN, /* Ain */
+ UC_JOINING_GROUP_ALAPH, /* Alaph */
+ UC_JOINING_GROUP_ALEF, /* Alef */
+ UC_JOINING_GROUP_BEH, /* Beh */
+ UC_JOINING_GROUP_BETH, /* Beth */
+ UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
+ UC_JOINING_GROUP_DAL, /* Dal */
+ UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
+ UC_JOINING_GROUP_E, /* E */
+ UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
+ UC_JOINING_GROUP_FE, /* Fe */
+ UC_JOINING_GROUP_FEH, /* Feh */
+ UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
+ UC_JOINING_GROUP_GAF, /* Gaf */
+ UC_JOINING_GROUP_GAMAL, /* Gamal */
+ UC_JOINING_GROUP_HAH, /* Hah */
+ UC_JOINING_GROUP_HE, /* He */
+ UC_JOINING_GROUP_HEH, /* Heh */
+ UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
+ UC_JOINING_GROUP_HETH, /* Heth */
+ UC_JOINING_GROUP_KAF, /* Kaf */
+ UC_JOINING_GROUP_KAPH, /* Kaph */
+ UC_JOINING_GROUP_KHAPH, /* Khaph */
+ UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
+ UC_JOINING_GROUP_LAM, /* Lam */
+ UC_JOINING_GROUP_LAMADH, /* Lamadh */
+ UC_JOINING_GROUP_MEEM, /* Meem */
+ UC_JOINING_GROUP_MIM, /* Mim */
+ UC_JOINING_GROUP_NOON, /* Noon */
+ UC_JOINING_GROUP_NUN, /* Nun */
+ UC_JOINING_GROUP_NYA, /* Nya */
+ UC_JOINING_GROUP_PE, /* Pe */
+ UC_JOINING_GROUP_QAF, /* Qaf */
+ UC_JOINING_GROUP_QAPH, /* Qaph */
+ UC_JOINING_GROUP_REH, /* Reh */
+ UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
+ UC_JOINING_GROUP_SAD, /* Sad */
+ UC_JOINING_GROUP_SADHE, /* Sadhe */
+ UC_JOINING_GROUP_SEEN, /* Seen */
+ UC_JOINING_GROUP_SEMKATH, /* Semkath */
+ UC_JOINING_GROUP_SHIN, /* Shin */
+ UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
+ UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
+ UC_JOINING_GROUP_TAH, /* Tah */
+ UC_JOINING_GROUP_TAW, /* Taw */
+ UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
+ UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
+ UC_JOINING_GROUP_TETH, /* Teth */
+ UC_JOINING_GROUP_WAW, /* Waw */
+ UC_JOINING_GROUP_YEH, /* Yeh */
+ UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
+ UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
+ UC_JOINING_GROUP_YUDH, /* Yudh */
+ UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
+ UC_JOINING_GROUP_ZAIN, /* Zain */
+ UC_JOINING_GROUP_ZHAIN /* Zhain */
+};
+
+static uint8_t unicode_joining_group[0x110000];
+/* Stores in unicode_joining_type[] and unicode_joining_group[] the values
+   parsed from the file named ARABICSHAPING_FILENAME.
+   Every entry is first reset to (uint8_t)~(uint8_t)0 resp.
+   UC_JOINING_GROUP_NONE, so code points the file does not list keep these
+   values.  Lines that are empty or start with '#' are ignored.
+   Exits with status 1 if the file cannot be opened or read, if a line does
+   not parse, or if it names an unknown joining type or joining group.
+   Aborts if a code point is >= 0x110000.  */
static void
-fill_scripts (const char *scripts_filename)
+fill_arabicshaping (const char *arabicshaping_filename)
{
FILE *stream;
unsigned int i;
+ int lineno; /* number of the line being read, used only in diagnostics */
- stream = fopen (scripts_filename, "r");
+ stream = fopen (arabicshaping_filename, "r");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+ fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
exit (1);
}
- numscripts = 0;
-
for (i = 0; i < 0x110000; i++)
- unicode_scripts[i] = (uint8_t)~(uint8_t)0;
+ {
+ unicode_joining_type[i] = (uint8_t)~(uint8_t)0; /* value kept by code points the file does not list */
+ unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
+ }
+ lineno = 0;
for (;;)
{
- char buf[200+1];
- unsigned int i1, i2;
- char padding[200+1];
- char scriptname[200+1];
- int script;
+ char buf[100+1];
+ char separator1[100+1];
+ char padding1[100+1];
+ char schematic_name[100+1];
+ char separator2[100+1];
+ char padding2[100+1];
+ char joining_type_name[100+1];
+ char separator3[100+1];
+ char padding3[100+1];
+ char joining_group_name[100+1];
+ int joining_type;
+ int joining_group;
- if (fscanf (stream, "%200[^\n]\n", buf) < 1)
- break;
+ lineno++;
+ if (fscanf (stream, "%100[^\n]\n", buf) < 1)
+ break;
if (buf[0] == '\0' || buf[0] == '#')
- continue;
-
- if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
- {
- if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
- {
- fprintf (stderr, "parse error in '%s'\n", scripts_filename);
- exit (1);
- }
- i2 = i1;
- }
- if (i2 < i1)
- abort ();
- if (i2 >= 0x110000)
- abort ();
-
- for (script = numscripts - 1; script >= 0; script--)
- if (strcmp (scripts[script], scriptname) == 0)
- break;
- if (script < 0)
- {
- scripts[numscripts] = strdup (scriptname);
- script = numscripts;
- numscripts++;
- if (numscripts == 256)
- abort ();
- }
+ continue;
+
+ /* A data line has four ';'-separated fields:
+    hex code point; schematic name; joining type; joining group.
+    All ten conversions (values, separators, paddings) must match.  */
+ if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
+ &i, separator1, padding1, schematic_name, separator2,
+ padding2, joining_type_name, separator3, padding3,
+ joining_group_name) != 10)
+ {
+ fprintf (stderr, "parse error in '%s':%d\n",
+ arabicshaping_filename, lineno);
+ exit (1);
+ }
+ if (i >= 0x110000)
+ abort ();
+
+ /* #name + 16 skips the 16-character prefix "UC_JOINING_TYPE_", so the
+    field is compared against the bare suffix (U, T, C, L, R or D).  */
+#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
+ if (false) {}
+ TRY(UC_JOINING_TYPE_U)
+ TRY(UC_JOINING_TYPE_T)
+ TRY(UC_JOINING_TYPE_C)
+ TRY(UC_JOINING_TYPE_L)
+ TRY(UC_JOINING_TYPE_R)
+ TRY(UC_JOINING_TYPE_D)
+#undef TRY
+ else
+ {
+ fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
+ joining_type_name, arabicshaping_filename, lineno);
+ exit (1);
+ }
+
+ /* Remove trailing spaces. */
+ while (joining_group_name[0] != '\0'
+ && joining_group_name[strlen (joining_group_name) - 1] == ' ')
+ joining_group_name[strlen (joining_group_name) - 1] = '\0';
+
+ /* Map the name as spelled in the file (upper case, words separated by
+    spaces) to the corresponding UC_JOINING_GROUP_* value.  */
+#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
+ if (false) {}
+ TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
+ TRY(UC_JOINING_GROUP_AIN, "AIN")
+ TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
+ TRY(UC_JOINING_GROUP_ALEF, "ALEF")
+ TRY(UC_JOINING_GROUP_BEH, "BEH")
+ TRY(UC_JOINING_GROUP_BETH, "BETH")
+ TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
+ TRY(UC_JOINING_GROUP_DAL, "DAL")
+ TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
+ TRY(UC_JOINING_GROUP_E, "E")
+ TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
+ TRY(UC_JOINING_GROUP_FE, "FE")
+ TRY(UC_JOINING_GROUP_FEH, "FEH")
+ TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
+ TRY(UC_JOINING_GROUP_GAF, "GAF")
+ TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
+ TRY(UC_JOINING_GROUP_HAH, "HAH")
+ TRY(UC_JOINING_GROUP_HE, "HE")
+ TRY(UC_JOINING_GROUP_HEH, "HEH")
+ TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
+ TRY(UC_JOINING_GROUP_HETH, "HETH")
+ TRY(UC_JOINING_GROUP_KAF, "KAF")
+ TRY(UC_JOINING_GROUP_KAPH, "KAPH")
+ TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
+ TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
+ TRY(UC_JOINING_GROUP_LAM, "LAM")
+ TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
+ TRY(UC_JOINING_GROUP_MEEM, "MEEM")
+ TRY(UC_JOINING_GROUP_MIM, "MIM")
+ TRY(UC_JOINING_GROUP_NOON, "NOON")
+ TRY(UC_JOINING_GROUP_NUN, "NUN")
+ TRY(UC_JOINING_GROUP_NYA, "NYA")
+ TRY(UC_JOINING_GROUP_PE, "PE")
+ TRY(UC_JOINING_GROUP_QAF, "QAF")
+ TRY(UC_JOINING_GROUP_QAPH, "QAPH")
+ TRY(UC_JOINING_GROUP_REH, "REH")
+ TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
+ TRY(UC_JOINING_GROUP_SAD, "SAD")
+ TRY(UC_JOINING_GROUP_SADHE, "SADHE")
+ TRY(UC_JOINING_GROUP_SEEN, "SEEN")
+ TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
+ TRY(UC_JOINING_GROUP_SHIN, "SHIN")
+ TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
+ TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
+ TRY(UC_JOINING_GROUP_TAH, "TAH")
+ TRY(UC_JOINING_GROUP_TAW, "TAW")
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
+ TRY(UC_JOINING_GROUP_TETH, "TETH")
+ TRY(UC_JOINING_GROUP_WAW, "WAW")
+ TRY(UC_JOINING_GROUP_YEH, "YEH")
+ TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
+ TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
+ TRY(UC_JOINING_GROUP_YUDH, "YUDH")
+ TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
+ TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
+ TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
+#undef TRY
+ else
+ {
+ fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
+ joining_group_name, arabicshaping_filename, lineno);
+ exit (1);
+ }
- for (i = i1; i <= i2; i++)
- {
- if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
- fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
- unicode_scripts[i] = script;
- }
+ unicode_joining_type[i] = joining_type;
+ unicode_joining_group[i] = joining_group;
}
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+ fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
exit (1);
}
}
-/* Construction of sparse 3-level tables. */
-#define TABLE script_table
-#define ELEMENT uint8_t
-#define DEFAULT (uint8_t)~(uint8_t)0
-#define xmalloc malloc
-#define xrealloc realloc
-#include "3level.h"
+/* Convert a Joining_Type value to a C identifier.
+   Returns, as a string literal, the name of the UC_JOINING_TYPE_* constant
+   whose value equals JOINING_TYPE.  Aborts if JOINING_TYPE equals none of
+   the six constants tried below.  */
+static const char *
+joining_type_as_c_identifier (int joining_type)
+{
+#define TRY(value) if (joining_type == value) return #value;
+ TRY(UC_JOINING_TYPE_U)
+ TRY(UC_JOINING_TYPE_T)
+ TRY(UC_JOINING_TYPE_C)
+ TRY(UC_JOINING_TYPE_L)
+ TRY(UC_JOINING_TYPE_R)
+ TRY(UC_JOINING_TYPE_D)
+#undef TRY
+ abort ();
+}
static void
-output_scripts (const char *version)
+output_joining_type_test (const char *filename, const char *version)
{
- const char *filename = "unictype/scripts.h";
FILE *stream;
- unsigned int ch, s, i;
- struct script_table t;
- unsigned int level1_offset, level2_offset, level3_offset;
-
- typedef struct
- {
- const char *lowercase_name;
- }
- scriptinfo_t;
- scriptinfo_t scriptinfo[256];
+ bool need_comma;
+ unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Unicode scripts. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
- for (s = 0; s < numscripts; s++)
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
{
- char *lcp = strdup (scripts[s]);
- char *cp;
+ int value = unicode_joining_type[ch];
- for (cp = lcp; *cp != '\0'; cp++)
- if (*cp >= 'A' && *cp <= 'Z')
- *cp += 'a' - 'A';
-
- scriptinfo[s].lowercase_name = lcp;
+ if (value != (uint8_t)~(uint8_t)0)
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
+ need_comma = true;
+ }
}
+ if (need_comma)
+ fprintf (stream, "\n");
- for (s = 0; s < numscripts; s++)
+ if (ferror (stream) || fclose (stream))
{
- fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, "{\n");
- i = 0;
- for (ch = 0; ch < 0x110000; ch++)
- if (unicode_scripts[ch] == s)
- {
- unsigned int start;
- unsigned int end;
-
- start = ch;
- while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
- ch++;
- end = ch;
-
- if (i > 0)
- fprintf (stream, ",\n");
- if (start == end)
- fprintf (stream, " { 0x%04X, 1, 1 }", start);
- else
- fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
- start, end);
- i++;
- }
- fprintf (stream, "\n");
- fprintf (stream, "};\n");
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
}
+}
- fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
- fprintf (stream, "{\n");
- for (s = 0; s < numscripts; s++)
+/* Construction of sparse 3-level tables.
+   The macros below parametrize "3level.h" for uint8_t elements with
+   (uint8_t)~(uint8_t)0 as default value.  The code that follows uses
+   struct joining_type_table and joining_type_table_init/_add/_finalize,
+   which are presumably produced by this inclusion (3level.h itself is not
+   visible here).  */
+#define TABLE joining_type_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+static void
+output_joining_type (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch, i;
+ struct joining_type_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+ uint8_t *level3_packed;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
{
- fprintf (stream, " {\n");
- fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, " script_%s_intervals,\n",
- scriptinfo[s].lowercase_name);
- fprintf (stream, " \"%s\"\n", scripts[s]);
- fprintf (stream, " }");
- if (s+1 < numscripts)
- fprintf (stream, ",");
- fprintf (stream, "\n");
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
}
- fprintf (stream, "};\n");
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
- script_table_init (&t);
+ joining_type_table_init (&t);
for (ch = 0; ch < 0x110000; ch++)
{
- unsigned int s = unicode_scripts[ch];
- if (s != (uint8_t)~(uint8_t)0)
- script_table_add (&t, ch, s);
+ uint8_t value = unicode_joining_type[ch];
+
+ joining_type_table_add (&t, ch, value);
}
- script_table_finalize (&t);
+ joining_type_table_finalize (&t);
/* Offsets in t.result, in memory of this process. */
level1_offset =
+ (t.level2_size << t.q) * sizeof (uint32_t);
for (i = 0; i < 5; i++)
- fprintf (stream, "#define script_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ fprintf (stream, "#define joining_type_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
+ (1 << t.p) * 4 / 8);
fprintf (stream, " }\n");
- fprintf (stream, "u_script =\n");
+ fprintf (stream, "u_joining_type =\n");
fprintf (stream, "{\n");
fprintf (stream, " {");
if (t.level1_size > 8)
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
fprintf (stream, " },\n");
+ /* Pack the level3 array. Each entry needs 4 bits only. */
+ level3_packed =
+ (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ unsigned int j = (i * 4) / 8;
+ unsigned int k = (i * 4) % 8;
+ uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
+ level3_packed[j] |= (value << k);
+ }
fprintf (stream, " {");
- if (t.level3_size << t.p > 8)
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
fprintf (stream, "\n ");
- for (i = 0; i < t.level3_size << t.p; i++)
+ for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
- if (i+1 < t.level3_size << t.p)
- fprintf (stream, ",");
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%02x", level3_packed[i]);
+ if (i+1 < (t.level3_size << t.p) * 4 / 8)
+ fprintf (stream, ",");
}
- if (t.level3_size << t.p > 8)
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
fprintf (stream, "\n ");
fprintf (stream, " }\n");
+ free (level3_packed);
fprintf (stream, "};\n");
if (ferror (stream) || fclose (stream))
}
}
+/* Convert a Joining_Group value to a C identifier.
+   Returns, as a string literal, the name of the UC_JOINING_GROUP_* constant
+   whose value equals JOINING_GROUP.  Aborts if JOINING_GROUP equals none of
+   the constants tried below.  */
+static const char *
+joining_group_as_c_identifier (int joining_group)
+{
+#define TRY(value) if (joining_group == value) return #value;
+ TRY(UC_JOINING_GROUP_NONE)
+ TRY(UC_JOINING_GROUP_AIN)
+ TRY(UC_JOINING_GROUP_ALAPH)
+ TRY(UC_JOINING_GROUP_ALEF)
+ TRY(UC_JOINING_GROUP_BEH)
+ TRY(UC_JOINING_GROUP_BETH)
+ TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
+ TRY(UC_JOINING_GROUP_DAL)
+ TRY(UC_JOINING_GROUP_DALATH_RISH)
+ TRY(UC_JOINING_GROUP_E)
+ TRY(UC_JOINING_GROUP_FARSI_YEH)
+ TRY(UC_JOINING_GROUP_FE)
+ TRY(UC_JOINING_GROUP_FEH)
+ TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
+ TRY(UC_JOINING_GROUP_GAF)
+ TRY(UC_JOINING_GROUP_GAMAL)
+ TRY(UC_JOINING_GROUP_HAH)
+ TRY(UC_JOINING_GROUP_HE)
+ TRY(UC_JOINING_GROUP_HEH)
+ TRY(UC_JOINING_GROUP_HEH_GOAL)
+ TRY(UC_JOINING_GROUP_HETH)
+ TRY(UC_JOINING_GROUP_KAF)
+ TRY(UC_JOINING_GROUP_KAPH)
+ TRY(UC_JOINING_GROUP_KHAPH)
+ TRY(UC_JOINING_GROUP_KNOTTED_HEH)
+ TRY(UC_JOINING_GROUP_LAM)
+ TRY(UC_JOINING_GROUP_LAMADH)
+ TRY(UC_JOINING_GROUP_MEEM)
+ TRY(UC_JOINING_GROUP_MIM)
+ TRY(UC_JOINING_GROUP_NOON)
+ TRY(UC_JOINING_GROUP_NUN)
+ TRY(UC_JOINING_GROUP_NYA)
+ TRY(UC_JOINING_GROUP_PE)
+ TRY(UC_JOINING_GROUP_QAF)
+ TRY(UC_JOINING_GROUP_QAPH)
+ TRY(UC_JOINING_GROUP_REH)
+ TRY(UC_JOINING_GROUP_REVERSED_PE)
+ TRY(UC_JOINING_GROUP_SAD)
+ TRY(UC_JOINING_GROUP_SADHE)
+ TRY(UC_JOINING_GROUP_SEEN)
+ TRY(UC_JOINING_GROUP_SEMKATH)
+ TRY(UC_JOINING_GROUP_SHIN)
+ TRY(UC_JOINING_GROUP_SWASH_KAF)
+ TRY(UC_JOINING_GROUP_SYRIAC_WAW)
+ TRY(UC_JOINING_GROUP_TAH)
+ TRY(UC_JOINING_GROUP_TAW)
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA)
+ TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
+ TRY(UC_JOINING_GROUP_TETH)
+ TRY(UC_JOINING_GROUP_WAW)
+ TRY(UC_JOINING_GROUP_YEH)
+ TRY(UC_JOINING_GROUP_YEH_BARREE)
+ TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
+ TRY(UC_JOINING_GROUP_YUDH)
+ TRY(UC_JOINING_GROUP_YUDH_HE)
+ TRY(UC_JOINING_GROUP_ZAIN)
+ TRY(UC_JOINING_GROUP_ZHAIN)
+#undef TRY
+ abort ();
+}
+
static void
-output_scripts_byname (const char *version)
+output_joining_group_test (const char *filename, const char *version)
{
- const char *filename = "unictype/scripts_byname.gperf";
FILE *stream;
- unsigned int s;
+ bool need_comma;
+ unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Unicode scripts. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
- fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
- fprintf (stream, "%%struct-type\n");
- fprintf (stream, "%%language=ANSI-C\n");
- fprintf (stream, "%%define hash-function-name scripts_hash\n");
- fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
- fprintf (stream, "%%readonly-tables\n");
- fprintf (stream, "%%global-table\n");
- fprintf (stream, "%%define word-array-name script_names\n");
- fprintf (stream, "%%%%\n");
- for (s = 0; s < numscripts; s++)
- fprintf (stream, "%s, %u\n", scripts[s], s);
+ fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ int value = unicode_joining_group[ch];
+
+ if (value != UC_JOINING_GROUP_NONE)
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
+ need_comma = true;
+ }
+ }
+ if (need_comma)
+ fprintf (stream, "\n");
if (ferror (stream) || fclose (stream))
{
}
}
-/* ========================================================================= */
-
-/* Blocks. */
+/* Writes to FILENAME a C table of the joining group of Unicode characters.
+   The output consists of a macro joining_group_header_0 holding the first
+   covered code point CH_MIN, and an array u_joining_group[] with one
+   UC_JOINING_GROUP_* entry per code point in [CH_MIN, CH_MAX], where CH_MIN
+   and CH_MAX are the smallest and largest code points whose entry in
+   unicode_joining_group[] differs from UC_JOINING_GROUP_NONE.
+   VERSION is the Unicode version mentioned in the generated header comment.
+   Exits with status 1 if FILENAME cannot be opened or written.  Aborts if
+   no code point has a joining group or if the interval spans 0x200 or more
+   code points.  */
+static void
+output_joining_group (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch_min, ch_max, ch, i;
-typedef struct { unsigned int start; unsigned int end; const char *name; }
- block_t;
-static block_t blocks[256];
-static unsigned int numblocks;
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ /* This file holds the joining group table, not the joining type table.  */
+ fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ /* Find the first code point that has a joining group.  */
+ ch_min = 0x10FFFF;
+ for (ch = 0; ch < 0x110000; ch++)
+ if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+ {
+ ch_min = ch;
+ break;
+ }
+
+ /* Find the last code point that has a joining group.  */
+ ch_max = 0;
+ for (ch = 0x10FFFF; ch > 0; ch--)
+ if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+ {
+ ch_max = ch;
+ break;
+ }
+
+ /* Fails when no code point has a joining group: ch_min is then still
+    0x10FFFF and ch_max still 0.  */
+ if (!(ch_min <= ch_max))
+ abort ();
+
+ /* If the interval [ch_min, ch_max] is too large, we should better use a
+ 3-level table. */
+ if (!(ch_max - ch_min < 0x200))
+ abort ();
+
+ fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
+ fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
+ ch_max + 1, ch_min);
+ fprintf (stream, "{");
+ /* Emit two entries per output line; the first entry of each line is
+    padded to a width of 38 characters so that the second column aligns.  */
+ for (i = 0; i <= ch_max - ch_min; i++)
+ {
+ const char *s;
+
+ ch = ch_min + i;
+ if ((i % 2) == 0)
+ fprintf (stream, "\n ");
+ s = joining_group_as_c_identifier (unicode_joining_group[ch]);
+ fprintf (stream, " %s", s);
+ if (i+1 <= ch_max - ch_min)
+ {
+ fprintf (stream, ",");
+ if (((i+1) % 2) != 0)
+ fprintf (stream, "%*s", 38 - (int) strlen (s), "");
+ }
+ }
+ fprintf (stream, "\n");
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* Scripts. */
+
+static const char *scripts[256]; /* script names, in order of first appearance in the input */
+static unsigned int numscripts; /* number of entries of scripts[] in use */
+
+static uint8_t unicode_scripts[0x110000]; /* per code point: index into scripts[], or (uint8_t)~(uint8_t)0 if none */
+/* Stores in unicode_scripts[] the script of each code point, as read from
+   the file named SCRIPTS_FILENAME, and collects the distinct script names
+   in scripts[0..numscripts-1].
+   Code points the file does not mention keep the value
+   (uint8_t)~(uint8_t)0.  Lines that are empty or start with '#' are
+   ignored.  A code point assigned twice is reported on stderr and the
+   later assignment wins.
+   Exits with status 1 if the file cannot be opened or read or if a line
+   does not parse.  Aborts on an inverted range, on a code point
+   >= 0x110000, or when the 256th distinct script name is registered.  */
static void
-fill_blocks (const char *blocks_filename)
+fill_scripts (const char *scripts_filename)
{
FILE *stream;
+ unsigned int i;
- stream = fopen (blocks_filename, "r");
+ stream = fopen (scripts_filename, "r");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
+ fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
exit (1);
}
+ numscripts = 0;
+
+ for (i = 0; i < 0x110000; i++)
+ unicode_scripts[i] = (uint8_t)~(uint8_t)0;
+
for (;;)
{
char buf[200+1];
unsigned int i1, i2;
char padding[200+1];
- char blockname[200+1];
+ char scriptname[200+1];
+ int script;
if (fscanf (stream, "%200[^\n]\n", buf) < 1)
- break;
+ break;
if (buf[0] == '\0' || buf[0] == '#')
- continue;
+ continue;
- if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
- {
- fprintf (stderr, "parse error in '%s'\n", blocks_filename);
- exit (1);
- }
- blocks[numblocks].start = i1;
- blocks[numblocks].end = i2;
- blocks[numblocks].name = strdup (blockname);
- /* It must be sorted. */
- if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
- abort ();
- numblocks++;
- if (numblocks == 256)
- abort ();
+ /* Accept either a range "XXXX..YYYY ; Name" or, failing that, a single
+    code point "XXXX ; Name".  The name ends at the first space.  */
+ if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
+ {
+ if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
+ {
+ fprintf (stderr, "parse error in '%s'\n", scripts_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
+ if (i2 < i1)
+ abort ();
+ if (i2 >= 0x110000)
+ abort ();
+
+ /* Look the name up among the scripts seen so far, most recent first;
+    register it as a new script if it is not found.  */
+ for (script = numscripts - 1; script >= 0; script--)
+ if (strcmp (scripts[script], scriptname) == 0)
+ break;
+ if (script < 0)
+ {
+ scripts[numscripts] = strdup (scriptname);
+ script = numscripts;
+ numscripts++;
+ if (numscripts == 256) /* keeps script indices below 255, the "no script" value */
+ abort ();
+ }
+
+ for (i = i1; i <= i2; i++)
+ {
+ if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
+ fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
+ unicode_scripts[i] = script;
+ }
}
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", blocks_filename);
+ fprintf (stderr, "error reading from '%s'\n", scripts_filename);
exit (1);
}
}
-/* Return the smallest block index among the blocks for characters >= ch. */
-static unsigned int
-block_first_index (unsigned int ch)
-{
- /* Binary search. */
- unsigned int lo = 0;
- unsigned int hi = numblocks;
- /* Invariants:
- All blocks[i], i < lo, have blocks[i].end < ch,
- all blocks[i], i >= hi, have blocks[i].end >= ch. */
- while (lo < hi)
- {
- unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
- if (blocks[mid].end < ch)
- lo = mid + 1;
- else
- hi = mid;
- }
- return hi;
-}
-
-/* Return the largest block index among the blocks for characters <= ch,
- plus 1. */
-static unsigned int
-block_last_index (unsigned int ch)
-{
- /* Binary search. */
- unsigned int lo = 0;
- unsigned int hi = numblocks;
- /* Invariants:
- All blocks[i], i < lo, have blocks[i].start <= ch,
- all blocks[i], i >= hi, have blocks[i].start > ch. */
- while (lo < hi)
- {
- unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
- if (blocks[mid].start <= ch)
- lo = mid + 1;
- else
- hi = mid;
- }
- return hi;
-}
+/* Construction of sparse 3-level tables.
+   The macros below parametrize "3level.h" for uint8_t elements with
+   (uint8_t)~(uint8_t)0 as default value.  output_scripts uses
+   struct script_table and script_table_init/_add/_finalize, which are
+   presumably produced by this inclusion (3level.h itself is not visible
+   here).  */
+#define TABLE script_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
static void
-output_blocks (const char *version)
+output_scripts (const char *version)
{
- const char *filename = "unictype/blocks.h";
- const unsigned int shift = 8; /* bits to shift away for array access */
- const unsigned int threshold = 0x30000; /* cut-off table here to save space */
+ const char *filename = "unictype/scripts.h";
FILE *stream;
- unsigned int i;
- unsigned int i1;
+ unsigned int ch, s, i;
+ struct script_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ typedef struct
+ {
+ const char *lowercase_name;
+ }
+ scriptinfo_t;
+ scriptinfo_t scriptinfo[256];
stream = fopen (filename, "w");
if (stream == NULL)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Unicode blocks. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Unicode scripts. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
- fprintf (stream, "static const uc_block_t blocks[] =\n");
- fprintf (stream, "{\n");
- for (i = 0; i < numblocks; i++)
+ for (s = 0; s < numscripts; s++)
{
- fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
- blocks[i].end, blocks[i].name);
- if (i+1 < numblocks)
- fprintf (stream, ",");
+ char *lcp = strdup (scripts[s]);
+ char *cp;
+
+ for (cp = lcp; *cp != '\0'; cp++)
+ if (*cp >= 'A' && *cp <= 'Z')
+ *cp += 'a' - 'A';
+
+ scriptinfo[s].lowercase_name = lcp;
+ }
+
+ for (s = 0; s < numscripts; s++)
+ {
+ fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, "{\n");
+ i = 0;
+ for (ch = 0; ch < 0x110000; ch++)
+ if (unicode_scripts[ch] == s)
+ {
+ unsigned int start;
+ unsigned int end;
+
+ start = ch;
+ while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
+ ch++;
+ end = ch;
+
+ if (i > 0)
+ fprintf (stream, ",\n");
+ if (start == end)
+ fprintf (stream, " { 0x%04X, 1, 1 }", start);
+ else
+ fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
+ start, end);
+ i++;
+ }
fprintf (stream, "\n");
+ fprintf (stream, "};\n");
}
- fprintf (stream, "};\n");
- fprintf (stream, "#define blocks_level1_shift %d\n", shift);
- fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
- fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
- threshold >> shift);
+
+ fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
fprintf (stream, "{\n");
- for (i1 = 0; i1 < (threshold >> shift); i1++)
+ for (s = 0; s < numscripts; s++)
{
- unsigned int first_index = block_first_index (i1 << shift);
- unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
- fprintf (stream, " %3d, %3d", first_index, last_index);
- if (i1+1 < (threshold >> shift))
- fprintf (stream, ",");
+ fprintf (stream, " {\n");
+ fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " script_%s_intervals,\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " \"%s\"\n", scripts[s]);
+ fprintf (stream, " }");
+ if (s+1 < numscripts)
+ fprintf (stream, ",");
fprintf (stream, "\n");
}
fprintf (stream, "};\n");
- fprintf (stream, "#define blocks_upper_first_index %d\n",
- block_first_index (threshold));
- fprintf (stream, "#define blocks_upper_last_index %d\n",
- block_last_index (0x10FFFF));
- if (ferror (stream) || fclose (stream))
+ t.p = 7;
+ t.q = 9;
+ script_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
{
- fprintf (stderr, "error writing to '%s'\n", filename);
- exit (1);
+ unsigned int s = unicode_scripts[ch];
+ if (s != (uint8_t)~(uint8_t)0)
+ script_table_add (&t, ch, s);
}
-}
-
-/* ========================================================================= */
-
-/* C and Java syntax. */
-enum
-{
- UC_IDENTIFIER_START, /* valid as first or subsequent character */
- UC_IDENTIFIER_VALID, /* valid as subsequent character only */
- UC_IDENTIFIER_INVALID, /* not valid */
- UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
-};
+ script_table_finalize (&t);
-/* ISO C 99 section 6.4.(3). */
-static bool
-is_c_whitespace (unsigned int ch)
-{
- return (ch == ' ' /* space */
- || ch == '\t' /* horizontal tab */
- || ch == '\n' || ch == '\r' /* new-line */
- || ch == '\v' /* vertical tab */
- || ch == '\f'); /* form-feed */
-}
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
-/* ISO C 99 section 6.4.2.1 and appendix D. */
-static int
-c_ident_category (unsigned int ch)
-{
- /* Section 6.4.2.1. */
- if (ch >= '0' && ch <= '9')
- return UC_IDENTIFIER_VALID;
- if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
- return UC_IDENTIFIER_START;
- /* Appendix D. */
- if (0
- /* Latin */
- || (ch == 0x00AA)
- || (ch == 0x00BA)
- || (ch >= 0x00C0 && ch <= 0x00D6)
- || (ch >= 0x00D8 && ch <= 0x00F6)
- || (ch >= 0x00F8 && ch <= 0x01F5)
- || (ch >= 0x01FA && ch <= 0x0217)
- || (ch >= 0x0250 && ch <= 0x02A8)
- || (ch >= 0x1E00 && ch <= 0x1E9B)
- || (ch >= 0x1EA0 && ch <= 0x1EF9)
- || (ch == 0x207F)
- /* Greek */
- || (ch == 0x0386)
- || (ch >= 0x0388 && ch <= 0x038A)
- || (ch == 0x038C)
- || (ch >= 0x038E && ch <= 0x03A1)
- || (ch >= 0x03A3 && ch <= 0x03CE)
- || (ch >= 0x03D0 && ch <= 0x03D6)
- || (ch == 0x03DA)
- || (ch == 0x03DC)
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define script_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_script =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
+ if (i+1 < t.level3_size << t.p)
+ fprintf (stream, ",");
+ }
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Writes the gperf input file "unictype/scripts_byname.gperf", which maps
+   each script name in scripts[] to its index.
+   VERSION is the Unicode version mentioned in the generated header comment.
+   Exits with status 1 if the file cannot be opened or written.  */
+static void
+output_scripts_byname (const char *version)
+{
+ const char *filename = "unictype/scripts_byname.gperf";
+ FILE *stream;
+ unsigned int s;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Unicode scripts. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+ /* NOTE(review): the 'int name' member presumably goes together with the
+    %pic declaration below (names stored as string pool offsets) - confirm
+    against the gperf manual.  */
+ fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
+ fprintf (stream, "%%struct-type\n");
+ fprintf (stream, "%%language=ANSI-C\n");
+ fprintf (stream, "%%define hash-function-name scripts_hash\n");
+ fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
+ fprintf (stream, "%%readonly-tables\n");
+ fprintf (stream, "%%global-table\n");
+ fprintf (stream, "%%define word-array-name script_names\n");
+ fprintf (stream, "%%pic\n");
+ fprintf (stream, "%%define string-pool-name script_stringpool\n");
+ fprintf (stream, "%%%%\n"); /* "%%" line: end of the declarations section */
+ /* One keyword line per script: "Name, index".  */
+ for (s = 0; s < numscripts; s++)
+ fprintf (stream, "%s, %u\n", scripts[s], s);
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* Blocks. */
+
+typedef struct { unsigned int start; unsigned int end; const char *name; }
+ block_t; /* start, end: the two code points of a "XXXX..YYYY" range; name: the block name */
+static block_t blocks[256]; /* filled by fill_blocks, in ascending order without overlap */
+static unsigned int numblocks; /* number of entries of blocks[] in use */
+
+/* Appends to blocks[] the blocks read from the file named BLOCKS_FILENAME
+   and increments numblocks accordingly.  numblocks is not reset here; it
+   starts at 0 only by virtue of its static storage.
+   Lines that are empty or start with '#' are ignored.
+   Exits with status 1 if the file cannot be opened or read or if a line
+   does not parse.  Aborts if a block does not start after the end of the
+   previous one, or when the 256th block is stored.  */
+static void
+fill_blocks (const char *blocks_filename)
+{
+ FILE *stream;
+
+ stream = fopen (blocks_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
+ exit (1);
+ }
+
+ for (;;)
+ {
+ char buf[200+1];
+ unsigned int i1, i2;
+ char padding[200+1];
+ char blockname[200+1];
+
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
+
+ /* A data line has the form "XXXX..YYYY; Block Name".  The name extends
+    up to a carriage return or the end of the line, so it may contain
+    spaces.  */
+ if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
+ {
+ fprintf (stderr, "parse error in '%s'\n", blocks_filename);
+ exit (1);
+ }
+ blocks[numblocks].start = i1;
+ blocks[numblocks].end = i2;
+ blocks[numblocks].name = strdup (blockname);
+ /* It must be sorted. */
+ if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
+ abort ();
+ numblocks++;
+ if (numblocks == 256)
+ abort ();
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", blocks_filename);
+ exit (1);
+ }
+}
+
+/* Return the smallest block index among the blocks for characters >= ch,
+   that is, the index of the first block whose end is not below CH.
+   The result is numblocks when there is no such block.  */
+static unsigned int
+block_first_index (unsigned int ch)
+{
+  /* Bisection over the index interval [lower, upper).
+     Every block before LOWER ends below CH;
+     every block from UPPER on ends at or after CH.  */
+  unsigned int lower = 0;
+  unsigned int upper = numblocks;
+
+  while (upper > lower)
+    {
+      unsigned int probe = (lower + upper) / 2; /* lower <= probe < upper */
+
+      if (blocks[probe].end >= ch)
+        upper = probe;
+      else
+        lower = probe + 1;
+    }
+  return upper;
+}
+
+/* Return the largest block index among the blocks for characters <= ch,
+   plus 1; that is, the number of leading blocks whose start is not
+   above CH.  */
+static unsigned int
+block_last_index (unsigned int ch)
+{
+  /* Bisection over the index interval [lower, upper).
+     Every block before LOWER starts at or before CH;
+     every block from UPPER on starts after CH.  */
+  unsigned int lower = 0;
+  unsigned int upper = numblocks;
+
+  while (upper > lower)
+    {
+      unsigned int probe = (lower + upper) / 2; /* lower <= probe < upper */
+
+      if (blocks[probe].start > ch)
+        upper = probe;
+      else
+        lower = probe + 1;
+    }
+  return upper;
+}
+
+/* Write "unictype/blocks.h": the blocks[] table, then blocks_level1[],
+   which holds for every group of (1 << shift) code points below THRESHOLD
+   the pair (block_first_index, block_last_index), and finally the index
+   pair that covers all code points from THRESHOLD up to 0x10FFFF.
+   VERSION only appears in the generated header comment.
+   Exits with status 1 when the file cannot be opened or written.  */
+static void
+output_blocks (const char *version)
+{
+  const char *filename = "unictype/blocks.h";
+  const unsigned int shift = 8; /* bits to shift away for array access */
+  const unsigned int threshold = 0x30000; /* cut-off table here to save space */
+  FILE *stream;
+  unsigned int i;
+  unsigned int i1;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Unicode blocks. */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  fprintf (stream, "static const uc_block_t blocks[] =\n");
+  fprintf (stream, "{\n");
+  for (i = 0; i < numblocks; i++)
+    {
+      fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
+               blocks[i].end, blocks[i].name);
+      if (i+1 < numblocks)
+        fprintf (stream, ",");
+      fprintf (stream, "\n");
+    }
+  fprintf (stream, "};\n");
+  /* All values printed below are unsigned int, hence %u rather than %d.  */
+  fprintf (stream, "#define blocks_level1_shift %u\n", shift);
+  fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
+  fprintf (stream, "static const uint8_t blocks_level1[%u * 2] =\n",
+           threshold >> shift);
+  fprintf (stream, "{\n");
+  for (i1 = 0; i1 < (threshold >> shift); i1++)
+    {
+      /* Blocks that overlap the code points i1 << shift up to
+         ((i1 + 1) << shift) - 1.  */
+      unsigned int first_index = block_first_index (i1 << shift);
+      unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
+      fprintf (stream, " %3u, %3u", first_index, last_index);
+      if (i1+1 < (threshold >> shift))
+        fprintf (stream, ",");
+      fprintf (stream, "\n");
+    }
+  fprintf (stream, "};\n");
+  fprintf (stream, "#define blocks_upper_first_index %u\n",
+           block_first_index (threshold));
+  fprintf (stream, "#define blocks_upper_last_index %u\n",
+           block_last_index (0x10FFFF));
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* C and Java syntax. */
+
+/* Identifier syntax categories, as returned by c_ident_category and
+   java_ident_category. There are four values, so each fits in the 2 bits
+   that output_ident_category uses per character when packing level3. */
+enum
+{
+ UC_IDENTIFIER_START, /* valid as first or subsequent character */
+ UC_IDENTIFIER_VALID, /* valid as subsequent character only */
+ UC_IDENTIFIER_INVALID, /* not valid */
+ UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
+};
+
+/* ISO C 99 section 6.4.(3): the white-space characters are space,
+   horizontal tab, new-line (here both '\n' and '\r'), vertical tab and
+   form-feed.  */
+static bool
+is_c_whitespace (unsigned int ch)
+{
+  switch (ch)
+    {
+    case ' ':   /* space */
+    case '\t':  /* horizontal tab */
+    case '\n':  /* new-line */
+    case '\r':  /* new-line */
+    case '\v':  /* vertical tab */
+    case '\f':  /* form-feed */
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* ISO C 99 section 6.4.2.1 and appendix D. */
+static int
+c_ident_category (unsigned int ch)
+{
+ /* Section 6.4.2.1. */
+ if (ch >= '0' && ch <= '9')
+ return UC_IDENTIFIER_VALID;
+ if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
+ return UC_IDENTIFIER_START;
+ /* Appendix D. */
+ if (0
+ /* Latin */
+ || (ch == 0x00AA)
+ || (ch == 0x00BA)
+ || (ch >= 0x00C0 && ch <= 0x00D6)
+ || (ch >= 0x00D8 && ch <= 0x00F6)
+ || (ch >= 0x00F8 && ch <= 0x01F5)
+ || (ch >= 0x01FA && ch <= 0x0217)
+ || (ch >= 0x0250 && ch <= 0x02A8)
+ || (ch >= 0x1E00 && ch <= 0x1E9B)
+ || (ch >= 0x1EA0 && ch <= 0x1EF9)
+ || (ch == 0x207F)
+ /* Greek */
+ || (ch == 0x0386)
+ || (ch >= 0x0388 && ch <= 0x038A)
+ || (ch == 0x038C)
+ || (ch >= 0x038E && ch <= 0x03A1)
+ || (ch >= 0x03A3 && ch <= 0x03CE)
+ || (ch >= 0x03D0 && ch <= 0x03D6)
+ || (ch == 0x03DA)
+ || (ch == 0x03DC)
|| (ch == 0x03DE)
|| (ch == 0x03E0)
|| (ch >= 0x03E2 && ch <= 0x03F3)
return UC_IDENTIFIER_INVALID;
}
-/* The Java Language Specification, 3rd edition, §3.6.
- http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */
+/* The Java Language Specification, 3rd edition, §3.6.
+   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
+   White space is space, horizontal tab, form feed and the line terminators
+   '\n' and '\r'; unlike in C, vertical tab is not included.  */
+static bool
+is_java_whitespace (unsigned int ch)
+{
+  switch (ch)
+    {
+    case ' ':
+    case '\t':
+    case '\f':
+    case '\n':
+    case '\r':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* The Java Language Specification, 3rd edition, §3.8.
+   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
+   and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart.
+   Classify CH as one of the UC_IDENTIFIER_* values; the first matching
+   rule wins, in the order START, VALID, IGNORABLE.  */
+static int
+java_ident_category (unsigned int ch)
+{
+  int result;
+
+  /* FIXME: Check this against Sun's JDK implementation. */
+  if (is_category_L (ch)        /* = Character.isLetter(ch) */
+      || is_category_Nl (ch)    /* = Character.getType(ch)==LETTER_NUMBER */
+      || is_category_Sc (ch)    /* currency symbol */
+      || is_category_Pc (ch))   /* connector punctuation */
+    result = UC_IDENTIFIER_START;
+  else if (is_category_Nd (ch)      /* digit */
+           || is_category_Mc (ch)   /* combining mark */
+           || is_category_Mn (ch))  /* non-spacing mark */
+    result = UC_IDENTIFIER_VALID;
+  else if (ch <= 0x0008         /* CH is unsigned, so this is 0x0000..0x0008 */
+           || (ch >= 0x000E && ch <= 0x001B)
+           || (ch >= 0x007F && ch <= 0x009F)
+           || is_category_Cf (ch))  /* = Character.getType(ch)==FORMAT */
+    result = UC_IDENTIFIER_IGNORABLE;
+  else
+    result = UC_IDENTIFIER_INVALID;
+
+  return result;
+}
+
+/* Construction of sparse 3-level tables. */
+/* Parameters for "3level.h": TABLE gives the prefix of the generated names
+   (struct identsyntax_table, identsyntax_table_init/_add/_finalize, as used
+   by output_ident_category), ELEMENT is the per-character value type, and
+   DEFAULT is presumably the value of characters that were never added.
+   xmalloc and xrealloc are mapped to plain malloc and realloc, i.e. without
+   any checking wrapper. */
+#define TABLE identsyntax_table
+#define ELEMENT uint8_t
+#define DEFAULT UC_IDENTIFIER_INVALID
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+/* Output an identifier syntax categorization in a three-level bitmap.
+   FILENAME is the file to write, PREDICATE maps a code point to one of the
+   UC_IDENTIFIER_* values, NAME is the C identifier of the generated table
+   and VERSION only appears in the generated header comment.
+   Exits with status 1 when the file cannot be opened or written. */
+static void
+output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
+{
+ FILE *stream;
+ unsigned int ch, i;
+ struct identsyntax_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ t.p = 7; /* or 8 */
+ t.q = 5; /* or 4 */
+ identsyntax_table_init (&t);
+
+ /* Only characters whose category differs from the table default are added. */
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ int syntaxcode = predicate (ch);
+ if (syntaxcode != UC_IDENTIFIER_INVALID)
+ identsyntax_table_add (&t, ch, syntaxcode);
+ }
+
+ identsyntax_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process. */
+ /* t.result is read here as: 5 uint32_t header words, then level1
+ (level1_size words), then level2 (level2_size << q words), then the
+ level3 bytes. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ /* The five header words become preprocessor constants. */
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define identsyntax_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
+ (1 << t.p) * 2 / 16);
+ fprintf (stream, " }\n");
+ fprintf (stream, "%s =\n", name);
+ fprintf (stream, "{\n");
+ /* level1: byte offsets into t.result are converted to element indices into
+ level2; an offset of 0 is written as -1. */
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ /* level2: same conversion, yielding indices into the unpacked level3
+ entries. */
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ /* Pack the level3 array. Each entry needs 2 bits only. */
+ /* Eight consecutive entries form one 16-bit word; entry k of a group
+ occupies bits 2k and 2k+1. */
+ fprintf (stream, " {");
+ if ((t.level3_size << t.p) * 2 / 16 > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%04x",
+ (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
+ | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
+ if (i+1 < (t.level3_size << t.p) * 2 / 16)
+ fprintf (stream, ",");
+ }
+ if ((t.level3_size << t.p) * 2 / 16 > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Write the C and Java syntax property files: for each whitespace predicate
+   a debug listing, a test file and a table header, followed by the two
+   identifier category tables. VERSION is passed on to the generators. */
+static void
+output_ident_properties (const char *version)
+{
+/* PROPERTY(P) emits the three outputs for the predicate is_P. */
+#define PROPERTY(P) \
+ debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
+ output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
+ output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
+ PROPERTY(c_whitespace)
+ PROPERTY(java_whitespace)
+#undef PROPERTY
+
+ output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
+ output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
+}
+
+/* ========================================================================= */
+
+/* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
+ glibc/localedata/locales/i18n file, generated by
+ glibc/localedata/gen-unicode-ctype.c. */
+
+/* Character mappings. */
+
+/* Return the uppercase mapping recorded for CH, or CH itself when the
+   character has no entry (its name is NULL) or no such mapping.  */
+static unsigned int
+to_upper (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].upper == NONE)
+    return ch;
+  return unicode_attributes[ch].upper;
+}
+
+/* Return the lowercase mapping recorded for CH, or CH itself when the
+   character has no entry (its name is NULL) or no such mapping.  */
+static unsigned int
+to_lower (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].lower == NONE)
+    return ch;
+  return unicode_attributes[ch].lower;
+}
+
+/* Return the titlecase mapping recorded for CH, or CH itself when the
+   character has no entry (its name is NULL) or no such mapping.  */
+static unsigned int
+to_title (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return ch;
+  if (unicode_attributes[ch].title == NONE)
+    return ch;
+  return unicode_attributes[ch].title;
+}
+
+/* Character class properties. */
+
+/* A character counts as uppercase when to_lower changes it.  */
+static bool
+is_upper (unsigned int ch)
+{
+  unsigned int lowered = to_lower (ch);
+
+  return lowered != ch;
+}
+
+/* A character counts as lowercase when to_upper changes it.  */
+static bool
+is_lower (unsigned int ch)
+{
+  /* <U00DF> is lowercase, but without simple to_upper mapping. */
+  if (ch == 0x00DF)
+    return true;
+
+  return to_upper (ch) != ch;
+}
+
+/* Alphabetic class: letters plus a number of hand-picked additions.
+   Characters without an entry (NULL name) are never alphabetic.  */
+static bool
+is_alpha (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+  const char *category;
+
+  if (name == NULL)
+    return false;
+  category = unicode_attributes[ch].category;
+
+  /* Letters.  Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
+     <U0E2F>, <U0E46> should belong to is_punct. */
+  if (category[0] == 'L' && ch != 0x0E2F && ch != 0x0E46)
+    return true;
+
+  /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
+     <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
+  if (ch == 0x0E31
+      || (ch >= 0x0E34 && ch <= 0x0E3A)
+      || (ch >= 0x0E47 && ch <= 0x0E4E))
+    return true;
+
+  /* Avoid warning for <U0345>. */
+  if (ch == 0x0345)
+    return true;
+
+  /* Avoid warnings for <U2160>..<U217F>. */
+  if (category[0] == 'N' && category[1] == 'l')
+    return true;
+
+  /* Avoid warnings for <U24B6>..<U24E9>. */
+  if (category[0] == 'S' && category[1] == 'o'
+      && strstr (name, " LETTER ") != NULL)
+    return true;
+
+  /* Consider all the non-ASCII digits as alphabetic.
+     ISO C 99 forbids us to have them in category "digit",
+     but we want iswalnum to return true on them. */
+  if (category[0] == 'N' && category[1] == 'd'
+      && !(ch >= 0x0030 && ch <= 0x0039))
+    return true;
+
+  return false;
+}
+
+/* Digit class: only the ten ASCII decimal digits.  */
+static bool
+is_digit (unsigned int ch)
+{
+#if 0
+  return (unicode_attributes[ch].name != NULL
+          && unicode_attributes[ch].category[0] == 'N'
+          && unicode_attributes[ch].category[1] == 'd');
+  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
+     a zero.  Must add <0> in front of them by hand.  */
+#else
+  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+     takes it away:
+     7.25.2.1.5:
+        The iswdigit function tests for any wide character that corresponds
+        to a decimal-digit character (as defined in 5.2.1).
+     5.2.1:
+        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+   */
+  if (ch < 0x0030)
+    return false;
+  return ch <= 0x0039;
+#endif
+}
+
+/* Outdigit class: the ten ASCII decimal digits.  */
+static bool
+is_outdigit (unsigned int ch)
+{
+  if (ch < 0x0030)
+    return false;
+  return ch <= 0x0039;
+}
+
+/* Alphanumeric class: the union of is_alpha and is_digit.  */
+static bool
+is_alnum (unsigned int ch)
+{
+  if (is_alpha (ch))
+    return true;
+  return is_digit (ch);
+}
+
+/* Blank class: horizontal tab, plus the characters of category Zs whose
+   decomposition does not mention "<noBreak>".  */
+static bool
+is_blank (unsigned int ch)
+{
+  if (ch == 0x0009) /* '\t' */
+    return true;
+
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+
+  /* Category Zs without mention of "<noBreak>" */
+  if (unicode_attributes[ch].category[0] != 'Z'
+      || unicode_attributes[ch].category[1] != 's')
+    return false;
+  return strstr (unicode_attributes[ch].decomposition, "<noBreak>") == NULL;
+}
+
+/* Space class: the six ASCII white-space characters, the categories Zl and
+   Zp, and the characters of category Zs whose decomposition does not
+   mention "<noBreak>".  */
+static bool
+is_space (unsigned int ch)
+{
+  /* Don't make U+00A0 a space. Non-breaking space means that all programs
+     should treat it like a punctuation character, not like a space. */
+  switch (ch)
+    {
+    case 0x0020: /* ' ' */
+    case 0x000C: /* '\f' */
+    case 0x000A: /* '\n' */
+    case 0x000D: /* '\r' */
+    case 0x0009: /* '\t' */
+    case 0x000B: /* '\v' */
+      return true;
+    default:
+      break;
+    }
+
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+  if (unicode_attributes[ch].category[0] != 'Z')
+    return false;
+
+  /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
+  switch (unicode_attributes[ch].category[1])
+    {
+    case 'l':
+    case 'p':
+      return true;
+    case 's':
+      return strstr (unicode_attributes[ch].decomposition,
+                     "<noBreak>") == NULL;
+    default:
+      return false;
+    }
+}
+
+/* Control class: the characters named "<control>", plus the categories
+   Zl and Zp.  */
+static bool
+is_cntrl (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+
+  if (strcmp (unicode_attributes[ch].name, "<control>") == 0)
+    return true;
+
+  /* Categories Zl and Zp */
+  if (unicode_attributes[ch].category[0] != 'Z')
+    return false;
+  return (unicode_attributes[ch].category[1] == 'l'
+          || unicode_attributes[ch].category[1] == 'p');
+}
+
+/* Hexadecimal digit class: 0-9, A-F and a-f only.  */
+static bool
+is_xdigit (unsigned int ch)
+{
+#if 0
+  return is_digit (ch)
+         || (ch >= 0x0041 && ch <= 0x0046)
+         || (ch >= 0x0061 && ch <= 0x0066);
+#else
+  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+     takes it away:
+     7.25.2.1.12:
+        The iswxdigit function tests for any wide character that corresponds
+        to a hexadecimal-digit character (as defined in 6.4.4.1).
+     6.4.4.1:
+        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+   */
+  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
+    return true;
+  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
+    return true;
+  if (ch >= 0x0061 && ch <= 0x0066) /* a..f */
+    return true;
+  return false;
+#endif
+}
+
+/* Graphic class: every character with an entry that is neither named
+   "<control>" nor in the space class.  */
+static bool
+is_graph (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+  if (strcmp (unicode_attributes[ch].name, "<control>") == 0)
+    return false;
+  return !is_space (ch);
+}
+
+/* Printable class: every character with an entry that is neither named
+   "<control>" nor in category Zl or Zp.  */
+static bool
+is_print (unsigned int ch)
+{
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+  if (strcmp (unicode_attributes[ch].name, "<control>") == 0)
+    return false;
+
+  /* Categories Zl and Zp */
+  if (unicode_attributes[ch].category[0] == 'Z'
+      && (unicode_attributes[ch].category[1] == 'l'
+          || unicode_attributes[ch].category[1] == 'p'))
+    return false;
+
+  return true;
+}
+
+/* Punctuation class, in the traditional POSIX sense.  */
+static bool
+is_punct (unsigned int ch)
+{
+#if 0
+  return (unicode_attributes[ch].name != NULL
+          && unicode_attributes[ch].category[0] == 'P');
+#else
+  /* The traditional POSIX definition of punctuation is every graphic,
+     non-alphanumeric character. */
+  if (!is_graph (ch))
+    return false;
+  if (is_alpha (ch))
+    return false;
+  return !is_digit (ch);
+#endif
+}
+
+/* Output all properties. */
+/* For each of the twelve <ctype.h>-like predicates, write a debug listing, a
+   test file and a table header. VERSION is passed on to output_predicate. */
+static void
+output_old_ctype (const char *version)
+{
+/* PROPERTY(P) emits the three outputs for the predicate is_P. */
+#define PROPERTY(P) \
+ debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
+ output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
+ output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
+ PROPERTY(alnum)
+ PROPERTY(alpha)
+ PROPERTY(cntrl)
+ PROPERTY(digit)
+ PROPERTY(graph)
+ PROPERTY(lower)
+ PROPERTY(print)
+ PROPERTY(punct)
+ PROPERTY(space)
+ PROPERTY(upper)
+ PROPERTY(xdigit)
+ PROPERTY(blank)
+#undef PROPERTY
+}
+
+#if 0
+
+/* Combining class: the general categories Mn, Mc and Me.  */
+static bool
+is_combining (unsigned int ch)
+{
+  /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+     file. In 3.0.1 it was identical to the union of the general categories
+     "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
+     PropList.txt file, so we take the latter definition. */
+  if (unicode_attributes[ch].name == NULL)
+    return false;
+  if (unicode_attributes[ch].category[0] != 'M')
+    return false;
+
+  switch (unicode_attributes[ch].category[1])
+    {
+    case 'n':
+    case 'c':
+    case 'e':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Combining characters, except those whose combining field is non-empty,
+   does not start with '0', and has a decimal value of 200 or more.  */
+static bool
+is_combining_level3 (unsigned int ch)
+{
+  if (!is_combining (ch))
+    return false;
+
+  /* An empty field, or one starting with '0', is always accepted.  */
+  if (unicode_attributes[ch].combining[0] == '\0'
+      || unicode_attributes[ch].combining[0] == '0')
+    return true;
+
+  return strtoul (unicode_attributes[ch].combining, NULL, 10) < 200;
+}
+
+/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
+   hexadecimal digits below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
+   The result lives in a static buffer that the next call overwrites.  */
+static const char *
+ucs_symbol (unsigned int i)
+{
+  static char buf[11+1];
+
+  if (i < 0x10000)
+    sprintf (buf, "<U%04X>", i);
+  else
+    sprintf (buf, "<U%08X>", i);
+  return buf;
+}
+
+/* Return the UCS symbol range string for a Unicode characters interval:
+   the symbol of LOW, "..", and the symbol of HIGH.
+   The result lives in a static buffer that the next call overwrites.  */
+static const char *
+ucs_symbol_range (unsigned int low, unsigned int high)
+{
+  static char buf[24+1];
+  char *tail = buf;
+
+  /* ucs_symbol reuses one static buffer, so the first symbol has to be
+     copied before the second one is produced.  */
+  tail += sprintf (tail, "%s", ucs_symbol (low));
+  tail += sprintf (tail, "%s", "..");
+  sprintf (tail, "%s", ucs_symbol (high));
+  return buf;
+}
+
+/* Output a character class (= property) table. */
+
+/* Write CLASSNAME to STREAM, followed by the ';'-separated list of the
+   characters and character ranges for which FUNC returns true.  A "/"
+   continuation line is started whenever the next item would go past
+   column 75 (which is always the case for the first item).
+   Not reentrant: the scratch table has static storage.  */
+static void
+output_charclass (FILE *stream, const char *classname,
+                  bool (*func) (unsigned int))
+{
+  /* Static rather than automatic: more than 1 MiB of automatic storage
+     can overflow small stacks.  Every element is rewritten below.  */
+  static char table[0x110000];
+  unsigned int i;
+  bool need_semicolon;
+  const size_t max_column = 75;
+  size_t column;
+
+  for (i = 0; i < 0x110000; i++)
+    table[i] = (int) func (i);
+
+  fprintf (stream, "%s ", classname);
+  need_semicolon = false;
+  column = 1000; /* forces a line break before the first item */
+  for (i = 0; i < 0x110000; )
+    {
+      if (!table[i])
+        i++;
+      else
+        {
+          unsigned int low, high;
+          char buf[25];
+
+          /* Find the maximal run LOW..HIGH of members starting at I.  */
+          low = i;
+          do
+            i++;
+          while (i < 0x110000 && table[i]);
+          high = i - 1;
+
+          if (low == high)
+            strcpy (buf, ucs_symbol (low));
+          else
+            strcpy (buf, ucs_symbol_range (low, high));
+
+          if (need_semicolon)
+            {
+              fprintf (stream, ";");
+              column++;
+            }
+
+          if (column + strlen (buf) > max_column)
+            {
+              fprintf (stream, "/\n ");
+              column = 3;
+            }
+
+          fprintf (stream, "%s", buf);
+          column += strlen (buf);
+          need_semicolon = true;
+        }
+    }
+  fprintf (stream, "\n");
+}
+
+/* Output a character mapping table. */
+
+/* Write MAPNAME to STREAM, followed by the ';'-separated list of
+   "(<Uxxxx>,<Uyyyy>)" pairs for every character that FUNC maps to a
+   different character.  A "/" continuation line is started whenever the
+   next item would go past column 75 (which is always the case for the
+   first item).
+   Not reentrant: the scratch table has static storage.  */
+static void
+output_charmap (FILE *stream, const char *mapname,
+                unsigned int (*func) (unsigned int))
+{
+  /* Static rather than automatic: more than 1 MiB of automatic storage
+     can overflow small stacks.  Every element is rewritten below.  */
+  static char table[0x110000];
+  unsigned int i;
+  bool need_semicolon;
+  const size_t max_column = 75;
+  size_t column;
+
+  for (i = 0; i < 0x110000; i++)
+    table[i] = (func (i) != i);
+
+  fprintf (stream, "%s ", mapname);
+  need_semicolon = false;
+  column = 1000; /* forces a line break before the first item */
+  for (i = 0; i < 0x110000; i++)
+    if (table[i])
+      {
+        char buf[25+1];
+
+        /* ucs_symbol reuses one static buffer, so each symbol is appended
+           to BUF before the next one is produced.  */
+        strcpy (buf, "(");
+        strcat (buf, ucs_symbol (i));
+        strcat (buf, ",");
+        strcat (buf, ucs_symbol (func (i)));
+        strcat (buf, ")");
+
+        if (need_semicolon)
+          {
+            fprintf (stream, ";");
+            column++;
+          }
+
+        if (column + strlen (buf) > max_column)
+          {
+            fprintf (stream, "/\n ");
+            column = 3;
+          }
+
+        fprintf (stream, "%s", buf);
+        column += strlen (buf);
+        need_semicolon = true;
+      }
+  fprintf (stream, "\n");
+}
+
+/* Output the width table. */
+
+/* Currently a placeholder: the body is empty, so nothing is written to
+   STREAM. */
+static void
+output_widthmap (FILE *stream)
+{
+}
+
+/* Output the tables to the given file. */
+
+/* Writes a locale definition source to FILENAME: an LC_IDENTIFICATION
+   section mentioning VERSION and today's date, then an LC_CTYPE section with
+   the character classes and mappings defined above. Before writing LC_CTYPE,
+   every code point is checked against the consistency rules quoted below;
+   violations are reported on stderr but do not stop the output.
+   Exits with status 1 when the file cannot be opened or written. */
+static void
+output_tables (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "escape_char /\n");
+ fprintf (stream, "comment_char %%\n");
+ fprintf (stream, "\n");
+ fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
+ version);
+ fprintf (stream, "\n");
+
+ fprintf (stream, "LC_IDENTIFICATION\n");
+ fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
+ fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
+ fprintf (stream, "address \"\"\n");
+ fprintf (stream, "contact \"\"\n");
+ fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
+ fprintf (stream, "tel \"\"\n");
+ fprintf (stream, "fax \"\"\n");
+ fprintf (stream, "language \"\"\n");
+ fprintf (stream, "territory \"Earth\"\n");
+ fprintf (stream, "revision \"%s\"\n", version);
+ {
+ /* The date is the current UTC date as YYYY-MM-DD (10 characters + NUL). */
+ time_t now;
+ char date[11];
+ now = time (NULL);
+ strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
+ fprintf (stream, "date \"%s\"\n", date);
+ }
+ fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
+ fprintf (stream, "END LC_IDENTIFICATION\n");
+ fprintf (stream, "\n");
+
+ /* Verification. */
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ /* toupper restriction: "Only characters specified for the keywords
+ lower and upper shall be specified." */
+ if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
+ fprintf (stderr,
+ "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
+ ucs_symbol (ch), ch, to_upper (ch));
+
+ /* tolower restriction: "Only characters specified for the keywords
+ lower and upper shall be specified." */
+ if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
+ fprintf (stderr,
+ "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
+ ucs_symbol (ch), ch, to_lower (ch));
+
+ /* alpha restriction: "Characters classified as either upper or lower
+ shall automatically belong to this class." */
+ if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
+ fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
+
+ /* alpha restriction: "No character specified for the keywords cntrl,
+ digit, punct or space shall be specified." */
+ if (is_alpha (ch) && is_cntrl (ch))
+ fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_digit (ch))
+ fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_punct (ch))
+ fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
+ if (is_alpha (ch) && is_space (ch))
+ fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
+
+ /* space restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, graph or xdigit shall be specified."
+ upper, lower, alpha already checked above. */
+ if (is_space (ch) && is_digit (ch))
+ fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
+ if (is_space (ch) && is_graph (ch))
+ fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
+ if (is_space (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
+
+ /* cntrl restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, punct, graph, print or xdigit shall be
+ specified." upper, lower, alpha already checked above. */
+ if (is_cntrl (ch) && is_digit (ch))
+ fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_punct (ch))
+ fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_graph (ch))
+ fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_print (ch))
+ fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
+ if (is_cntrl (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
+
+ /* punct restriction: "No character specified for the keywords upper,
+ lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+ be specified." upper, lower, alpha, cntrl already checked above. */
+ if (is_punct (ch) && is_digit (ch))
+ fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
+ if (is_punct (ch) && is_xdigit (ch))
+ fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
+ if (is_punct (ch) && (ch == 0x0020))
+ fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
+
+ /* graph restriction: "No character specified for the keyword cntrl
+ shall be specified." Already checked above. */
+
+ /* print restriction: "No character specified for the keyword cntrl
+ shall be specified." Already checked above. */
+
+ /* graph - print relation: differ only in the <space> character.
+ How is this possible if there are more than one space character?!
+ I think susv2/xbd/locale.html should speak of "space characters",
+ not "space character". */
+ if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
+ fprintf (stderr,
+ "%s is print but not graph|<space>\n", ucs_symbol (ch));
+ if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
+ fprintf (stderr,
+ "%s is graph|<space> but not print\n", ucs_symbol (ch));
+ }
+
+ /* The LC_CTYPE section proper: classes first, then mappings. */
+ fprintf (stream, "LC_CTYPE\n");
+ output_charclass (stream, "upper", is_upper);
+ output_charclass (stream, "lower", is_lower);
+ output_charclass (stream, "alpha", is_alpha);
+ output_charclass (stream, "digit", is_digit);
+ output_charclass (stream, "outdigit", is_outdigit);
+ output_charclass (stream, "blank", is_blank);
+ output_charclass (stream, "space", is_space);
+ output_charclass (stream, "cntrl", is_cntrl);
+ output_charclass (stream, "punct", is_punct);
+ output_charclass (stream, "xdigit", is_xdigit);
+ output_charclass (stream, "graph", is_graph);
+ output_charclass (stream, "print", is_print);
+ output_charclass (stream, "class \"combining\";", is_combining);
+ output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
+ output_charmap (stream, "toupper", to_upper);
+ output_charmap (stream, "tolower", to_lower);
+ output_charmap (stream, "map \"totitle\";", to_title);
+ output_widthmap (stream);
+ fprintf (stream, "END LC_CTYPE\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+#endif
+
+/* ========================================================================= */
+
+/* The width property from the EastAsianWidth.txt file, indexed by code
+   point.
+   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
+const char * unicode_width[0x110000];
+
+/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
+   file.
+   Every character that has an entry in unicode_attributes[] starts out as
+   "N"; each data line "CODE;WIDTH ..." or "FIRST..LAST;WIDTH ..." of
+   WIDTH_FILENAME then overrides the characters it names.  Lines starting
+   with '#' are skipped.
+   Exits with status 1 on I/O errors, short lines, code points outside
+   0..0x10FFFF, reversed ranges, or memory exhaustion.  */
+static void
+fill_width (const char *width_filename)
+{
+  unsigned int i, j;
+  FILE *stream;
+  char field0[FIELDLEN];
+  char field1[FIELDLEN];
+  char field2[FIELDLEN];
+  int lineno = 0;
+
+  for (i = 0; i < 0x110000; i++)
+    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
+
+  stream = fopen (width_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
+      exit (1);
+    }
+
+  for (;;)
+    {
+      int n;
+      int c;
+      const char *range;
+      char *width;
+
+      lineno++;
+      c = getc (stream);
+      if (c == EOF)
+        break;
+      if (c == '#')
+        {
+          do c = getc (stream); while (c != EOF && c != '\n');
+          continue;
+        }
+      ungetc (c, stream);
+      n = getfield (stream, field0, ';');
+      n += getfield (stream, field1, ' ');
+      n += getfield (stream, field2, '\n');
+      if (n == 0)
+        break;
+      if (n != 3)
+        {
+          fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
+          exit (1);
+        }
+
+      /* A single character is treated as the range I..I.  */
+      i = strtoul (field0, NULL, 16);
+      range = strstr (field0, "..");
+      j = (range != NULL ? strtoul (range + 2, NULL, 16) : i);
+
+      /* The code points index unicode_width[], so they must be in range.  */
+      if (i > j || j >= 0x110000)
+        {
+          fprintf (stderr, "invalid code point range in '%s':%d\n",
+                   width_filename, lineno);
+          exit (1);
+        }
+
+      /* One copy of the width string serves the whole range.  */
+      width = strdup (field1);
+      if (width == NULL)
+        {
+          fprintf (stderr, "out of memory while reading '%s'\n",
+                   width_filename);
+          exit (1);
+        }
+      for (; i <= j; i++)
+        unicode_width[i] = width;
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", width_filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Non-spacing attribute and width. */
+
+/* The non-spacing attribute table consists of:
+ - Non-spacing characters; generated from PropList.txt or
+ "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
+ - Format control characters; generated from
+ "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+ - Zero width characters; generated from
+ "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+ */
+
+/* Returns true if ch has a name in unicode_attributes[] and additionally
+   satisfies at least one of:
+     - get_bidi_category (ch) is UC_BIDI_NSM,
+     - is_category_Cc (ch) or is_category_Cf (ch) holds,
+     - its name starts with "ZERO WIDTH " (11 characters, including the
+       trailing space).
+   Characters without a name always yield false.  */
static bool
-is_java_whitespace (unsigned int ch)
+is_nonspacing (unsigned int ch)
{
- return (ch == ' ' || ch == '\t' || ch == '\f'
- || ch == '\n' || ch == '\r');
+ return (unicode_attributes[ch].name != NULL
+ && (get_bidi_category (ch) == UC_BIDI_NSM
+ || is_category_Cc (ch) || is_category_Cf (ch)
+ || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
}
-/* The Java Language Specification, 3rd edition, §3.8.
- http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
- and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
-static int
-java_ident_category (unsigned int ch)
+/* Writes to FILENAME a two-level lookup table for is_nonspacing():
+     - nonspacing_table_data[]: for every block of 0x200 characters that
+       contains at least one non-spacing character, 64 bytes with one bit
+       per character (bit l of a byte stands for the l-th character of its
+       group of 8);
+     - nonspacing_table_ind[]: for every block, the index of its 64-byte
+       slice in nonspacing_table_data[], or -1 if the block has no slice.
+   The block starting at 0xe0000 always gets index -1.
+   nonspacing_table_ind[] covers all blocks up to and including the last
+   one that has a slice, padded to a multiple of 8 entries.
+   Exits with status 1 if FILENAME cannot be opened or written.  */
+static void
+output_nonspacing_property (const char *filename)
{
- /* FIXME: Check this against Sun's JDK implementation. */
- if (is_category_L (ch) /* = Character.isLetter(ch) */
- || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
- || is_category_Sc (ch) /* currency symbol */
- || is_category_Pc (ch) /* connector punctuation */
- )
- return UC_IDENTIFIER_START;
- if (is_category_Nd (ch) /* digit */
- || is_category_Mc (ch) /* combining mark */
- || is_category_Mn (ch) /* non-spacing mark */
- )
- return UC_IDENTIFIER_VALID;
- if ((ch >= 0x0000 && ch <= 0x0008)
- || (ch >= 0x000E && ch <= 0x001B)
- || (ch >= 0x007F && ch <= 0x009F)
- || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
- )
- return UC_IDENTIFIER_IGNORABLE;
- return UC_IDENTIFIER_INVALID;
+  FILE *stream;
+  int ind[0x110000 / 0x200];
+  unsigned int i;
+  unsigned int i_max;
+  int next_ind;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  /* First pass: give consecutive indices to the blocks that need a slice.  */
+  next_ind = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = false;
+      unsigned int ch;
+
+      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
+        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
+          if (is_nonspacing (ch))
+            {
+              nontrivial = true;
+              break;
+            }
+      if (nontrivial)
+        ind[i] = next_ind++;
+      else
+        ind[i] = -1;
+    }
+
+  /* Second pass: emit the bit slices, 8 rows of 8 bytes per block.  */
+  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
+           next_ind);
+  i_max = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = (ind[i] >= 0);
+
+      if (nontrivial)
+        {
+          unsigned int j;
+
+          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
+          for (j = 0; j < 8; j++)
+            {
+              unsigned int k;
+
+              fprintf (stream, " ");
+              for (k = 0; k < 8; k++)
+                {
+                  unsigned int l;
+                  unsigned char bits = 0;
+
+                  for (l = 0; l < 8; l++)
+                    {
+                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
+
+                      if (is_nonspacing (ch))
+                        bits |= 1 << l;
+                    }
+                  /* No comma after the very last byte of the table.  */
+                  fprintf (stream, " 0x%02x%c", bits,
+                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
+                }
+              fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
+            }
+          /* Remember the last block that has a slice.  */
+          i_max = i;
+        }
+    }
+  fprintf (stream, "};\n");
+
+  /* i_max is the index of the last block with a slice, so the index table
+     needs i_max + 1 entries (blocks 0..i_max).  Round that count up to a
+     multiple of 8, the number of entries printed per line.  Rounding i_max
+     itself would drop the last block whenever i_max is a multiple of 8.  */
+  i_max = ((i_max + 1 + 8 - 1) / 8) * 8;
+  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
+           i_max);
+  {
+    unsigned int j;
+
+    for (j = 0; j < i_max / 8; j++)
+      {
+        unsigned int k;
+
+        fprintf (stream, " ");
+        for (k = 0; k < 8; k++)
+          {
+            i = j * 8 + k;
+            /* No comma after the very last entry of the table.  */
+            fprintf (stream, " %2d%c", ind[i],
+                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
+          }
+        fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
+      }
+  }
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Classifies the width of ch; the result is one of 0, '0', '1', '2', 'A'.
+   For code points that is_property_unassigned_code_value() reports:
+     'A' in the Private Use range 0xE000..0xF8FF,
+     '2' in the CJK blocks and ideographic planes listed below,
+     0   everywhere else.
+   For all other code points:
+     0   for Cc characters below 0x00A0,
+     '0' for non-spacing characters,
+     '2' if unicode_width[] says "W" or "F",
+     '1' if unicode_width[] says "H",
+     'A' for the remaining characters in 0x00A1..0xFFFF,
+     '1' for everything that is left.  */
+static char
+symbolic_width (unsigned int ch)
+{
+  const char *eaw;
+
+  if (is_property_unassigned_code_value (ch))
+    {
+      /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
+      bool private_use = (ch >= 0xE000 && ch <= 0xF8FF);
+      bool wide_by_default =
+        ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
+         || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
+         || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
+         || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
+         || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */);
+
+      if (private_use)
+        return 'A';
+      if (wide_by_default)
+        return '2';
+      return 0;
+    }
+
+  /* Control characters come first, then non-spacing characters. */
+  if (is_category_Cc (ch) && ch < 0x00A0)
+    return 0;
+  if (is_nonspacing (ch))
+    return '0';
+
+  /* Double-width ("W", "F") and half-width ("H") characters. */
+  eaw = unicode_width[ch];
+  if (eaw != NULL)
+    {
+      if (strcmp (eaw, "W") == 0 || strcmp (eaw, "F") == 0)
+        return '2';
+      if (strcmp (eaw, "H") == 0)
+        return '1';
+    }
+
+  /* In ancient CJK encodings, Cyrillic and most other characters are
+     double-width as well. */
+  return (ch >= 0x00A1 && ch < 0x10000 ? 'A' : '1');
+}
+
+/* Writes one line describing the code points FIRST..LAST, all of which have
+   the symbolic width WIDTH.  A single code point is printed without the
+   ".." range notation.  */
+static void
+write_width_interval (FILE *stream, unsigned int first, unsigned int last,
+                      char width)
+{
+  if (first == last)
+    fprintf (stream, "%04X\t\t%c\n", first, width);
+  else
+    fprintf (stream, "%04X..%04X\t%c\n", first, last, width);
+}
+
+/* Writes to FILENAME the symbolic width of all code points, compressed into
+   runs of equal width, one run per line.  Code points for which
+   symbolic_width() returns 0 are skipped; they neither start nor end a run,
+   so a run may extend across them.
+   Exits with status 1 if FILENAME cannot be opened or written.  */
+static void
+output_width_property_test (const char *filename)
+{
+  FILE *stream;
+  unsigned int run_first = 0;
+  unsigned int run_last = 0;
+  char run_width = 0; /* 0 as long as no run has been started */
+  unsigned int ch;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      char width = symbolic_width (ch);
+
+      /* Skip Cc control characters and unassigned characters. */
+      if (width == 0)
+        continue;
+
+      /* Same width as the current run: just extend it. */
+      if (width == run_width)
+        {
+          run_last = ch;
+          continue;
+        }
+
+      /* The width changed: flush the previous run, if any, and start a new
+         one at ch. */
+      if (run_width != 0)
+        write_width_interval (stream, run_first, run_last, run_width);
+      run_first = run_last = ch;
+      run_width = width;
+    }
+
+  /* Flush the run that was still open at the end. */
+  if (run_width != 0)
+    write_width_interval (stream, run_first, run_last, run_width);
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Line breaking classification.
+ Updated for Unicode TR #14 revision 26. */
+
+/* Line breaking classes, encoded as small integers.  get_lbp() uses each
+   value as a bit position in the int64_t mask it returns
+   ((int64_t) 1 << LBP_xx), so every value must stay below 64.
+   The enumerators are listed class by class, not in numeric order: the
+   values 0..24 and the values >= 25 are interleaved, and the classes CR, LF,
+   NL and SG appear only as comments and have no enumerator.  */
+enum
+{
+ /* Values >= 25 are resolved at run time. */
+ LBP_BK = 25, /* mandatory break */
+/*LBP_CR, carriage return - not used here because it's a DOSism */
+/*LBP_LF, line feed - not used here because it's a DOSism */
+ LBP_CM = 26, /* attached characters and combining marks */
+/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
+/*LBP_SG, surrogates - not used here because they are not characters */
+ LBP_WJ = 0, /* word joiner */
+ LBP_ZW = 27, /* zero width space */
+ LBP_GL = 1, /* non-breaking (glue) */
+ LBP_SP = 28, /* space */
+ LBP_B2 = 2, /* break opportunity before and after */
+ LBP_BA = 3, /* break opportunity after */
+ LBP_BB = 4, /* break opportunity before */
+ LBP_HY = 5, /* hyphen */
+ LBP_CB = 29, /* contingent break opportunity */
+ LBP_CL = 6, /* closing punctuation */
+ LBP_CP = 7, /* closing parenthesis */
+ LBP_EX = 8, /* exclamation/interrogation */
+ LBP_IN = 9, /* inseparable */
+ LBP_NS = 10, /* non starter */
+ LBP_OP = 11, /* opening punctuation */
+ LBP_QU = 12, /* ambiguous quotation */
+ LBP_IS = 13, /* infix separator (numeric) */
+ LBP_NU = 14, /* numeric */
+ LBP_PO = 15, /* postfix (numeric) */
+ LBP_PR = 16, /* prefix (numeric) */
+ LBP_SY = 17, /* symbols allowing breaks */
+ LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+ LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+ LBP_H2 = 19, /* Hangul LV syllable */
+ LBP_H3 = 20, /* Hangul LVT syllable */
+ LBP_ID = 21, /* ideographic */
+ LBP_JL = 22, /* Hangul L Jamo */
+ LBP_JV = 23, /* Hangul V Jamo */
+ LBP_JT = 24, /* Hangul T Jamo */
+ LBP_SA = 31, /* complex context (South East Asian) */
+ LBP_XX = 32 /* unknown */
+};
+
+/* Returns the line breaking classification for ch, as a bit mask. */
+static int64_t
+get_lbp (unsigned int ch)
+{
+ int64_t attr = 0;
+
+ if (unicode_attributes[ch].name != NULL)
+ {
+ /* mandatory break */
+ if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
+ || ch == 0x000C /* form feed */
+ || ch == 0x000B /* line tabulation */
+ || ch == 0x2028 /* LINE SEPARATOR */
+ || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
+ attr |= (int64_t) 1 << LBP_BK;
+
+ if (ch == 0x2060 /* WORD JOINER */
+ || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
+ attr |= (int64_t) 1 << LBP_WJ;
+
+ /* zero width space */
+ if (ch == 0x200B /* ZERO WIDTH SPACE */)
+ attr |= (int64_t) 1 << LBP_ZW;
+
+ /* non-breaking (glue) */
+ if (ch == 0x00A0 /* NO-BREAK SPACE */
+ || ch == 0x202F /* NARROW NO-BREAK SPACE */
+ || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
+ || ch == 0x034F /* COMBINING GRAPHEME JOINER */
+ || ch == 0x2007 /* FIGURE SPACE */
+ || ch == 0x2011 /* NON-BREAKING HYPHEN */
+ || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
+ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
+ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
+ || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
+ || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
+ attr |= (int64_t) 1 << LBP_GL;
+
+ /* space */
+ if (ch == 0x0020 /* SPACE */)
+ attr |= (int64_t) 1 << LBP_SP;
+
+ /* break opportunity before and after */
+ if (ch == 0x2014 /* EM DASH */)
+ attr |= (int64_t) 1 << LBP_B2;
+
+ /* break opportunity after */
+ if (/* Breaking Spaces */
+ ch == 0x1680 /* OGHAM SPACE MARK */
+ || ch == 0x2000 /* EN QUAD */
+ || ch == 0x2001 /* EM QUAD */
+ || ch == 0x2002 /* EN SPACE */
+ || ch == 0x2003 /* EM SPACE */
+ || ch == 0x2004 /* THREE-PER-EM SPACE */
+ || ch == 0x2005 /* FOUR-PER-EM SPACE */
+ || ch == 0x2006 /* SIX-PER-EM SPACE */
+ || ch == 0x2008 /* PUNCTUATION SPACE */
+ || ch == 0x2009 /* THIN SPACE */
+ || ch == 0x200A /* HAIR SPACE */
+ || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
+ /* Tabs */
+ || ch == 0x0009 /* tab */
+ /* Conditional Hyphens */
+ || ch == 0x00AD /* SOFT HYPHEN */
+ /* Breaking Hyphens */
+ || ch == 0x058A /* ARMENIAN HYPHEN */
+ || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
+ || ch == 0x2010 /* HYPHEN */
+ || ch == 0x2012 /* FIGURE DASH */
+ || ch == 0x2013 /* EN DASH */
+ /* Visible Word Dividers */
+ || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
+ || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
+ || ch == 0x1361 /* ETHIOPIC WORDSPACE */
+ || ch == 0x17D8 /* KHMER SIGN BEYYAL */
+ || ch == 0x17DA /* KHMER SIGN KOOMUUT */
+ || ch == 0x2027 /* HYPHENATION POINT */
+ || ch == 0x007C /* VERTICAL LINE */
+ /* Historic Word Separators */
+ || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
+ || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
+ || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
+ || ch == 0x2056 /* THREE DOT PUNCTUATION */
+ || ch == 0x2058 /* FOUR DOT PUNCTUATION */
+ || ch == 0x2059 /* FIVE DOT PUNCTUATION */
+ || ch == 0x205A /* TWO DOT PUNCTUATION */
+ || ch == 0x205B /* FOUR DOT MARK */
+ || ch == 0x205D /* TRICOLON */
+ || ch == 0x205E /* VERTICAL FOUR DOTS */
+ || ch == 0x2E19 /* PALM BRANCH */
+ || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
+ || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
+ || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
+ || ch == 0x2E30 /* RING POINT */
+ || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
+ || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
+ || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
+ || ch == 0x10102 /* AEGEAN CHECK MARK */
+ || ch == 0x1039F /* UGARITIC WORD DIVIDER */
+ || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
+ || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
+ || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
+ /* Dandas */
+ || ch == 0x0964 /* DEVANAGARI DANDA */
+ || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
+ || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
+ || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
+ || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
+ || ch == 0x104B /* MYANMAR SIGN SECTION */
+ || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
+ || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
+ || ch == 0x17D4 /* KHMER SIGN KHAN */
+ || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
+ || ch == 0x1B5E /* BALINESE CARIK SIKI */
+ || ch == 0x1B5F /* BALINESE CARIK PAREREN */
+ || ch == 0xA8CE /* SAURASHTRA DANDA */
+ || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
+ || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
+ || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
+ || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
+ || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
+ || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
+ /* Tibetan */
+ || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
+ || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
+ || ch == 0x0F85 /* TIBETAN MARK PALUTA */
+ || ch == 0x0FBE /* TIBETAN KU RU KHA */
+ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
+ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
+ /* Other Terminating Punctuation */
+ || ch == 0x1804 /* MONGOLIAN COLON */
+ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
+ || ch == 0x1B5A /* BALINESE PANTI */
+ || ch == 0x1B5B /* BALINESE PAMADA */
+ || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
+ || ch == 0x1B60 /* BALINESE PAMENENG */
+ || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
+ || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
+ || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
+ || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
+ || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
+ || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
+ || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
+ || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
+ || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
+ || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
+ || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
+ || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
+ || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
+ || ch == 0xA60D /* VAI COMMA */
+ || ch == 0xA60F /* VAI QUESTION MARK */
+ || ch == 0xA92E /* KAYAH LI SIGN CWI */
+ || ch == 0xA92F /* KAYAH LI SIGN SHYA */
+ || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
+ || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
+ || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
+ || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
+ || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
+ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
+ || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
+ || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
+ || ch == 0xA6F3 /* BAMUM FULL STOP */
+ || ch == 0xA6F4 /* BAMUM COLON */
+ || ch == 0xA6F5 /* BAMUM COMMA */
+ || ch == 0xA6F6 /* BAMUM SEMICOLON */
+ || ch == 0xA6F7 /* BAMUM QUESTION MARK */
+ || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
+ || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
+ || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+ || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
+ || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
+ || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
+ || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
+ || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
+ || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
+ || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
+ || ch == 0x11047 /* BRAHMI DANDA */
+ || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
+ || ch == 0x110BE /* KAITHI SECTION MARK */
+ || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
+ || ch == 0x110C0 /* KAITHI DANDA */
+ || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
+ || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
+ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
+ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
+ attr |= (int64_t) 1 << LBP_BA;
+
+ /* break opportunity before */
+ if (ch == 0x00B4 /* ACUTE ACCENT */
+ || ch == 0x1FFD /* GREEK OXIA */
+ || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
+ || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
+ || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
+ || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
+ || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
+ || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
+ || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
+ || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
+ || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
+ || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
+ || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
+ || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
+ || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
+ || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
+ || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
+ || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
+ || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
+ attr |= (int64_t) 1 << LBP_BB;
+
+ /* hyphen */
+ if (ch == 0x002D /* HYPHEN-MINUS */)
+ attr |= (int64_t) 1 << LBP_HY;
+
+ /* contingent break opportunity */
+ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
+ attr |= (int64_t) 1 << LBP_CB;
+
+ /* closing parenthesis */
+ if (ch == 0x0029 /* RIGHT PARENTHESIS */
+ || ch == 0x005D /* RIGHT SQUARE BRACKET */)
+ attr |= (int64_t) 1 << LBP_CP;
+
+ /* closing punctuation */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 'e'
+ && !(attr & ((int64_t) 1 << LBP_CP)))
+ || ch == 0x3001 /* IDEOGRAPHIC COMMA */
+ || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
+ || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
+ || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
+ || ch == 0xFE50 /* SMALL COMMA */
+ || ch == 0xFE52 /* SMALL FULL STOP */
+ || ch == 0xFF0C /* FULLWIDTH COMMA */
+ || ch == 0xFF0E /* FULLWIDTH FULL STOP */
+ || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
+ || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
+ || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
+ || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
+ || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
+ || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
+ || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
+ || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
+ || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
+ attr |= (int64_t) 1 << LBP_CL;
+
+ /* exclamation/interrogation */
+ if (ch == 0x0021 /* EXCLAMATION MARK */
+ || ch == 0x003F /* QUESTION MARK */
+ || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
+ || ch == 0x061B /* ARABIC SEMICOLON */
+ || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
+ || ch == 0x061F /* ARABIC QUESTION MARK */
+ || ch == 0x06D4 /* ARABIC FULL STOP */
+ || ch == 0x07F9 /* NKO EXCLAMATION MARK */
+ || ch == 0x0F0D /* TIBETAN MARK SHAD */
+ || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
+ || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
+ || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
+ || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
+ || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
+ || ch == 0x1802 /* MONGOLIAN COMMA */
+ || ch == 0x1803 /* MONGOLIAN FULL STOP */
+ || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
+ || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
+ || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
+ || ch == 0x1945 /* LIMBU QUESTION MARK */
+ || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
+ || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
+ || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
+ || ch == 0x2CFE /* COPTIC FULL STOP */
+ || ch == 0x2E2E /* REVERSED QUESTION MARK */
+ || ch == 0xA60E /* VAI FULL STOP */
+ || ch == 0xA876 /* PHAGS-PA MARK SHAD */
+ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
+ || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
+ || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
+ || ch == 0xFE56 /* SMALL QUESTION MARK */
+ || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
+ || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
+ || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
+ attr |= (int64_t) 1 << LBP_EX;
+
+ /* inseparable */
+ if (ch == 0x2024 /* ONE DOT LEADER */
+ || ch == 0x2025 /* TWO DOT LEADER */
+ || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
+ || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
+ attr |= (int64_t) 1 << LBP_IN;
+
+ /* non starter */
+ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
+ || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
+ || ch == 0x203D /* INTERROBANG */
+ || ch == 0x2047 /* DOUBLE QUESTION MARK */
+ || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
+ || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
+ || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
+ || ch == 0x301C /* WAVE DASH */
+ || ch == 0x303C /* MASU MARK */
+ || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
+ || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
+ || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
+ || ch == 0x309D /* HIRAGANA ITERATION MARK */
+ || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
+ || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
+ || ch == 0x30FB /* KATAKANA MIDDLE DOT */
+ || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+ || ch == 0x30FD /* KATAKANA ITERATION MARK */
+ || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
+ || ch == 0xA015 /* YI SYLLABLE WU */
+ || ch == 0xFE54 /* SMALL SEMICOLON */
+ || ch == 0xFE55 /* SMALL COLON */
+ || ch == 0xFF1A /* FULLWIDTH COLON */
+ || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
+ || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
+ || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+ || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
+ || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
+ || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
+ || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
+ attr |= (int64_t) 1 << LBP_NS;
+
+ /* opening punctuation */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && unicode_attributes[ch].category[1] == 's')
+ || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
+ || ch == 0x00BF /* INVERTED QUESTION MARK */
+ || ch == 0x2E18 /* INVERTED INTERROBANG */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
+ || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
+ || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
+ || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
+ || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
+ || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
+ attr |= (int64_t) 1 << LBP_OP;
+
+ /* ambiguous quotation */
+ if ((unicode_attributes[ch].category[0] == 'P'
+ && (unicode_attributes[ch].category[1] == 'f'
+ || unicode_attributes[ch].category[1] == 'i'))
+ || ch == 0x0022 /* QUOTATION MARK */
+ || ch == 0x0027 /* APOSTROPHE */
+ || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
+ || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
+ || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
+ || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
+ || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
+ || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
+ || ch == 0x2E0B /* RAISED SQUARE */)
+ attr |= (int64_t) 1 << LBP_QU;
+
+ /* infix separator (numeric) */
+ if (ch == 0x002C /* COMMA */
+ || ch == 0x002E /* FULL STOP */
+ || ch == 0x003A /* COLON */
+ || ch == 0x003B /* SEMICOLON */
+ || ch == 0x037E /* GREEK QUESTION MARK */
+ || ch == 0x0589 /* ARMENIAN FULL STOP */
+ || ch == 0x060C /* ARABIC COMMA */
+ || ch == 0x060D /* ARABIC DATE SEPARATOR */
+ || ch == 0x07F8 /* NKO COMMA */
+ || ch == 0x2044 /* FRACTION SLASH */
+ || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
+ || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
+ || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
+ attr |= (int64_t) 1 << LBP_IS;
+
+ /* numeric */
+ if ((unicode_attributes[ch].category[0] == 'N'
+ && unicode_attributes[ch].category[1] == 'd'
+ && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
+ || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
+ || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
+ attr |= (int64_t) 1 << LBP_NU;
+
+ /* postfix (numeric) */
+ if (ch == 0x0025 /* PERCENT SIGN */
+ || ch == 0x00A2 /* CENT SIGN */
+ || ch == 0x00B0 /* DEGREE SIGN */
+ || ch == 0x060B /* AFGHANI SIGN */
+ || ch == 0x066A /* ARABIC PERCENT SIGN */
+ || ch == 0x2030 /* PER MILLE SIGN */
+ || ch == 0x2031 /* PER TEN THOUSAND SIGN */
+ || ch == 0x2032 /* PRIME */
+ || ch == 0x2033 /* DOUBLE PRIME */
+ || ch == 0x2034 /* TRIPLE PRIME */
+ || ch == 0x2035 /* REVERSED PRIME */
+ || ch == 0x2036 /* REVERSED DOUBLE PRIME */
+ || ch == 0x2037 /* REVERSED TRIPLE PRIME */
+ || ch == 0x20A7 /* PESETA SIGN */
+ || ch == 0x2103 /* DEGREE CELSIUS */
+ || ch == 0x2109 /* DEGREE FAHRENHEIT */
+ || ch == 0xFDFC /* RIAL SIGN */
+ || ch == 0xFE6A /* SMALL PERCENT SIGN */
+ || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
+ || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
+ || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
+ || ch == 0x09F2 /* BENGALI RUPEE MARK */
+ || ch == 0x09F3 /* BENGALI RUPEE SIGN */
+ || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
+ || ch == 0x0D79 /* MALAYALAM DATE MARK */
+ || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
+ || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
+ attr |= (int64_t) 1 << LBP_PO;
+
+ /* prefix (numeric) */
+ if ((unicode_attributes[ch].category[0] == 'S'
+ && unicode_attributes[ch].category[1] == 'c')
+ || ch == 0x002B /* PLUS SIGN */
+ || ch == 0x005C /* REVERSE SOLIDUS */
+ || ch == 0x00B1 /* PLUS-MINUS SIGN */
+ || ch == 0x2116 /* NUMERO SIGN */
+ || ch == 0x2212 /* MINUS SIGN */
+ || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
+ if (!(attr & ((int64_t) 1 << LBP_PO)))
+ attr |= (int64_t) 1 << LBP_PR;
+
+ /* symbols allowing breaks */
+ if (ch == 0x002F /* SOLIDUS */)
+ attr |= (int64_t) 1 << LBP_SY;
+
+ if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
+ attr |= (int64_t) 1 << LBP_H2;
+
+ if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
+ attr |= (int64_t) 1 << LBP_H3;
+
+ if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
+ attr |= (int64_t) 1 << LBP_JL;
+
+ if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
+ attr |= (int64_t) 1 << LBP_JV;
+
+ if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
+ attr |= (int64_t) 1 << LBP_JT;
+
+ /* complex context (South East Asian) */
+ if (((unicode_attributes[ch].category[0] == 'C'
+ && unicode_attributes[ch].category[1] == 'f')
+ || (unicode_attributes[ch].category[0] == 'L'
+ && (unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'M'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'n')
+ && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
+ || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
+ || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
+ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
+ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
+ || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
+ || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
+ || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
+ && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
+ || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
+ || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
+ || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
+ || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
+ || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
+ attr |= (int64_t) 1 << LBP_SA;
+
+ /* attached characters and combining marks */
+ if ((unicode_attributes[ch].category[0] == 'M'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'e'
+ || unicode_attributes[ch].category[1] == 'n'))
+ || (unicode_attributes[ch].category[0] == 'C'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'f')
+ && ch != 0x110BD /* KAITHI NUMBER SIGN */))
+ if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+ attr |= (int64_t) 1 << LBP_CM;
+
+ /* ideographic */
+ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
+ || ch == 0x3000 /* IDEOGRAPHIC SPACE */
+ || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
+ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
+ || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
+ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
+ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
+ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
+ || ch == 0xFE62 /* SMALL PLUS SIGN */
+ || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
+ || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
+ || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
+ || ch == 0xFE66 /* SMALL EQUALS SIGN */
+ || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
+ || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
+ || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
+ || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
+ || (ch >= 0x3000 && ch <= 0x33FF
+ && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
+ || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
+ || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
+ || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
+ || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
+ || ch == 0xFE45 /* SESAME DOT */
+ || ch == 0xFE46 /* WHITE SESAME DOT */
+ || ch == 0xFE49 /* DASHED OVERLINE */
+ || ch == 0xFE4A /* CENTRELINE OVERLINE */
+ || ch == 0xFE4B /* WAVY OVERLINE */
+ || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
+ || ch == 0xFE4D /* DASHED LOW LINE */
+ || ch == 0xFE4E /* CENTRELINE LOW LINE */
+ || ch == 0xFE4F /* WAVY LOW LINE */
+ || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
+ || ch == 0xFE58 /* SMALL EM DASH */
+ || ch == 0xFE5F /* SMALL NUMBER SIGN */
+ || ch == 0xFE60 /* SMALL AMPERSAND */
+ || ch == 0xFE61 /* SMALL ASTERISK */
+ || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
+ || ch == 0xFE6B /* SMALL COMMERCIAL AT */
+ || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
+ || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
+ || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
+ || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
+ || ch == 0xFF0A /* FULLWIDTH ASTERISK */
+ || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
+ || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
+ || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
+ || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
+ || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
+ || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
+ || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
+ || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
+ || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
+ || ch == 0xFF3F /* FULLWIDTH LOW LINE */
+ || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
+ || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
+ || ch == 0xFF5E /* FULLWIDTH TILDE */
+ || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
+ || ch == 0xFFE3 /* FULLWIDTH MACRON */
+ || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
+ || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
+ || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
+ || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
+ if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
+ {
+ /* ambiguous (ideograph) ? */
+ if ((unicode_width[ch] != NULL
+ && unicode_width[ch][0] == 'A'
+ && ch >= 0x2000)
+ || ch == 0x24EA /* CIRCLED DIGIT ZERO */
+ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
+ attr |= (int64_t) 1 << LBP_AI;
+ else
+ attr |= (int64_t) 1 << LBP_ID;
+ }
+
+ /* ordinary alphabetic and symbol characters */
+ if ((unicode_attributes[ch].category[0] == 'L'
+ && (unicode_attributes[ch].category[1] == 'u'
+ || unicode_attributes[ch].category[1] == 'l'
+ || unicode_attributes[ch].category[1] == 't'
+ || unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'S'
+ && (unicode_attributes[ch].category[1] == 'm'
+ || unicode_attributes[ch].category[1] == 'k'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'N'
+ && (unicode_attributes[ch].category[1] == 'l'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || (unicode_attributes[ch].category[0] == 'P'
+ && (unicode_attributes[ch].category[1] == 'c'
+ || unicode_attributes[ch].category[1] == 'd'
+ || unicode_attributes[ch].category[1] == 'o'))
+ || ch == 0x0600 /* ARABIC NUMBER SIGN */
+ || ch == 0x0601 /* ARABIC SIGN SANAH */
+ || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
+ || ch == 0x0603 /* ARABIC SIGN SAFHA */
+ || ch == 0x06DD /* ARABIC END OF AYAH */
+ || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
+ || ch == 0x2061 /* FUNCTION APPLICATION */
+ || ch == 0x2062 /* INVISIBLE TIMES */
+ || ch == 0x2063 /* INVISIBLE SEPARATOR */
+ || ch == 0x2064 /* INVISIBLE PLUS */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x110BD /* KAITHI NUMBER SIGN */)
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+ {
+ /* ambiguous (alphabetic) ? */
+ if ((unicode_width[ch] != NULL
+ && unicode_width[ch][0] == 'A'
+ && ch >= 0x2000
+ /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
+ && ch != 0x2022 /* BULLET */
+ && ch != 0x203E /* OVERLINE */
+ && ch != 0x2126 /* OHM SIGN */
+ && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
+ && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
+ && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
+ && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
+ && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
+ && ch != 0x21E7 /* UPWARDS WHITE ARROW */
+ && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
+ && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
+ || ch == 0x00A7 /* SECTION SIGN */
+ || ch == 0x00A8 /* DIAERESIS */
+ || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
+ || ch == 0x00B2 /* SUPERSCRIPT TWO */
+ || ch == 0x00B3 /* SUPERSCRIPT THREE */
+ || ch == 0x00B6 /* PILCROW SIGN */
+ || ch == 0x00B7 /* MIDDLE DOT */
+ || ch == 0x00B8 /* CEDILLA */
+ || ch == 0x00B9 /* SUPERSCRIPT ONE */
+ || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
+ || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
+ || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
+ || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
+ || ch == 0x00D7 /* MULTIPLICATION SIGN */
+ || ch == 0x00F7 /* DIVISION SIGN */
+ || ch == 0x02C7 /* CARON */
+ || ch == 0x02C9 /* MODIFIER LETTER MACRON */
+ || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
+ || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
+ || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
+ || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
+ || ch == 0x02D8 /* BREVE */
+ || ch == 0x02D9 /* DOT ABOVE */
+ || ch == 0x02DA /* RING ABOVE */
+ || ch == 0x02DB /* OGONEK */
+ || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
+ || ch == 0x24EA /* CIRCLED DIGIT ZERO */
+ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
+ /* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
+ || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
+ || ch == 0x2616 /* WHITE SHOGI PIECE */
+ || ch == 0x2617 /* BLACK SHOGI PIECE */)
+ attr |= (int64_t) 1 << LBP_AI;
+ else
+ attr |= (int64_t) 1 << LBP_AL;
+ attr &= ~((int64_t) 1 << LBP_CM);
+ }
+ }
+ else
+ {
+ /* Unassigned character. */
+ if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+ || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
+ || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+ || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
+ || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
+ Supplementary Ideographic Plane (Plane 2) outside of blocks */
+ || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
+ attr |= (int64_t) 1 << LBP_ID;
+ }
+
+ if (attr == 0)
+ /* unknown */
+ attr |= (int64_t) 1 << LBP_XX;
+
+ return attr;
}
-/* Construction of sparse 3-level tables. */
-#define TABLE identsyntax_table
-#define ELEMENT uint8_t
-#define DEFAULT UC_IDENTIFIER_INVALID
-#define xmalloc malloc
-#define xrealloc realloc
-#include "3level.h"
+/* Output the line breaking properties in a human readable format.
+   One line is written per code point whose get_lbp() value is not just the
+   LBP_XX bit: the code point in hexadecimal, then the name of every LBP_*
+   bit that is set in the value. */
+static void
+debug_output_lbp (FILE *stream)
+{
+ unsigned int i;
+
+ for (i = 0; i < 0x110000; i++)
+ {
+ int64_t attr = get_lbp (i);
+ if (attr != (int64_t) 1 << LBP_XX) /* skip code points that are only LBP_XX */
+ {
+ fprintf (stream, "0x%04X", i);
+#define PRINT_BIT(attr,bit) \
+ if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
+ PRINT_BIT(attr,LBP_BK);
+ PRINT_BIT(attr,LBP_CM);
+ PRINT_BIT(attr,LBP_WJ);
+ PRINT_BIT(attr,LBP_ZW);
+ PRINT_BIT(attr,LBP_GL);
+ PRINT_BIT(attr,LBP_SP);
+ PRINT_BIT(attr,LBP_B2);
+ PRINT_BIT(attr,LBP_BA);
+ PRINT_BIT(attr,LBP_BB);
+ PRINT_BIT(attr,LBP_HY);
+ PRINT_BIT(attr,LBP_CB);
+ PRINT_BIT(attr,LBP_CL);
+ PRINT_BIT(attr,LBP_CP);
+ PRINT_BIT(attr,LBP_EX);
+ PRINT_BIT(attr,LBP_IN);
+ PRINT_BIT(attr,LBP_NS);
+ PRINT_BIT(attr,LBP_OP);
+ PRINT_BIT(attr,LBP_QU);
+ PRINT_BIT(attr,LBP_IS);
+ PRINT_BIT(attr,LBP_NU);
+ PRINT_BIT(attr,LBP_PO);
+ PRINT_BIT(attr,LBP_PR);
+ PRINT_BIT(attr,LBP_SY);
+ PRINT_BIT(attr,LBP_AI);
+ PRINT_BIT(attr,LBP_AL);
+ PRINT_BIT(attr,LBP_H2);
+ PRINT_BIT(attr,LBP_H3);
+ PRINT_BIT(attr,LBP_ID);
+ PRINT_BIT(attr,LBP_JL);
+ PRINT_BIT(attr,LBP_JV);
+ PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_XX);
+#undef PRINT_BIT
+ fprintf (stream, "\n");
+ }
+ }
+}
-/* Output an identifier syntax categorization in a three-level bitmap. */
/* Write the line breaking properties to FILENAME in the human readable
   format produced by debug_output_lbp().  Exits with status 1 if the file
   cannot be opened or if writing or closing it fails.  */
static void
-output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
+debug_output_lbrk_tables (const char *filename)
{
FILE *stream;
- unsigned int ch, i;
- struct identsyntax_table t;
- unsigned int level1_offset, level2_offset, level3_offset;
stream = fopen (filename, "w");
if (stream == NULL)
exit (1);
}
- fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
- version);
-
- t.p = 7; /* or 8 */
- t.q = 5; /* or 4 */
- identsyntax_table_init (&t);
+ debug_output_lbp (stream);
- for (ch = 0; ch < 0x110000; ch++)
+ if (ferror (stream) || fclose (stream)) /* fclose is only reached when no write error is pending */
{
- int syntaxcode = predicate (ch);
- if (syntaxcode != UC_IDENTIFIER_INVALID)
- identsyntax_table_add (&t, ch, syntaxcode);
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
}
+}
- identsyntax_table_finalize (&t);
+/* The line breaking property from the LineBreak.txt file.
+   Indexed by code point.  Each element holds a single LBP_* value (not a
+   bit mask); fill_org_lbp() presets every element to LBP_XX before reading
+   the file. */
+int unicode_org_lbp[0x110000];
- /* Offsets in t.result, in memory of this process. */
- level1_offset =
- 5 * sizeof (uint32_t);
- level2_offset =
- 5 * sizeof (uint32_t)
- + t.level1_size * sizeof (uint32_t);
- level3_offset =
- 5 * sizeof (uint32_t)
- + t.level1_size * sizeof (uint32_t)
- + (t.level2_size << t.q) * sizeof (uint32_t);
+/* Stores in unicode_org_lbp[] the line breaking property from the
+ LineBreak.txt file.
+ Every element is first set to LBP_XX.  Lines starting with a hash sign are
+ skipped.  Each other line is read as three fields: the code point or range
+ (terminated by a semicolon), the property name (terminated by a space) and
+ the remainder of the line.  The names LF, CR and NL are stored as LBP_BK
+ and SG as LBP_XX.  Exits with status 1 if the file cannot be opened, if
+ getfield() does not deliver all three fields, if the property name is not
+ recognized, or if reading or closing the file fails. */
+static void
+fill_org_lbp (const char *linebreak_filename)
+{
+ unsigned int i, j;
+ FILE *stream;
+ char field0[FIELDLEN];
+ char field1[FIELDLEN];
+ char field2[FIELDLEN];
+ int lineno = 0;
- for (i = 0; i < 5; i++)
- fprintf (stream, "#define identsyntax_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
- fprintf (stream, "static const\n");
- fprintf (stream, "struct\n");
- fprintf (stream, " {\n");
- fprintf (stream, " int level1[%zu];\n", t.level1_size);
- fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
- (1 << t.p) * 2 / 16);
- fprintf (stream, " }\n");
- fprintf (stream, "%s =\n", name);
- fprintf (stream, "{\n");
- fprintf (stream, " {");
- if (t.level1_size > 8)
- fprintf (stream, "\n ");
- for (i = 0; i < t.level1_size; i++)
+ for (i = 0; i < 0x110000; i++)
+ unicode_org_lbp[i] = LBP_XX;
+
+ stream = fopen (linebreak_filename, "r");
+ if (stream == NULL)
{
- uint32_t offset;
- if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- offset = ((uint32_t *) (t.result + level1_offset))[i];
- if (offset == 0)
- fprintf (stream, " %5d", -1);
- else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
- if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
+ exit (1);
}
- if (t.level1_size > 8)
- fprintf (stream, "\n ");
- fprintf (stream, " },\n");
- fprintf (stream, " {");
- if (t.level2_size << t.q > 8)
- fprintf (stream, "\n ");
- for (i = 0; i < t.level2_size << t.q; i++)
+
+ for (;;)
{
- uint32_t offset;
- if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- offset = ((uint32_t *) (t.result + level2_offset))[i];
- if (offset == 0)
- fprintf (stream, " %5d", -1);
+ int n;
+ int c;
+ int value;
+
+ lineno++;
+ c = getc (stream); /* peek at the first character of the line */
+ if (c == EOF)
+ break;
+ if (c == '#')
+ {
+ do c = getc (stream); while (c != EOF && c != '\n'); /* discard the comment line */
+ continue;
+ }
+ ungetc (c, stream);
+ n = getfield (stream, field0, ';');
+ n += getfield (stream, field1, ' ');
+ n += getfield (stream, field2, '\n');
+ if (n == 0)
+ break;
+ if (n != 3)
+ {
+ fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
+ lineno);
+ exit (1);
+ }
+#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; /* #bit + 4 skips the LBP_ prefix of the stringified name */
+ if (false) {} /* lets every TRY below expand to a uniform else-if */
+ TRY(LBP_BK)
+ TRY(LBP_CM)
+ TRY(LBP_WJ)
+ TRY(LBP_ZW)
+ TRY(LBP_GL)
+ TRY(LBP_SP)
+ TRY(LBP_B2)
+ TRY(LBP_BA)
+ TRY(LBP_BB)
+ TRY(LBP_HY)
+ TRY(LBP_CB)
+ TRY(LBP_CL)
+ TRY(LBP_CP)
+ TRY(LBP_EX)
+ TRY(LBP_IN)
+ TRY(LBP_NS)
+ TRY(LBP_OP)
+ TRY(LBP_QU)
+ TRY(LBP_IS)
+ TRY(LBP_NU)
+ TRY(LBP_PO)
+ TRY(LBP_PR)
+ TRY(LBP_SY)
+ TRY(LBP_AI)
+ TRY(LBP_AL)
+ TRY(LBP_H2)
+ TRY(LBP_H3)
+ TRY(LBP_ID)
+ TRY(LBP_JL)
+ TRY(LBP_JV)
+ TRY(LBP_JT)
+ TRY(LBP_SA)
+ TRY(LBP_XX)
+#undef TRY
+ else if (strcmp (field1, "LF") == 0) value = LBP_BK;
+ else if (strcmp (field1, "CR") == 0) value = LBP_BK;
+ else if (strcmp (field1, "NL") == 0) value = LBP_BK;
+ else if (strcmp (field1, "SG") == 0) value = LBP_XX;
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t));
- if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
- }
- if (t.level2_size << t.q > 8)
- fprintf (stream, "\n ");
- fprintf (stream, " },\n");
- /* Pack the level3 array. Each entry needs 2 bits only. */
- fprintf (stream, " {");
- if ((t.level3_size << t.p) * 2 / 16 > 8)
- fprintf (stream, "\n ");
- for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
- {
- if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- fprintf (stream, " 0x%04x",
- (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
- | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
- if (i+1 < (t.level3_size << t.p) * 2 / 16)
- fprintf (stream, ",");
+ {
+ fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
+ field1, linebreak_filename, lineno);
+ exit (1);
+ }
+ i = strtoul (field0, NULL, 16); /* for a range this is the first code point */
+ if (strstr (field0, "..") != NULL)
+ {
+ /* Deal with a range. */
+ j = strtoul (strstr (field0, "..") + 2, NULL, 16);
+ for (; i <= j; i++) /* NOTE(review): no check here that i and j stay below 0x110000 */
+ unicode_org_lbp[i] = value;
+ }
+ else
+ {
+ /* Single character line. */
+ unicode_org_lbp[i] = value;
+ }
}
- if ((t.level3_size << t.p) * 2 / 16 > 8)
- fprintf (stream, "\n ");
- fprintf (stream, " }\n");
- fprintf (stream, "};\n");
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error writing to '%s'\n", filename);
+ fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
exit (1);
}
}
+/* Output the line breaking properties in a human readable format.
+   This variant prints the values stored in unicode_org_lbp[]: one line per
+   code point whose value is not LBP_XX, consisting of the code point in
+   hexadecimal followed by the name of its LBP_* value. */
static void
-output_ident_properties (const char *version)
+debug_output_org_lbp (FILE *stream)
{
-#define PROPERTY(P) \
- debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
- output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
- output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
- PROPERTY(c_whitespace)
- PROPERTY(java_whitespace)
-#undef PROPERTY
+ unsigned int i;
- output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
- output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
+ for (i = 0; i < 0x110000; i++)
+ {
+ int attr = unicode_org_lbp[i]; /* a single LBP_* value, compared with == below */
+ if (attr != LBP_XX)
+ {
+ fprintf (stream, "0x%04X", i);
+#define PRINT_BIT(attr,bit) \
+ if (attr == bit) fprintf (stream, " " #bit);
+ PRINT_BIT(attr,LBP_BK);
+ PRINT_BIT(attr,LBP_CM);
+ PRINT_BIT(attr,LBP_WJ);
+ PRINT_BIT(attr,LBP_ZW);
+ PRINT_BIT(attr,LBP_GL);
+ PRINT_BIT(attr,LBP_SP);
+ PRINT_BIT(attr,LBP_B2);
+ PRINT_BIT(attr,LBP_BA);
+ PRINT_BIT(attr,LBP_BB);
+ PRINT_BIT(attr,LBP_HY);
+ PRINT_BIT(attr,LBP_CB);
+ PRINT_BIT(attr,LBP_CL);
+ PRINT_BIT(attr,LBP_CP);
+ PRINT_BIT(attr,LBP_EX);
+ PRINT_BIT(attr,LBP_IN);
+ PRINT_BIT(attr,LBP_NS);
+ PRINT_BIT(attr,LBP_OP);
+ PRINT_BIT(attr,LBP_QU);
+ PRINT_BIT(attr,LBP_IS);
+ PRINT_BIT(attr,LBP_NU);
+ PRINT_BIT(attr,LBP_PO);
+ PRINT_BIT(attr,LBP_PR);
+ PRINT_BIT(attr,LBP_SY);
+ PRINT_BIT(attr,LBP_AI);
+ PRINT_BIT(attr,LBP_AL);
+ PRINT_BIT(attr,LBP_H2);
+ PRINT_BIT(attr,LBP_H3);
+ PRINT_BIT(attr,LBP_ID);
+ PRINT_BIT(attr,LBP_JL);
+ PRINT_BIT(attr,LBP_JV);
+ PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_XX);
+#undef PRINT_BIT
+ fprintf (stream, "\n");
+ }
+ }
}
-/* ========================================================================= */
/* Write the contents of unicode_org_lbp[] to FILENAME in the human readable
   format produced by debug_output_org_lbp().  Prints a message to stderr and
   exits with status 1 if the file cannot be opened, or if writing or closing
   it fails.  */
+static void
+debug_output_org_lbrk_tables (const char *filename)
+{
+ FILE *stream;
-/* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
- glibc/localedata/locales/i18n file, generated by
- glibc/localedata/gen-unicode-ctype.c. */
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
-/* Character mappings. */
+ debug_output_org_lbp (stream);
-static unsigned int
-to_upper (unsigned int ch)
-{
- if (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].upper != NONE)
- return unicode_attributes[ch].upper;
- else
- return ch;
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
}
-static unsigned int
-to_lower (unsigned int ch)
+/* Construction of sparse 3-level tables.
+   The macros below parameterize the inclusion of 3level.h.  The struct
+   lbp_table type and the lbp_table_init, lbp_table_add and
+   lbp_table_finalize functions used by output_lbp presumably come from this
+   inclusion (TODO confirm against 3level.h). */
+#define TABLE lbp_table
+#define ELEMENT unsigned char
+#define DEFAULT LBP_XX
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
/* Emit the line breaking property table as C source.
   STREAM1 receives the lbrkprop_header_N macros, the lbrkprop_t typedef and
   the extern declaration of unilbrkprop.  STREAM2 receives the definition of
   unilbrkprop with its level1, level2 and level3 arrays; level3 entries are
   written as LBP_* names.  Calls abort() if get_lbp() returns a value that
   does not have exactly one bit set, or if a level3 entry is not a known
   LBP_* value.  */
+static void
+output_lbp (FILE *stream1, FILE *stream2)
{
- if (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].lower != NONE)
- return unicode_attributes[ch].lower;
- else
- return ch;
-}
+ unsigned int i;
+ struct lbp_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ t.p = 7;
+ t.q = 9;
+ lbp_table_init (&t);
+
+ for (i = 0; i < 0x110000; i++)
+ {
+ int64_t attr = get_lbp (i);
-static unsigned int
-to_title (unsigned int ch)
-{
- if (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].title != NONE)
- return unicode_attributes[ch].title;
- else
- return ch;
-}
+ /* Now attr should contain exactly one bit. */
+ if (attr == 0 || ((attr & (attr - 1)) != 0))
+ abort ();
-/* Character class properties. */
+ if (attr != (int64_t) 1 << LBP_XX) /* LBP_XX is the table DEFAULT, so it is not added */
+ {
+ unsigned int log2_attr;
+ for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); /* index of the single set bit */
-static bool
-is_upper (unsigned int ch)
-{
- return (to_lower (ch) != ch);
-}
+ lbp_table_add (&t, i, log2_attr);
+ }
+ }
-static bool
-is_lower (unsigned int ch)
-{
- return (to_upper (ch) != ch)
- /* <U00DF> is lowercase, but without simple to_upper mapping. */
- || (ch == 0x00DF);
-}
+ lbp_table_finalize (&t);
-static bool
-is_alpha (unsigned int ch)
-{
- return (unicode_attributes[ch].name != NULL
- && ((unicode_attributes[ch].category[0] == 'L'
- /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
- <U0E2F>, <U0E46> should belong to is_punct. */
- && (ch != 0x0E2F) && (ch != 0x0E46))
- /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
- <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
- || (ch == 0x0E31)
- || (ch >= 0x0E34 && ch <= 0x0E3A)
- || (ch >= 0x0E47 && ch <= 0x0E4E)
- /* Avoid warning for <U0345>. */
- || (ch == 0x0345)
- /* Avoid warnings for <U2160>..<U217F>. */
- || (unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'l')
- /* Avoid warnings for <U24B6>..<U24E9>. */
- || (unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'o'
- && strstr (unicode_attributes[ch].name, " LETTER ")
- != NULL)
- /* Consider all the non-ASCII digits as alphabetic.
- ISO C 99 forbids us to have them in category "digit",
- but we want iswalnum to return true on them. */
- || (unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'd'
- && !(ch >= 0x0030 && ch <= 0x0039))));
-}
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
-static bool
-is_digit (unsigned int ch)
-{
-#if 0
- return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'd');
- /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
- a zero. Must add <0> in front of them by hand. */
-#else
- /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
- takes it away:
- 7.25.2.1.5:
- The iswdigit function tests for any wide character that corresponds
- to a decimal-digit character (as defined in 5.2.1).
- 5.2.1:
- the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
- */
- return (ch >= 0x0030 && ch <= 0x0039);
-#endif
-}
+ for (i = 0; i < 5; i++)
+ fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream1, "\n");
+ fprintf (stream1, "typedef struct\n");
+ fprintf (stream1, " {\n");
+ fprintf (stream1, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream1, " }\n");
+ fprintf (stream1, "lbrkprop_t;\n");
+ fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
-static bool
-is_outdigit (unsigned int ch)
-{
- return (ch >= 0x0030 && ch <= 0x0039);
+ fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
+ fprintf (stream2, "{\n");
+ fprintf (stream2, " {");
+ if (t.level1_size > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1); /* a zero offset is written as -1 */
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream2, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " },\n");
+ fprintf (stream2, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1);
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level3_offset) / sizeof (unsigned char));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream2, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " },\n");
+ fprintf (stream2, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
+ const char *value_string;
+ switch (value) /* map the stored value back to its LBP_* name */
+ {
+#define CASE(x) case x: value_string = #x; break;
+ CASE(LBP_BK);
+ CASE(LBP_CM);
+ CASE(LBP_WJ);
+ CASE(LBP_ZW);
+ CASE(LBP_GL);
+ CASE(LBP_SP);
+ CASE(LBP_B2);
+ CASE(LBP_BA);
+ CASE(LBP_BB);
+ CASE(LBP_HY);
+ CASE(LBP_CB);
+ CASE(LBP_CL);
+ CASE(LBP_CP);
+ CASE(LBP_EX);
+ CASE(LBP_IN);
+ CASE(LBP_NS);
+ CASE(LBP_OP);
+ CASE(LBP_QU);
+ CASE(LBP_IS);
+ CASE(LBP_NU);
+ CASE(LBP_PO);
+ CASE(LBP_PR);
+ CASE(LBP_SY);
+ CASE(LBP_AI);
+ CASE(LBP_AL);
+ CASE(LBP_H2);
+ CASE(LBP_H3);
+ CASE(LBP_ID);
+ CASE(LBP_JL);
+ CASE(LBP_JV);
+ CASE(LBP_JT);
+ CASE(LBP_SA);
+ CASE(LBP_XX);
+#undef CASE
+ default:
+ abort ();
+ }
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " %s%s", value_string,
+ (i+1 < t.level3_size << t.p ? "," : ""));
+ }
+ if (t.level3_size << t.p > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " }\n");
+ fprintf (stream2, "};\n");
}
-static bool
-is_alnum (unsigned int ch)
/* Create FILENAME1 and FILENAME2, write the generated-file banner (which
   names VERSION) and a GPL notice to both, then call output_lbp() with the
   first file as its declarations stream and the second as its data stream.
   Prints a message to stderr and exits with status 1 if a file cannot be
   opened, or if writing or closing it fails.  */
+static void
+output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
- return is_alpha (ch) || is_digit (ch);
-}
+ const char *filenames[2];
+ FILE *streams[2];
+ size_t i;
-static bool
-is_blank (unsigned int ch)
-{
- return (ch == 0x0009 /* '\t' */
- /* Category Zs without mention of "<noBreak>" */
- || (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && unicode_attributes[ch].category[1] == 's'
- && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
-}
+ filenames[0] = filename1;
+ filenames[1] = filename2;
-static bool
-is_space (unsigned int ch)
-{
- /* Don't make U+00A0 a space. Non-breaking space means that all programs
- should treat it like a punctuation character, not like a space. */
- return (ch == 0x0020 /* ' ' */
- || ch == 0x000C /* '\f' */
- || ch == 0x000A /* '\n' */
- || ch == 0x000D /* '\r' */
- || ch == 0x0009 /* '\t' */
- || ch == 0x000B /* '\v' */
- /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
- || (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && (unicode_attributes[ch].category[1] == 'l'
- || unicode_attributes[ch].category[1] == 'p'
- || (unicode_attributes[ch].category[1] == 's'
- && !strstr (unicode_attributes[ch].decomposition,
- "<noBreak>")))));
-}
+ for (i = 0; i < 2; i++)
+ {
+ streams[i] = fopen (filenames[i], "w");
+ if (streams[i] == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
+ exit (1);
+ }
+ }
-static bool
-is_cntrl (unsigned int ch)
-{
- return (unicode_attributes[ch].name != NULL
- && (strcmp (unicode_attributes[ch].name, "<control>") == 0
- /* Categories Zl and Zp */
- || (unicode_attributes[ch].category[0] == 'Z'
- && (unicode_attributes[ch].category[1] == 'l'
- || unicode_attributes[ch].category[1] == 'p'))));
-}
+ for (i = 0; i < 2; i++)
+ {
+ FILE *stream = streams[i];
-static bool
-is_xdigit (unsigned int ch)
-{
-#if 0
- return is_digit (ch)
- || (ch >= 0x0041 && ch <= 0x0046)
- || (ch >= 0x0061 && ch <= 0x0066);
-#else
- /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
- takes it away:
- 7.25.2.1.12:
- The iswxdigit function tests for any wide character that corresponds
- to a hexadecimal-digit character (as defined in 6.4.4.1).
- 6.4.4.1:
- hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
- */
- return (ch >= 0x0030 && ch <= 0x0039)
- || (ch >= 0x0041 && ch <= 0x0046)
- || (ch >= 0x0061 && ch <= 0x0066);
-#endif
-}
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+ fprintf (stream, "\n");
-static bool
-is_graph (unsigned int ch)
-{
- return (unicode_attributes[ch].name != NULL
- && strcmp (unicode_attributes[ch].name, "<control>")
- && !is_space (ch));
-}
+ /* Put a GPL header on it. The gnulib module is under LGPL (although it
+ still carries the GPL header), and it's gnulib-tool which replaces the
+ GPL header with an LGPL header. */
+ fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
+ }
-static bool
-is_print (unsigned int ch)
-{
- return (unicode_attributes[ch].name != NULL
- && strcmp (unicode_attributes[ch].name, "<control>")
- /* Categories Zl and Zp */
- && !(unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'Z'
- && (unicode_attributes[ch].category[1] == 'l'
- || unicode_attributes[ch].category[1] == 'p')));
-}
+ output_lbp (streams[0], streams[1]);
-static bool
-is_punct (unsigned int ch)
-{
-#if 0
- return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'P');
-#else
- /* The traditional POSIX definition of punctuation is every graphic,
- non-alphanumeric character. */
- return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
-#endif
+ for (i = 0; i < 2; i++)
+ {
+ if (ferror (streams[i]) || fclose (streams[i]))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filenames[i]);
+ exit (1);
+ }
+ }
}
-/* Output all properties. */
-static void
-output_old_ctype (const char *version)
-{
-#define PROPERTY(P) \
- debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
- output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
- output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
- PROPERTY(alnum)
- PROPERTY(alpha)
- PROPERTY(cntrl)
- PROPERTY(digit)
- PROPERTY(graph)
- PROPERTY(lower)
- PROPERTY(print)
- PROPERTY(punct)
- PROPERTY(space)
- PROPERTY(upper)
- PROPERTY(xdigit)
- PROPERTY(blank)
-#undef PROPERTY
-}
+/* ========================================================================= */
-#if 0
+/* Word break property.
+ Updated for Unicode TR #29 revision 17. */
+
+/* Possible values of the Word_Break property.
+   get_wbp() uses these values as bit positions in the mask it returns, so
+   the numbering is independent of the order in which they are listed. */
+enum
+{
+ WBP_OTHER = 0,
+ WBP_CR = 11,
+ WBP_LF = 12,
+ WBP_NEWLINE = 10,
+ WBP_EXTEND = 8,
+ WBP_FORMAT = 9,
+ WBP_KATAKANA = 1,
+ WBP_ALETTER = 2,
+ WBP_MIDNUMLET = 3,
+ WBP_MIDLETTER = 4,
+ WBP_MIDNUM = 5,
+ WBP_NUMERIC = 6,
+ WBP_EXTENDNUMLET = 7
+};
-static bool
-is_combining (unsigned int ch)
+/* Returns the word breaking property for ch, as a bit mask.
+   Bit WBP_x is set when ch satisfies the test for x below.  The tests are
+   evaluated independently, so more than one bit can end up set.  If no test
+   matches, or if unicode_attributes[ch].name is NULL, the result is the
+   single bit WBP_OTHER. */
+static int
+get_wbp (unsigned int ch)
{
- /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
- file. In 3.0.1 it was identical to the union of the general categories
- "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
- PropList.txt file, so we take the latter definition. */
- return (unicode_attributes[ch].name != NULL
- && unicode_attributes[ch].category[0] == 'M'
- && (unicode_attributes[ch].category[1] == 'n'
- || unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'e'));
-}
+ int attr = 0;
-static bool
-is_combining_level3 (unsigned int ch)
-{
- return is_combining (ch)
- && !(unicode_attributes[ch].combining[0] != '\0'
- && unicode_attributes[ch].combining[0] != '0'
- && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
-}
+ if (unicode_attributes[ch].name != NULL)
+ {
+ if (ch == 0x000D)
+ attr |= 1 << WBP_CR;
-/* Return the UCS symbol string for a Unicode character. */
-static const char *
-ucs_symbol (unsigned int i)
-{
- static char buf[11+1];
+ if (ch == 0x000A)
+ attr |= 1 << WBP_LF;
- sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
- return buf;
-}
+ if (ch == 0x000B || ch == 0x000C
+ || ch == 0x0085
+ || ch == 0x2028 || ch == 0x2029)
+ attr |= 1 << WBP_NEWLINE;
-/* Return the UCS symbol range string for a Unicode characters interval. */
-static const char *
-ucs_symbol_range (unsigned int low, unsigned int high)
-{
- static char buf[24+1];
+ if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
+ || (unicode_attributes[ch].category != NULL
+ && strcmp (unicode_attributes[ch].category, "Mc") == 0))
+ attr |= 1 << WBP_EXTEND;
- strcpy (buf, ucs_symbol (low));
- strcat (buf, "..");
- strcat (buf, ucs_symbol (high));
- return buf;
-}
+ if (unicode_attributes[ch].category != NULL
+ && strcmp (unicode_attributes[ch].category, "Cf") == 0
+ && ch != 0x200B && ch != 0x200C && ch != 0x200D)
+ attr |= 1 << WBP_FORMAT;
-/* Output a character class (= property) table. */
+ if ((unicode_scripts[ch] < numscripts
+ && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
+ || (ch >= 0x3031 && ch <= 0x3035)
+ || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
+ || ch == 0xFF70)
+ attr |= 1 << WBP_KATAKANA;
-static void
-output_charclass (FILE *stream, const char *classname,
- bool (*func) (unsigned int))
-{
- char table[0x110000];
- unsigned int i;
- bool need_semicolon;
- const int max_column = 75;
- int column;
+ if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
+ || ch == 0x05F3)
+ && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
+ && (attr & (1 << WBP_KATAKANA)) == 0
+ && ((get_lbp (ch) >> LBP_SA) & 1) == 0
+ && !(unicode_scripts[ch] < numscripts
+ && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
+ && (attr & (1 << WBP_EXTEND)) == 0)
+ attr |= 1 << WBP_ALETTER; /* relies on the KATAKANA and EXTEND bits computed above */
- for (i = 0; i < 0x110000; i++)
- table[i] = (int) func (i);
+ if (is_WBP_MIDNUMLET (ch))
+ attr |= 1 << WBP_MIDNUMLET;
- fprintf (stream, "%s ", classname);
- need_semicolon = false;
- column = 1000;
- for (i = 0; i < 0x110000; )
- {
- if (!table[i])
- i++;
- else
- {
- unsigned int low, high;
- char buf[25];
-
- low = i;
- do
- i++;
- while (i < 0x110000 && table[i]);
- high = i - 1;
-
- if (low == high)
- strcpy (buf, ucs_symbol (low));
- else
- strcpy (buf, ucs_symbol_range (low, high));
-
- if (need_semicolon)
- {
- fprintf (stream, ";");
- column++;
- }
-
- if (column + strlen (buf) > max_column)
- {
- fprintf (stream, "/\n ");
- column = 3;
- }
-
- fprintf (stream, "%s", buf);
- column += strlen (buf);
- need_semicolon = true;
- }
- }
- fprintf (stream, "\n");
-}
+ if (is_WBP_MIDLETTER (ch))
+ attr |= 1 << WBP_MIDLETTER;
-/* Output a character mapping table. */
+ if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
+ || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
+ || ch == 0xFF1B)
+ && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
+ attr |= 1 << WBP_MIDNUM;
-static void
-output_charmap (FILE *stream, const char *mapname,
- unsigned int (*func) (unsigned int))
-{
- char table[0x110000];
- unsigned int i;
- bool need_semicolon;
- const int max_column = 75;
- int column;
+ if (((get_lbp (ch) >> LBP_NU) & 1) != 0
+ && ch != 0x066C)
+ attr |= 1 << WBP_NUMERIC;
- for (i = 0; i < 0x110000; i++)
- table[i] = (func (i) != i);
+ if (unicode_attributes[ch].category != NULL
+ && strcmp (unicode_attributes[ch].category, "Pc") == 0)
+ attr |= 1 << WBP_EXTENDNUMLET;
+ }
- fprintf (stream, "%s ", mapname);
- need_semicolon = false;
- column = 1000;
- for (i = 0; i < 0x110000; i++)
- if (table[i])
- {
- char buf[25+1];
-
- strcpy (buf, "(");
- strcat (buf, ucs_symbol (i));
- strcat (buf, ",");
- strcat (buf, ucs_symbol (func (i)));
- strcat (buf, ")");
-
- if (need_semicolon)
- {
- fprintf (stream, ";");
- column++;
- }
-
- if (column + strlen (buf) > max_column)
- {
- fprintf (stream, "/\n ");
- column = 3;
- }
-
- fprintf (stream, "%s", buf);
- column += strlen (buf);
- need_semicolon = true;
- }
- fprintf (stream, "\n");
-}
+ if (attr == 0)
+ /* other */
+ attr |= 1 << WBP_OTHER;
-/* Output the width table. */
+ return attr;
+}
+/* Output the word break property in a human readable format.
+   One line is written per code point whose get_wbp() value is not just the
+   WBP_OTHER bit: the code point in hexadecimal, then the Word_Break value
+   name for every bit that is set. */
static void
-output_widthmap (FILE *stream)
+debug_output_wbp (FILE *stream)
{
-}
+ unsigned int i;
-/* Output the tables to the given file. */
+ for (i = 0; i < 0x110000; i++)
+ {
+ int attr = get_wbp (i);
+ if (attr != 1 << WBP_OTHER) /* skip code points that are only WBP_OTHER */
+ {
+ fprintf (stream, "0x%04X", i);
+ if (attr & (1 << WBP_CR))
+ fprintf (stream, " CR");
+ if (attr & (1 << WBP_LF))
+ fprintf (stream, " LF");
+ if (attr & (1 << WBP_NEWLINE))
+ fprintf (stream, " Newline");
+ if (attr & (1 << WBP_EXTEND))
+ fprintf (stream, " Extend");
+ if (attr & (1 << WBP_FORMAT))
+ fprintf (stream, " Format");
+ if (attr & (1 << WBP_KATAKANA))
+ fprintf (stream, " Katakana");
+ if (attr & (1 << WBP_ALETTER))
+ fprintf (stream, " ALetter");
+ if (attr & (1 << WBP_MIDNUMLET))
+ fprintf (stream, " MidNumLet");
+ if (attr & (1 << WBP_MIDLETTER))
+ fprintf (stream, " MidLetter");
+ if (attr & (1 << WBP_MIDNUM))
+ fprintf (stream, " MidNum");
+ if (attr & (1 << WBP_NUMERIC))
+ fprintf (stream, " Numeric");
+ if (attr & (1 << WBP_EXTENDNUMLET))
+ fprintf (stream, " ExtendNumLet");
+ fprintf (stream, "\n");
+ }
+ }
+}
+/* Writes the human readable dump produced by debug_output_wbp() to the
+ file FILENAME, which is opened for writing.
+ Exits with status 1 if the file cannot be opened, or if writing or
+ closing it fails. */
static void
-output_tables (const char *filename, const char *version)
+debug_output_wbrk_tables (const char *filename)
{
FILE *stream;
- unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
exit (1);
}
- fprintf (stream, "escape_char /\n");
- fprintf (stream, "comment_char %%\n");
- fprintf (stream, "\n");
- fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
- version);
- fprintf (stream, "\n");
-
- fprintf (stream, "LC_IDENTIFICATION\n");
- fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
- fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
- fprintf (stream, "address \"\"\n");
- fprintf (stream, "contact \"\"\n");
- fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
- fprintf (stream, "tel \"\"\n");
- fprintf (stream, "fax \"\"\n");
- fprintf (stream, "language \"\"\n");
- fprintf (stream, "territory \"Earth\"\n");
- fprintf (stream, "revision \"%s\"\n", version);
- {
- time_t now;
- char date[11];
- now = time (NULL);
- strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
- fprintf (stream, "date \"%s\"\n", date);
- }
- fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
- fprintf (stream, "END LC_IDENTIFICATION\n");
- fprintf (stream, "\n");
+ debug_output_wbp (stream);
- /* Verifications. */
- for (ch = 0; ch < 0x110000; ch++)
+ /* fclose is only attempted when no earlier write error was recorded. */
+ if (ferror (stream) || fclose (stream))
{
- /* toupper restriction: "Only characters specified for the keywords
- lower and upper shall be specified. */
- if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
- fprintf (stderr,
- "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
- ucs_symbol (ch), ch, to_upper (ch));
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
- /* tolower restriction: "Only characters specified for the keywords
- lower and upper shall be specified. */
- if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
- fprintf (stderr,
- "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
- ucs_symbol (ch), ch, to_lower (ch));
+/* The word break property from the WordBreakProperty.txt file.
+ Indexed by code point. Each element holds a single WBP_* value (not a
+ bit mask); fill_org_wbp() sets it to WBP_OTHER for every code point
+ that the file does not list. */
+int unicode_org_wbp[0x110000];
- /* alpha restriction: "Characters classified as either upper or lower
- shall automatically belong to this class. */
- if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
- fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
+/* Stores in unicode_org_wbp[] the word break property from the
+ WordBreakProperty.txt file.
+ Every entry is first reset to WBP_OTHER. Each line that does not start
+ with '#' must have the form "XXXX ; Name" or "XXXX..YYYY ; Name" (hex
+ code points); it assigns the WBP_* value for Name to that code point
+ or to every code point of the inclusive range.
+ Exits with status 1 if the file cannot be opened or read, on a parse
+ error, or on an unknown property name. Aborts if a range is inverted
+ or its upper end is 0x110000 or more. */
+static void
+fill_org_wbp (const char *wordbreakproperty_filename)
+{
+ unsigned int i;
+ FILE *stream;
- /* alpha restriction: "No character specified for the keywords cntrl,
- digit, punct or space shall be specified." */
- if (is_alpha (ch) && is_cntrl (ch))
- fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
- if (is_alpha (ch) && is_digit (ch))
- fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
- if (is_alpha (ch) && is_punct (ch))
- fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
- if (is_alpha (ch) && is_space (ch))
- fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
+ for (i = 0; i < 0x110000; i++)
+ unicode_org_wbp[i] = WBP_OTHER;
- /* space restriction: "No character specified for the keywords upper,
- lower, alpha, digit, graph or xdigit shall be specified."
- upper, lower, alpha already checked above. */
- if (is_space (ch) && is_digit (ch))
- fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
- if (is_space (ch) && is_graph (ch))
- fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
- if (is_space (ch) && is_xdigit (ch))
- fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
+ stream = fopen (wordbreakproperty_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
+ exit (1);
+ }
- /* cntrl restriction: "No character specified for the keywords upper,
- lower, alpha, digit, punct, graph, print or xdigit shall be
- specified." upper, lower, alpha already checked above. */
- if (is_cntrl (ch) && is_digit (ch))
- fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
- if (is_cntrl (ch) && is_punct (ch))
- fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
- if (is_cntrl (ch) && is_graph (ch))
- fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
- if (is_cntrl (ch) && is_print (ch))
- fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
- if (is_cntrl (ch) && is_xdigit (ch))
- fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
+ for (;;)
+ {
+ char buf[200+1];
+ unsigned int i1, i2;
+ char padding[200+1];
+ char propname[200+1];
+ int propvalue;
- /* punct restriction: "No character specified for the keywords upper,
- lower, alpha, digit, cntrl, xdigit or as the <space> character shall
- be specified." upper, lower, alpha, cntrl already checked above. */
- if (is_punct (ch) && is_digit (ch))
- fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
- if (is_punct (ch) && is_xdigit (ch))
- fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
- if (is_punct (ch) && (ch == 0x0020))
- fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
+ /* Read up to 200 characters of the next line. The trailing "\n" in
+ the format is a whitespace directive, so it also consumes the line
+ terminator and any whitespace that follows. The loop ends as soon
+ as nothing could be stored in buf. */
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
- /* graph restriction: "No character specified for the keyword cntrl
- shall be specified." Already checked above. */
+ /* Skip comment lines. */
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
- /* print restriction: "No character specified for the keyword cntrl
- shall be specified." Already checked above. */
+ /* Try the range form "XXXX..YYYY ; Name" first, then fall back to the
+ single code point form "XXXX ; Name". */
+ if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
+ {
+ if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
+ {
+ fprintf (stderr, "parse error in '%s'\n",
+ wordbreakproperty_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
+ /* Each PROP expands to an "if ... else" link, so the braced block
+ after #undef is the final else branch, reached only when propname
+ matches none of the listed names. */
+#define PROP(name,value) \
+ if (strcmp (propname, name) == 0) propvalue = value; else
+ PROP ("CR", WBP_CR)
+ PROP ("LF", WBP_LF)
+ PROP ("Newline", WBP_NEWLINE)
+ PROP ("Extend", WBP_EXTEND)
+ PROP ("Format", WBP_FORMAT)
+ PROP ("Katakana", WBP_KATAKANA)
+ PROP ("ALetter", WBP_ALETTER)
+ PROP ("MidNumLet", WBP_MIDNUMLET)
+ PROP ("MidLetter", WBP_MIDLETTER)
+ PROP ("MidNum", WBP_MIDNUM)
+ PROP ("Numeric", WBP_NUMERIC)
+ PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+#undef PROP
+ {
+ fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
+ wordbreakproperty_filename);
+ exit (1);
+ }
+ if (!(i1 <= i2 && i2 < 0x110000))
+ abort ();
- /* graph - print relation: differ only in the <space> character.
- How is this possible if there are more than one space character?!
- I think susv2/xbd/locale.html should speak of "space characters",
- not "space character". */
- if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
- fprintf (stderr,
- "%s is print but not graph|<space>\n", ucs_symbol (ch));
- if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
- fprintf (stderr,
- "%s is graph|<space> but not print\n", ucs_symbol (ch));
+ for (i = i1; i <= i2; i++)
+ unicode_org_wbp[i] = propvalue;
}
- fprintf (stream, "LC_CTYPE\n");
- output_charclass (stream, "upper", is_upper);
- output_charclass (stream, "lower", is_lower);
- output_charclass (stream, "alpha", is_alpha);
- output_charclass (stream, "digit", is_digit);
- output_charclass (stream, "outdigit", is_outdigit);
- output_charclass (stream, "blank", is_blank);
- output_charclass (stream, "space", is_space);
- output_charclass (stream, "cntrl", is_cntrl);
- output_charclass (stream, "punct", is_punct);
- output_charclass (stream, "xdigit", is_xdigit);
- output_charclass (stream, "graph", is_graph);
- output_charclass (stream, "print", is_print);
- output_charclass (stream, "class \"combining\";", is_combining);
- output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
- output_charmap (stream, "toupper", to_upper);
- output_charmap (stream, "tolower", to_lower);
- output_charmap (stream, "map \"totitle\";", to_title);
- output_widthmap (stream);
- fprintf (stream, "END LC_CTYPE\n");
-
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error writing to '%s'\n", filename);
+ fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
exit (1);
}
}
-#endif
-
-/* ========================================================================= */
-
-/* The width property from the EastAsianWidth.txt file.
- Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
-const char * unicode_width[0x110000];
-
-/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
- file. */
+/* Output the word break property taken from the WordBreakProperty.txt
+ file (as stored in unicode_org_wbp[]) in a human readable format.
+ For every code point below 0x110000 whose stored value is not
+ WBP_OTHER, writes one line to STREAM: "0x" and the code point in
+ hexadecimal, followed by the property name, or by " ??" if the stored
+ value is none of the known WBP_* values. */
+static void
+debug_output_org_wbp (FILE *stream)
+{
+ unsigned int i;
+
+ for (i = 0; i < 0x110000; i++)
+ {
+ int propvalue = unicode_org_wbp[i];
+ if (propvalue != WBP_OTHER)
+ {
+ fprintf (stream, "0x%04X", i);
+ /* Each PROP expands to an "if ... else" link, so the statement
+ right after #undef is the final else branch. */
+#define PROP(name,value) \
+ if (propvalue == value) fprintf (stream, " " name); else
+ PROP ("CR", WBP_CR)
+ PROP ("LF", WBP_LF)
+ PROP ("Newline", WBP_NEWLINE)
+ PROP ("Extend", WBP_EXTEND)
+ PROP ("Format", WBP_FORMAT)
+ PROP ("Katakana", WBP_KATAKANA)
+ PROP ("ALetter", WBP_ALETTER)
+ PROP ("MidNumLet", WBP_MIDNUMLET)
+ PROP ("MidLetter", WBP_MIDLETTER)
+ PROP ("MidNum", WBP_MIDNUM)
+ PROP ("Numeric", WBP_NUMERIC)
+ PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+#undef PROP
+ fprintf (stream, " ??");
+ fprintf (stream, "\n");
+ }
+ }
+}
+
+/* Writes the human readable dump produced by debug_output_org_wbp() to
+ the file FILENAME, which is opened for writing.
+ Exits with status 1 if the file cannot be opened, or if writing or
+ closing it fails. */
static void
-fill_width (const char *width_filename)
+debug_output_org_wbrk_tables (const char *filename)
{
- unsigned int i, j;
FILE *stream;
- char field0[FIELDLEN];
- char field1[FIELDLEN];
- char field2[FIELDLEN];
- int lineno = 0;
-
- for (i = 0; i < 0x110000; i++)
- unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
- stream = fopen (width_filename, "r");
+ stream = fopen (filename, "w");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", width_filename);
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
exit (1);
}
- for (;;)
- {
- int n;
- int c;
+ debug_output_org_wbp (stream);
- lineno++;
- c = getc (stream);
- if (c == EOF)
- break;
- if (c == '#')
- {
- do c = getc (stream); while (c != EOF && c != '\n');
- continue;
- }
- ungetc (c, stream);
- n = getfield (stream, field0, ';');
- n += getfield (stream, field1, ' ');
- n += getfield (stream, field2, '\n');
- if (n == 0)
- break;
- if (n != 3)
- {
- fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
- exit (1);
- }
- i = strtoul (field0, NULL, 16);
- if (strstr (field0, "..") != NULL)
- {
- /* Deal with a range. */
- j = strtoul (strstr (field0, "..") + 2, NULL, 16);
- for (; i <= j; i++)
- unicode_width[i] = strdup (field1);
- }
- else
- {
- /* Single character line. */
- unicode_width[i] = strdup (field1);
- }
- }
+ /* fclose is only attempted when no earlier write error was recorded. */
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", width_filename);
+ fprintf (stderr, "error writing to '%s'\n", filename);
exit (1);
}
}
-/* Line breaking classification. */
+/* Construction of sparse 3-level tables.
+ The macros below parameterize the code included from "3level.h".
+ output_wbp() then uses the resulting struct wbp_table together with
+ wbp_table_init, wbp_table_add and wbp_table_finalize.
+ NOTE(review): 3level.h is not visible here; presumably TABLE supplies
+ the type name and function prefix, ELEMENT the type of a stored value
+ and DEFAULT the value of entries that were never added -- confirm. */
+#define TABLE wbp_table
+#define ELEMENT unsigned char
+#define DEFAULT WBP_OTHER
+/* Map the xmalloc/xrealloc names to plain malloc/realloc (presumably the
+ allocation functions that 3level.h calls -- confirm). */
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
-enum
+static void
+output_wbp (FILE *stream)
{
- /* Values >= 24 are resolved at run time. */
- LBP_BK = 24, /* mandatory break */
-/*LBP_CR, carriage return - not used here because it's a DOSism */
-/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 25, /* attached characters and combining marks */
-/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
-/*LBP_SG, surrogates - not used here because they are not characters */
- LBP_WJ = 0, /* word joiner */
- LBP_ZW = 26, /* zero width space */
- LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 27, /* space */
- LBP_B2 = 2, /* break opportunity before and after */
- LBP_BA = 3, /* break opportunity after */
- LBP_BB = 4, /* break opportunity before */
- LBP_HY = 5, /* hyphen */
- LBP_CB = 28, /* contingent break opportunity */
- LBP_CL = 6, /* closing punctuation */
- LBP_EX = 7, /* exclamation/interrogation */
- LBP_IN = 8, /* inseparable */
- LBP_NS = 9, /* non starter */
- LBP_OP = 10, /* opening punctuation */
- LBP_QU = 11, /* ambiguous quotation */
- LBP_IS = 12, /* infix separator (numeric) */
- LBP_NU = 13, /* numeric */
- LBP_PO = 14, /* postfix (numeric) */
- LBP_PR = 15, /* prefix (numeric) */
- LBP_SY = 16, /* symbols allowing breaks */
- LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
- LBP_AL = 17, /* ordinary alphabetic and symbol characters */
- LBP_H2 = 18, /* Hangul LV syllable */
- LBP_H3 = 19, /* Hangul LVT syllable */
- LBP_ID = 20, /* ideographic */
- LBP_JL = 21, /* Hangul L Jamo */
- LBP_JV = 22, /* Hangul V Jamo */
- LBP_JT = 23, /* Hangul T Jamo */
- LBP_SA = 30, /* complex context (South East Asian) */
- LBP_XX = 31 /* unknown */
-};
+ unsigned int i;
+ struct wbp_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
-/* Returns the line breaking classification for ch, as a bit mask. */
-static int
-get_lbp (unsigned int ch)
-{
- int attr = 0;
+ t.p = 7;
+ t.q = 9;
+ wbp_table_init (&t);
- if (unicode_attributes[ch].name != NULL)
+ for (i = 0; i < 0x110000; i++)
{
- /* mandatory break */
- if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
- || ch == 0x000C /* form feed */
- || ch == 0x000B /* line tabulation */
- || ch == 0x2028 /* LINE SEPARATOR */
- || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
- attr |= 1 << LBP_BK;
-
- if (ch == 0x2060 /* WORD JOINER */
- || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
- attr |= 1 << LBP_WJ;
-
- /* zero width space */
- if (ch == 0x200B /* ZERO WIDTH SPACE */)
- attr |= 1 << LBP_ZW;
-
- /* non-breaking (glue) */
- if (ch == 0x00A0 /* NO-BREAK SPACE */
- || ch == 0x202F /* NARROW NO-BREAK SPACE */
- || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
- || ch == 0x034F /* COMBINING GRAPHEME JOINER */
- || ch == 0x2007 /* FIGURE SPACE */
- || ch == 0x2011 /* NON-BREAKING HYPHEN */
- || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
- || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
- || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
- || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
- attr |= 1 << LBP_GL;
-
- /* space */
- if (ch == 0x0020 /* SPACE */)
- attr |= 1 << LBP_SP;
-
- /* break opportunity before and after */
- if (ch == 0x2014 /* EM DASH */)
- attr |= 1 << LBP_B2;
-
- /* break opportunity after */
- if (ch == 0x1680 /* OGHAM SPACE MARK */
- || ch == 0x2000 /* EN QUAD */
- || ch == 0x2001 /* EM QUAD */
- || ch == 0x2002 /* EN SPACE */
- || ch == 0x2003 /* EM SPACE */
- || ch == 0x2004 /* THREE-PER-EM SPACE */
- || ch == 0x2005 /* FOUR-PER-EM SPACE */
- || ch == 0x2006 /* SIX-PER-EM SPACE */
- || ch == 0x2008 /* PUNCTUATION SPACE */
- || ch == 0x2009 /* THIN SPACE */
- || ch == 0x200A /* HAIR SPACE */
- || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
- || ch == 0x0009 /* tab */
- || ch == 0x00AD /* SOFT HYPHEN */
- || ch == 0x058A /* ARMENIAN HYPHEN */
- || ch == 0x2010 /* HYPHEN */
- || ch == 0x2012 /* FIGURE DASH */
- || ch == 0x2013 /* EN DASH */
- || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
- || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
- || ch == 0x1361 /* ETHIOPIC WORDSPACE */
- || ch == 0x17D8 /* KHMER SIGN BEYYAL */
- || ch == 0x17DA /* KHMER SIGN KOOMUUT */
- || ch == 0x2027 /* HYPHENATION POINT */
- || ch == 0x007C /* VERTICAL LINE */
- || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
- || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
- || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
- || ch == 0x2056 /* THREE DOT PUNCTUATION */
- || ch == 0x2058 /* FOUR DOT PUNCTUATION */
- || ch == 0x2059 /* FIVE DOT PUNCTUATION */
- || ch == 0x205A /* TWO DOT PUNCTUATION */
- || ch == 0x205B /* FOUR DOT MARK */
- || ch == 0x205D /* TRICOLON */
- || ch == 0x205E /* VERTICAL FOUR DOTS */
- || ch == 0x2E19 /* PALM BRANCH */
- || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
- || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
- || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
- || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
- || ch == 0x2E30 /* RING POINT */
- || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
- || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
- || ch == 0x10102 /* AEGEAN CHECK MARK */
- || ch == 0x1039F /* UGARITIC WORD DIVIDER */
- || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
- || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
- || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
- || ch == 0x0964 /* DEVANAGARI DANDA */
- || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
- || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
- || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
- || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
- || ch == 0x104B /* MYANMAR SIGN SECTION */
- || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
- || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
- || ch == 0x17D4 /* KHMER SIGN KHAN */
- || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
- || ch == 0x1B5E /* BALINESE CARIK SIKI */
- || ch == 0x1B5F /* BALINESE CARIK PAREREN */
- || ch == 0xA8CE /* SAURASHTRA DANDA */
- || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
- || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
- || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
- || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
- || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
- || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
- || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
- || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
- || ch == 0x0F85 /* TIBETAN MARK PALUTA */
- || ch == 0x0FBE /* TIBETAN KU RU KHA */
- || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
- || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
- || ch == 0x1804 /* MONGOLIAN COLON */
- || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
- || ch == 0x1B5A /* BALINESE PANTI */
- || ch == 0x1B5B /* BALINESE PAMADA */
- || ch == 0x1B5C /* BALINESE WINDU */
- || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
- || ch == 0x1B60 /* BALINESE PAMENENG */
- || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
- || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
- || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
- || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
- || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
- || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
- || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
- || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
- || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
- || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
- || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
- || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
- || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
- || ch == 0xA60D /* VAI COMMA */
- || ch == 0xA60F /* VAI QUESTION MARK */
- || ch == 0xA92E /* KAYAH LI SIGN CWI */
- || ch == 0xA92F /* KAYAH LI SIGN SHYA */
- || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
- || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
- || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
- || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
- || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
- || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
- /* Extra characters for compatibility with Unicode LineBreak.txt. */
- || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
- || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
- || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
- attr |= 1 << LBP_BA;
-
- /* break opportunity before */
- if (ch == 0x00B4 /* ACUTE ACCENT */
- || ch == 0x1FFD /* GREEK OXIA */
- || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
- || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
- || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
- || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
- || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
- || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
- || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
- || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
- || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
- || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
- || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
- || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
- || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
- || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
- || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
- || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
- || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
- attr |= 1 << LBP_BB;
-
- /* hyphen */
- if (ch == 0x002D /* HYPHEN-MINUS */)
- attr |= 1 << LBP_HY;
-
- /* contingent break opportunity */
- if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
- attr |= 1 << LBP_CB;
-
- /* closing punctuation */
- if ((unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 'e')
- || ch == 0x3001 /* IDEOGRAPHIC COMMA */
- || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
- || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
- || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
- || ch == 0xFE50 /* SMALL COMMA */
- || ch == 0xFE52 /* SMALL FULL STOP */
- || ch == 0xFF0C /* FULLWIDTH COMMA */
- || ch == 0xFF0E /* FULLWIDTH FULL STOP */
- || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
- || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
- attr |= 1 << LBP_CL;
-
- /* exclamation/interrogation */
- if (ch == 0x0021 /* EXCLAMATION MARK */
- || ch == 0x003F /* QUESTION MARK */
- || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
- || ch == 0x061B /* ARABIC SEMICOLON */
- || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
- || ch == 0x061F /* ARABIC QUESTION MARK */
- || ch == 0x06D4 /* ARABIC FULL STOP */
- || ch == 0x07F9 /* NKO EXCLAMATION MARK */
- || ch == 0x0F0D /* TIBETAN MARK SHAD */
- || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
- || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
- || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
- || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
- || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
- || ch == 0x1802 /* MONGOLIAN COMMA */
- || ch == 0x1803 /* MONGOLIAN FULL STOP */
- || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
- || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
- || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
- || ch == 0x1945 /* LIMBU QUESTION MARK */
- || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
- || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
- || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
- || ch == 0x2CFE /* COPTIC FULL STOP */
- || ch == 0x2E2E /* REVERSED QUESTION MARK */
-#if REVISION_22
- || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
-#endif
- || ch == 0xA60E /* VAI FULL STOP */
- || ch == 0xA876 /* PHAGS-PA MARK SHAD */
- || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
- || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
- || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
- || ch == 0xFE56 /* SMALL QUESTION MARK */
- || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
- || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
- || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
- attr |= 1 << LBP_EX;
-
- /* inseparable */
- if (ch == 0x2024 /* ONE DOT LEADER */
- || ch == 0x2025 /* TWO DOT LEADER */
- || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
- || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
- attr |= 1 << LBP_IN;
-
- /* non starter */
- if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
- || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
- || ch == 0x203D /* INTERROBANG */
- || ch == 0x2047 /* DOUBLE QUESTION MARK */
- || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
- || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
- || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
- || ch == 0x301C /* WAVE DASH */
- || ch == 0x303C /* MASU MARK */
- || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
- || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
- || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
- || ch == 0x309D /* HIRAGANA ITERATION MARK */
- || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
- || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
- || ch == 0x30FB /* KATAKANA MIDDLE DOT */
- || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
- || ch == 0x30FD /* KATAKANA ITERATION MARK */
- || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
- || ch == 0xA015 /* YI SYLLABLE WU */
- || ch == 0xFE54 /* SMALL SEMICOLON */
- || ch == 0xFE55 /* SMALL COLON */
- || ch == 0xFF1A /* FULLWIDTH COLON */
- || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
- || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
- || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
- || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
- || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
- || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
- || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
- attr |= 1 << LBP_NS;
-
- /* opening punctuation */
- if ((unicode_attributes[ch].category[0] == 'P'
- && unicode_attributes[ch].category[1] == 's')
- || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
- || ch == 0x00BF /* INVERTED QUESTION MARK */
- || ch == 0x2E18 /* INVERTED INTERROBANG */)
- attr |= 1 << LBP_OP;
-
- /* ambiguous quotation */
- if ((unicode_attributes[ch].category[0] == 'P'
- && (unicode_attributes[ch].category[1] == 'f'
- || unicode_attributes[ch].category[1] == 'i'))
- || ch == 0x0022 /* QUOTATION MARK */
- || ch == 0x0027 /* APOSTROPHE */
- || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
- || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
- || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
- || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
- || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
- || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
- || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
- || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
- || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
- || ch == 0x2E0B /* RAISED SQUARE */)
- attr |= 1 << LBP_QU;
-
- /* infix separator (numeric) */
- if (ch == 0x002C /* COMMA */
- || ch == 0x002E /* FULL STOP */
- || ch == 0x003A /* COLON */
- || ch == 0x003B /* SEMICOLON */
- || ch == 0x037E /* GREEK QUESTION MARK */
- || ch == 0x0589 /* ARMENIAN FULL STOP */
- || ch == 0x060C /* ARABIC COMMA */
- || ch == 0x060D /* ARABIC DATE SEPARATOR */
- || ch == 0x07F8 /* NKO COMMA */
- || ch == 0x2044 /* FRACTION SLASH */
- || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
- || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
- || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
- attr |= 1 << LBP_IS;
-
- /* numeric */
- if ((unicode_attributes[ch].category[0] == 'N'
- && unicode_attributes[ch].category[1] == 'd'
- && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
- || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
- || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
- attr |= 1 << LBP_NU;
+ int attr = get_wbp (i);
- /* postfix (numeric) */
- if (ch == 0x0025 /* PERCENT SIGN */
- || ch == 0x00A2 /* CENT SIGN */
- || ch == 0x00B0 /* DEGREE SIGN */
- || ch == 0x060B /* AFGHANI SIGN */
- || ch == 0x066A /* ARABIC PERCENT SIGN */
- || ch == 0x2030 /* PER MILLE SIGN */
- || ch == 0x2031 /* PER TEN THOUSAND SIGN */
- || ch == 0x2032 /* PRIME */
- || ch == 0x2033 /* DOUBLE PRIME */
- || ch == 0x2034 /* TRIPLE PRIME */
- || ch == 0x2035 /* REVERSED PRIME */
- || ch == 0x2036 /* REVERSED DOUBLE PRIME */
- || ch == 0x2037 /* REVERSED TRIPLE PRIME */
- || ch == 0x20A7 /* PESETA SIGN */
- || ch == 0x2103 /* DEGREE CELSIUS */
- || ch == 0x2109 /* DEGREE FAHRENHEIT */
- || ch == 0xFDFC /* RIAL SIGN */
- || ch == 0xFE6A /* SMALL PERCENT SIGN */
- || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
- || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */
- /* Extra characters for compatibility with Unicode LineBreak.txt. */
- || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
- || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
- || ch == 0x0D79 /* MALAYALAM DATE MARK */)
- attr |= 1 << LBP_PO;
+ /* Now attr should contain exactly one bit. */
+ if (attr == 0 || ((attr & (attr - 1)) != 0))
+ abort ();
- /* prefix (numeric) */
- if ((unicode_attributes[ch].category[0] == 'S'
- && unicode_attributes[ch].category[1] == 'c')
- || ch == 0x002B /* PLUS SIGN */
- || ch == 0x005C /* REVERSE SOLIDUS */
- || ch == 0x00B1 /* PLUS-MINUS SIGN */
- || ch == 0x2116 /* NUMERO SIGN */
- || ch == 0x2212 /* MINUS SIGN */
- || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
- if (!(attr & (1 << LBP_PO)))
- attr |= 1 << LBP_PR;
+ if (attr != 1 << WBP_OTHER)
+ {
+ unsigned int log2_attr;
+ for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
- /* symbols allowing breaks */
- if (ch == 0x002F /* SOLIDUS */)
- attr |= 1 << LBP_SY;
+ wbp_table_add (&t, i, log2_attr);
+ }
+ }
- if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
- attr |= 1 << LBP_H2;
+ wbp_table_finalize (&t);
- if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
- attr |= 1 << LBP_H3;
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
- if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
- attr |= 1 << LBP_JL;
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "\n");
+ fprintf (stream, "typedef struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "wbrkprop_t;\n");
+ fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (unsigned char));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 4)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
+ const char *value_string;
+ switch (value)
+ {
+#define CASE(x) case x: value_string = #x; break;
+ CASE(WBP_OTHER);
+ CASE(WBP_CR);
+ CASE(WBP_LF);
+ CASE(WBP_NEWLINE);
+ CASE(WBP_EXTEND);
+ CASE(WBP_FORMAT);
+ CASE(WBP_KATAKANA);
+ CASE(WBP_ALETTER);
+ CASE(WBP_MIDNUMLET);
+ CASE(WBP_MIDLETTER);
+ CASE(WBP_MIDNUM);
+ CASE(WBP_NUMERIC);
+ CASE(WBP_EXTENDNUMLET);
+#undef CASE
+ default:
+ abort ();
+ }
+ if (i > 0 && (i % 4) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " %s%s", value_string,
+ (i+1 < t.level3_size << t.p ? "," : ""));
+ }
+ if (t.level3_size << t.p > 4)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+}
- if (ch >= 0x1160 && ch <= 0x11A2)
- attr |= 1 << LBP_JV;
+static void
+output_wbrk_tables (const char *filename, const char *version)
+{
+ FILE *stream;
- if (ch >= 0x11A8 && ch <= 0x11F9)
- attr |= 1 << LBP_JT;
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
- /* complex context (South East Asian) */
- if (((unicode_attributes[ch].category[0] == 'C'
- && unicode_attributes[ch].category[1] == 'f')
- || (unicode_attributes[ch].category[0] == 'L'
- && (unicode_attributes[ch].category[1] == 'm'
- || unicode_attributes[ch].category[1] == 'o'))
- || (unicode_attributes[ch].category[0] == 'M'
- && (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'n'))
- /* Extra characters for compatibility with Unicode LineBreak.txt. */
- || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
- || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
- || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
- || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
- && ((ch >= 0x0E00 && ch <= 0x0EFF)
- || (ch >= 0x1000 && ch <= 0x109F)
- || (ch >= 0x1780 && ch <= 0x17FF)
- || (ch >= 0x1950 && ch <= 0x19DF)))
- attr |= 1 << LBP_SA;
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+ fprintf (stream, "\n");
- /* attached characters and combining marks */
- if ((unicode_attributes[ch].category[0] == 'M'
- && (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'e'
- || unicode_attributes[ch].category[1] == 'n'))
- || (unicode_attributes[ch].category[0] == 'C'
- && (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'f')))
- if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
- attr |= 1 << LBP_CM;
+ /* Put a GPL header on it. The gnulib module is under LGPL (although it
+ still carries the GPL header), and it's gnulib-tool which replaces the
+ GPL header with an LGPL header. */
+ fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
- /* ideographic */
- if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
- || ch == 0x3000 /* IDEOGRAPHIC SPACE */
- || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
- || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
- || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
- || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
- || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
- || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
- || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
- || ch == 0xFE62 /* SMALL PLUS SIGN */
- || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
- || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
- || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
- || ch == 0xFE66 /* SMALL EQUALS SIGN */
- || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
- || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
- || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
- || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
- || (ch >= 0x3000 && ch <= 0x33FF
- && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
- /* Extra characters for compatibility with Unicode LineBreak.txt. */
- || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
- || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
- || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
- || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
- || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
- || ch == 0xFE45 /* SESAME DOT */
- || ch == 0xFE46 /* WHITE SESAME DOT */
- || ch == 0xFE49 /* DASHED OVERLINE */
- || ch == 0xFE4A /* CENTRELINE OVERLINE */
- || ch == 0xFE4B /* WAVY OVERLINE */
- || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
- || ch == 0xFE4D /* DASHED LOW LINE */
- || ch == 0xFE4E /* CENTRELINE LOW LINE */
- || ch == 0xFE4F /* WAVY LOW LINE */
- || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
- || ch == 0xFE58 /* SMALL EM DASH */
- || ch == 0xFE5F /* SMALL NUMBER SIGN */
- || ch == 0xFE60 /* SMALL AMPERSAND */
- || ch == 0xFE61 /* SMALL ASTERISK */
- || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
- || ch == 0xFE6B /* SMALL COMMERCIAL AT */
- || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
- || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
- || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
- || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
- || ch == 0xFF0A /* FULLWIDTH ASTERISK */
- || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
- || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
- || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
- || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
- || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
- || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
- || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
- || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
- || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
- || ch == 0xFF3F /* FULLWIDTH LOW LINE */
- || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
- || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
- || ch == 0xFF5E /* FULLWIDTH TILDE */
- || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
- || ch == 0xFFE3 /* FULLWIDTH MACRON */
- || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
- if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
- {
- /* ambiguous (ideograph) ? */
- if ((unicode_width[ch] != NULL
- && unicode_width[ch][0] == 'A'
- && ch >= 0x2000)
- || ch == 0x24EA /* CIRCLED DIGIT ZERO */
- || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
- attr |= 1 << LBP_AI;
- else
- attr |= 1 << LBP_ID;
- }
+ output_wbp (stream);
- /* ordinary alphabetic and symbol characters */
- if ((unicode_attributes[ch].category[0] == 'L'
- && (unicode_attributes[ch].category[1] == 'u'
- || unicode_attributes[ch].category[1] == 'l'
- || unicode_attributes[ch].category[1] == 't'
- || unicode_attributes[ch].category[1] == 'm'
- || unicode_attributes[ch].category[1] == 'o'))
- || (unicode_attributes[ch].category[0] == 'S'
- && (unicode_attributes[ch].category[1] == 'm'
- || unicode_attributes[ch].category[1] == 'k'
- || unicode_attributes[ch].category[1] == 'o'))
- || (unicode_attributes[ch].category[0] == 'N'
- && (unicode_attributes[ch].category[1] == 'l'
- || unicode_attributes[ch].category[1] == 'o'))
- || (unicode_attributes[ch].category[0] == 'P'
- && (unicode_attributes[ch].category[1] == 'c'
- || unicode_attributes[ch].category[1] == 'd'
- || unicode_attributes[ch].category[1] == 'o'))
- || ch == 0x0600 /* ARABIC NUMBER SIGN */
- || ch == 0x0601 /* ARABIC SIGN SANAH */
- || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
- || ch == 0x0603 /* ARABIC SIGN SAFHA */
- || ch == 0x06DD /* ARABIC END OF AYAH */
- || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
- || ch == 0x2061 /* FUNCTION APPLICATION */
- || ch == 0x2062 /* INVISIBLE TIMES */
- || ch == 0x2063 /* INVISIBLE SEPARATOR */
- || ch == 0x2064 /* INVISIBLE PLUS */)
- if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
- {
- /* ambiguous (alphabetic) ? */
- if ((unicode_width[ch] != NULL
- && unicode_width[ch][0] == 'A'
- && ch >= 0x2000
- /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
- && ch != 0x2022 /* BULLET */
- && ch != 0x203E /* OVERLINE */
- && ch != 0x2126 /* OHM SIGN */
- && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
- && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
- && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
- && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
- && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
- && ch != 0x21E7 /* UPWARDS WHITE ARROW */
- && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
- && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
-#if !REVISION_22
- || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
- || ch == 0x00A7 /* SECTION SIGN */
- || ch == 0x00A8 /* DIAERESIS */
- || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
- || ch == 0x00B2 /* SUPERSCRIPT TWO */
- || ch == 0x00B3 /* SUPERSCRIPT THREE */
- || ch == 0x00B6 /* PILCROW SIGN */
- || ch == 0x00B7 /* MIDDLE DOT */
- || ch == 0x00B8 /* CEDILLA */
- || ch == 0x00B9 /* SUPERSCRIPT ONE */
- || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
- || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
- || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
- || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
- || ch == 0x00BF /* INVERTED QUESTION MARK */
- || ch == 0x00D7 /* MULTIPLICATION SIGN */
- || ch == 0x00F7 /* DIVISION SIGN */
- || ch == 0x02C7 /* CARON */
- || ch == 0x02C9 /* MODIFIER LETTER MACRON */
- || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
- || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
- || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
- || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
- || ch == 0x02D8 /* BREVE */
- || ch == 0x02D9 /* DOT ABOVE */
- || ch == 0x02DA /* RING ABOVE */
- || ch == 0x02DB /* OGONEK */
- || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
-#endif
- || ch == 0x24EA /* CIRCLED DIGIT ZERO */
- || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
- /* Extra characters for compatibility with Unicode LineBreak.txt. */
- || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
- || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
- || ch == 0x2616 /* WHITE SHOGI PIECE */
- || ch == 0x2617 /* BLACK SHOGI PIECE */)
- attr |= 1 << LBP_AI;
- else
- attr |= 1 << LBP_AL;
- attr &= ~(1 << LBP_CM);
- }
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
}
+}
- if (attr == 0)
- /* unknown */
- attr |= 1 << LBP_XX;
+/* ========================================================================= */
- return attr;
-}
+/* Grapheme break property.
+ Updated for Unicode TR #29 revision 17. */
-/* Output the line breaking properties in a human readable format. */
-static void
-debug_output_lbp (FILE *stream)
+/* Possible values of the Grapheme_Cluster_Break property.
+   fill_org_gbp maps the property names found in GraphemeBreakProperty.txt
+   to these constants; code points not listed there keep GBP_OTHER.
+   output_gbp_table packs two values into one byte (4 bits each), so every
+   value here must stay below 16. */
+enum
{
- unsigned int i;
+ GBP_OTHER = 0,
+ GBP_CR = 1,
+ GBP_LF = 2,
+ GBP_CONTROL = 3,
+ GBP_EXTEND = 4,
+ GBP_PREPEND = 5,
+ GBP_SPACINGMARK = 6,
+ GBP_L = 7,
+ GBP_V = 8,
+ GBP_T = 9,
+ GBP_LV = 10,
+ GBP_LVT = 11
+};
- for (i = 0; i < 0x110000; i++)
- {
- int attr = get_lbp (i);
- if (attr != 1 << LBP_XX)
- {
- fprintf (stream, "0x%04X", i);
-#define PRINT_BIT(attr,bit) \
- if (attr & (1 << bit)) fprintf (stream, " " #bit);
- PRINT_BIT(attr,LBP_BK);
- PRINT_BIT(attr,LBP_CM);
- PRINT_BIT(attr,LBP_WJ);
- PRINT_BIT(attr,LBP_ZW);
- PRINT_BIT(attr,LBP_GL);
- PRINT_BIT(attr,LBP_SP);
- PRINT_BIT(attr,LBP_B2);
- PRINT_BIT(attr,LBP_BA);
- PRINT_BIT(attr,LBP_BB);
- PRINT_BIT(attr,LBP_HY);
- PRINT_BIT(attr,LBP_CB);
- PRINT_BIT(attr,LBP_CL);
- PRINT_BIT(attr,LBP_EX);
- PRINT_BIT(attr,LBP_IN);
- PRINT_BIT(attr,LBP_NS);
- PRINT_BIT(attr,LBP_OP);
- PRINT_BIT(attr,LBP_QU);
- PRINT_BIT(attr,LBP_IS);
- PRINT_BIT(attr,LBP_NU);
- PRINT_BIT(attr,LBP_PO);
- PRINT_BIT(attr,LBP_PR);
- PRINT_BIT(attr,LBP_SY);
- PRINT_BIT(attr,LBP_AI);
- PRINT_BIT(attr,LBP_AL);
- PRINT_BIT(attr,LBP_H2);
- PRINT_BIT(attr,LBP_H3);
- PRINT_BIT(attr,LBP_ID);
- PRINT_BIT(attr,LBP_JL);
- PRINT_BIT(attr,LBP_JV);
- PRINT_BIT(attr,LBP_JT);
- PRINT_BIT(attr,LBP_SA);
- PRINT_BIT(attr,LBP_XX);
-#undef PRINT_BIT
- fprintf (stream, "\n");
- }
- }
-}
+/* Construction of sparse 3-level tables. */
+#define TABLE gbp_table
+#define ELEMENT unsigned char
+#define DEFAULT GBP_OTHER
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+/* The grapheme break property from the GraphemeBreakProperty.txt file. */
+int unicode_org_gbp[0x110000];
+/* Output the unit test data for the grapheme break property. */
static void
-debug_output_lbrk_tables (const char *filename)
+output_gbp_test (const char *filename)
{
FILE *stream;
+ bool need_comma;
+ unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
exit (1);
}
- debug_output_lbp (stream);
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
+ fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
+
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ int gbp = unicode_org_gbp[ch];
+ const char *gbp_string;
+
+ while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
+ ch++;
+
+ switch (gbp)
+ {
+#define CASE(x) case x: gbp_string = #x; break;
+ CASE (GBP_OTHER)
+ CASE (GBP_CR)
+ CASE (GBP_LF)
+ CASE (GBP_CONTROL)
+ CASE (GBP_EXTEND)
+ CASE (GBP_PREPEND)
+ CASE (GBP_SPACINGMARK)
+ CASE (GBP_L)
+ CASE (GBP_V)
+ CASE (GBP_T)
+ CASE (GBP_LV)
+ CASE (GBP_LVT)
+#undef CASE
+ default:
+ abort ();
+ }
+
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
+
+ need_comma = true;
+ }
+ fprintf (stream, "\n");
if (ferror (stream) || fclose (stream))
{
}
}
-/* The line breaking property from the LineBreak.txt file. */
-int unicode_org_lbp[0x110000];
-
-/* Stores in unicode_org_lbp[] the line breaking property from the
- LineBreak.txt file. */
+/* Output the per-character grapheme break property table. */
static void
-fill_org_lbp (const char *linebreak_filename)
+output_gbp_table (const char *filename, const char *version)
{
- unsigned int i, j;
FILE *stream;
- char field0[FIELDLEN];
- char field1[FIELDLEN];
- char field2[FIELDLEN];
- int lineno = 0;
-
- for (i = 0; i < 0x110000; i++)
- unicode_org_lbp[i] = LBP_XX;
+ unsigned int ch, i;
+ struct gbp_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
- stream = fopen (linebreak_filename, "r");
+ stream = fopen (filename, "w");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
exit (1);
}
- for (;;)
- {
- int n;
- int c;
- int value;
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
- lineno++;
- c = getc (stream);
- if (c == EOF)
- break;
- if (c == '#')
- {
- do c = getc (stream); while (c != EOF && c != '\n');
- continue;
- }
- ungetc (c, stream);
- n = getfield (stream, field0, ';');
- n += getfield (stream, field1, ' ');
- n += getfield (stream, field2, '\n');
- if (n == 0)
- break;
- if (n != 3)
- {
- fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
- lineno);
- exit (1);
- }
-#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
- if (false) {}
- TRY(LBP_BK)
- TRY(LBP_CM)
- TRY(LBP_WJ)
- TRY(LBP_ZW)
- TRY(LBP_GL)
- TRY(LBP_SP)
- TRY(LBP_B2)
- TRY(LBP_BA)
- TRY(LBP_BB)
- TRY(LBP_HY)
- TRY(LBP_CB)
- TRY(LBP_CL)
- TRY(LBP_EX)
- TRY(LBP_IN)
- TRY(LBP_NS)
- TRY(LBP_OP)
- TRY(LBP_QU)
- TRY(LBP_IS)
- TRY(LBP_NU)
- TRY(LBP_PO)
- TRY(LBP_PR)
- TRY(LBP_SY)
- TRY(LBP_AI)
- TRY(LBP_AL)
- TRY(LBP_H2)
- TRY(LBP_H3)
- TRY(LBP_ID)
- TRY(LBP_JL)
- TRY(LBP_JV)
- TRY(LBP_JT)
- TRY(LBP_SA)
- TRY(LBP_XX)
-#undef TRY
- else if (strcmp (field1, "LF") == 0) value = LBP_BK;
- else if (strcmp (field1, "CR") == 0) value = LBP_BK;
- else if (strcmp (field1, "NL") == 0) value = LBP_BK;
- else if (strcmp (field1, "SG") == 0) value = LBP_XX;
+ t.p = 7;
+ t.q = 9;
+ gbp_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
+ gbp_table_add (&t, ch, unicode_org_gbp[ch]);
+
+ gbp_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
+ t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "unigbrkprop =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
else
- {
- fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
- field1, linebreak_filename, lineno);
- exit (1);
- }
- i = strtoul (field0, NULL, 16);
- if (strstr (field0, "..") != NULL)
- {
- /* Deal with a range. */
- j = strtoul (strstr (field0, "..") + 2, NULL, 16);
- for (; i <= j; i++)
- unicode_org_lbp[i] = value;
- }
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
else
- {
- /* Single character line. */
- unicode_org_lbp[i] = value;
- }
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t) / 2);
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) / 2; i++)
+ {
+ unsigned char *p = (unsigned char *) (t.result + level3_offset);
+ unsigned char value0 = p[i * 2];
+ unsigned char value1 = p[i * 2 + 1];
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
+ (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
}
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
+ fprintf (stderr, "error writing to '%s'\n", filename);
exit (1);
}
}
-/* Output the line breaking properties in a human readable format. */
+/* Stores in unicode_org_gbp[] the grapheme breaking property from the
+ GraphemeBreakProperty.txt file. */
static void
-debug_output_org_lbp (FILE *stream)
+fill_org_gbp (const char *graphemebreakproperty_filename)
{
unsigned int i;
+ FILE *stream;
+ int lineno = 0;
for (i = 0; i < 0x110000; i++)
- {
- int attr = unicode_org_lbp[i];
- if (attr != LBP_XX)
- {
- fprintf (stream, "0x%04X", i);
-#define PRINT_BIT(attr,bit) \
- if (attr == bit) fprintf (stream, " " #bit);
- PRINT_BIT(attr,LBP_BK);
- PRINT_BIT(attr,LBP_CM);
- PRINT_BIT(attr,LBP_WJ);
- PRINT_BIT(attr,LBP_ZW);
- PRINT_BIT(attr,LBP_GL);
- PRINT_BIT(attr,LBP_SP);
- PRINT_BIT(attr,LBP_B2);
- PRINT_BIT(attr,LBP_BA);
- PRINT_BIT(attr,LBP_BB);
- PRINT_BIT(attr,LBP_HY);
- PRINT_BIT(attr,LBP_CB);
- PRINT_BIT(attr,LBP_CL);
- PRINT_BIT(attr,LBP_EX);
- PRINT_BIT(attr,LBP_IN);
- PRINT_BIT(attr,LBP_NS);
- PRINT_BIT(attr,LBP_OP);
- PRINT_BIT(attr,LBP_QU);
- PRINT_BIT(attr,LBP_IS);
- PRINT_BIT(attr,LBP_NU);
- PRINT_BIT(attr,LBP_PO);
- PRINT_BIT(attr,LBP_PR);
- PRINT_BIT(attr,LBP_SY);
- PRINT_BIT(attr,LBP_AI);
- PRINT_BIT(attr,LBP_AL);
- PRINT_BIT(attr,LBP_H2);
- PRINT_BIT(attr,LBP_H3);
- PRINT_BIT(attr,LBP_ID);
- PRINT_BIT(attr,LBP_JL);
- PRINT_BIT(attr,LBP_JV);
- PRINT_BIT(attr,LBP_JT);
- PRINT_BIT(attr,LBP_SA);
- PRINT_BIT(attr,LBP_XX);
-#undef PRINT_BIT
- fprintf (stream, "\n");
- }
- }
-}
-
-static void
-debug_output_org_lbrk_tables (const char *filename)
-{
- FILE *stream;
+ unicode_org_gbp[i] = GBP_OTHER;
- stream = fopen (filename, "w");
+ stream = fopen (graphemebreakproperty_filename, "r");
if (stream == NULL)
{
- fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ fprintf (stderr, "error during fopen of '%s'\n",
+ graphemebreakproperty_filename);
exit (1);
}
- debug_output_org_lbp (stream);
+ for (;;)
+ {
+ char buf[200+1];
+ unsigned int i1, i2;
+ char padding[200+1];
+ char propname[200+1];
+ int propvalue;
+
+ lineno++;
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
+
+ if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
+ {
+ if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
+ {
+ fprintf (stderr, "parse error in '%s'\n",
+ graphemebreakproperty_filename);
+ exit (1);
+ }
+ i2 = i1;
+ }
+#define PROP(name,value) \
+ if (strcmp (propname, name) == 0) propvalue = value; else
+ PROP ("CR", GBP_CR)
+ PROP ("LF", GBP_LF)
+ PROP ("Control", GBP_CONTROL)
+ PROP ("Extend", GBP_EXTEND)
+ PROP ("Prepend", GBP_PREPEND)
+ PROP ("SpacingMark", GBP_SPACINGMARK)
+ PROP ("L", GBP_L)
+ PROP ("V", GBP_V)
+ PROP ("T", GBP_T)
+ PROP ("LV", GBP_LV)
+ PROP ("LVT", GBP_LVT)
+#undef PROP
+ {
+ fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
+ graphemebreakproperty_filename, lineno);
+ exit (1);
+ }
+ if (!(i1 <= i2 && i2 < 0x110000))
+ abort ();
+
+ for (i = i1; i <= i2; i++)
+ unicode_org_gbp[i] = propvalue;
+ }
if (ferror (stream) || fclose (stream))
{
- fprintf (stderr, "error writing to '%s'\n", filename);
+ fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
exit (1);
}
}
+/* ========================================================================= */
+
+/* Composition and decomposition.
+ Updated for Unicode TR #15 revision 33. */
+
+/* Maximum number of characters into which a single Unicode character can be
+ decomposed. */
+#define MAX_DECOMP_LENGTH 18
+
+enum /* Decomposition types, as returned by get_decomposition.
+        UC_DECOMP_CANONICAL is used when the decomposition field carries no
+        <tag> prefix; every other constant corresponds to the tag shown in
+        its comment.  output_decomposition stores the type shifted left by
+        18 bits next to a continuation flag at bit 23, so the values must
+        fit in 5 bits. */
+{
+ UC_DECOMP_CANONICAL,/* Canonical decomposition. */
+ UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
+ UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
+ UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
+ UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
+ UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
+ UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
+ UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
+ UC_DECOMP_SUPER, /* <super> A superscript form. */
+ UC_DECOMP_SUB, /* <sub> A subscript form. */
+ UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
+ UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
+ UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
+ UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
+ UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
+ UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
+ UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
+};
+
+/* Return the decomposition for a Unicode character (ignoring Hangul Jamo
+   decompositions).
+   CH is the code point; its decomposition string is read from
+   unicode_attributes[CH].decomposition.  An optional leading "<tag>"
+   selects one of the compatibility decomposition types; without a tag the
+   type is UC_DECOMP_CANONICAL.  The rest of the string is a space-separated
+   list of hexadecimal code points.
+   On success, store the code points in DECOMPOSED and their count in
+   *LENGTHP, and return the UC_DECOMP_* type.  Return -1, leaving the
+   outputs untouched, if CH has no decomposition.
+   Exit with a diagnostic on an unknown tag.  Abort if the tag is not
+   terminated by '>' or if the list cannot be parsed completely into at most
+   MAX_DECOMP_LENGTH code points.  */
+static int
+get_decomposition (unsigned int ch,
+                   unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
+{
+  const char *decomposition = unicode_attributes[ch].decomposition;
+
+  if (decomposition != NULL && decomposition[0] != '\0')
+    {
+      int type = UC_DECOMP_CANONICAL;
+      unsigned int length;
+      char *endptr;
+
+      if (decomposition[0] == '<')
+        {
+          const char *rangle;
+          size_t typelen;
+
+          rangle = strchr (decomposition + 1, '>');
+          if (rangle == NULL)
+            abort ();
+          /* Length of the tag, including both angle brackets.  */
+          typelen = rangle + 1 - decomposition;
+#define TYPE(t1,t2) \
+          if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
+            type = t2; \
+          else
+          TYPE ("<font>", UC_DECOMP_FONT)
+          TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
+          TYPE ("<initial>", UC_DECOMP_INITIAL)
+          TYPE ("<medial>", UC_DECOMP_MEDIAL)
+          TYPE ("<final>", UC_DECOMP_FINAL)
+          TYPE ("<isolated>", UC_DECOMP_ISOLATED)
+          TYPE ("<circle>", UC_DECOMP_CIRCLE)
+          TYPE ("<super>", UC_DECOMP_SUPER)
+          TYPE ("<sub>", UC_DECOMP_SUB)
+          TYPE ("<vertical>", UC_DECOMP_VERTICAL)
+          TYPE ("<wide>", UC_DECOMP_WIDE)
+          TYPE ("<narrow>", UC_DECOMP_NARROW)
+          TYPE ("<small>", UC_DECOMP_SMALL)
+          TYPE ("<square>", UC_DECOMP_SQUARE)
+          TYPE ("<fraction>", UC_DECOMP_FRACTION)
+          TYPE ("<compat>", UC_DECOMP_COMPAT)
+            {
+              /* Use a precision ("%.*s"), not a field width ("%*s"), so that
+                 only the tag itself is printed rather than the whole rest
+                 of the decomposition string.  */
+              fprintf (stderr, "unknown decomposition type %.*s\n",
+                       (int) typelen, decomposition);
+              exit (1);
+            }
+#undef TYPE
+          /* Skip the tag and the single space that may follow it.  */
+          decomposition = rangle + 1;
+          if (decomposition[0] == ' ')
+            decomposition++;
+        }
+      for (length = 0; length < MAX_DECOMP_LENGTH; length++)
+        {
+          decomposed[length] = strtoul (decomposition, &endptr, 16);
+          /* No digits consumed: end of the list (or unparsable text).  */
+          if (endptr == decomposition)
+            break;
+          decomposition = endptr;
+          if (decomposition[0] == ' ')
+            decomposition++;
+        }
+      if (*decomposition != '\0')
+        /* MAX_DECOMP_LENGTH is too small.  */
+        abort ();
+
+      *lengthp = length;
+      return type;
+    }
+  else
+    return -1;
+}
+
/* Construction of sparse 3-level tables. */
-#define TABLE lbp_table
-#define ELEMENT unsigned char
-#define DEFAULT LBP_XX
+#define TABLE decomp_table
+#define ELEMENT uint16_t
+#define DEFAULT (uint16_t)(-1)
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
static void
-output_lbp (FILE *stream1, FILE *stream2)
+output_decomposition (FILE *stream1, FILE *stream2)
{
- unsigned int i;
- struct lbp_table t;
+ struct decomp_table t;
unsigned int level1_offset, level2_offset, level3_offset;
+ unsigned int offset;
+ unsigned int ch;
+ unsigned int i;
- t.p = 7;
- t.q = 9;
- lbp_table_init (&t);
-
- for (i = 0; i < 0x110000; i++)
- {
- int attr = get_lbp (i);
-
- /* Now attr should contain exactly one bit. */
- if (attr == 0 || ((attr & (attr - 1)) != 0))
- abort ();
-
- if (attr != 1 << LBP_XX)
- {
- unsigned int log2_attr;
- for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
+ t.p = 5;
+ t.q = 5;
+ decomp_table_init (&t);
- lbp_table_add (&t, i, log2_attr);
- }
- }
+ fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
+ fprintf (stream1, "\n");
+ fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
+ offset = 0;
- lbp_table_finalize (&t);
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type >= 0)
+ {
+ if (!(offset < (1 << 15)))
+ abort ();
+ decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
+
+ /* Produce 'length' entries of 3 bytes each. */
+ if (length == 0)
+ /* We would need a special representation of zero-length entries. */
+ abort ();
+ for (i = 0; i < length; i++)
+ {
+ if (offset > 0)
+ fprintf (stream2, ",");
+ if ((offset % 4) == 0)
+ fprintf (stream2, "\n ");
+ if (!(decomposed[i] < (1 << 18)))
+ abort ();
+ fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
+ (((i+1 < length ? (1 << 23) : 0)
+ | (i == 0 ? (type << 18) : 0)
+ | decomposed[i]) >> 16) & 0xff,
+ (decomposed[i] >> 8) & 0xff,
+ decomposed[i] & 0xff);
+ offset++;
+ }
+ }
+ }
+
+ fprintf (stream2, "\n};\n");
+ fprintf (stream2, "\n");
+
+ decomp_table_finalize (&t);
level1_offset =
5 * sizeof (uint32_t);
+ (t.level2_size << t.q) * sizeof (uint32_t);
for (i = 0; i < 5; i++)
- fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ fprintf (stream1, "#define decomp_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
fprintf (stream1, "\n");
fprintf (stream1, "typedef struct\n");
fprintf (stream1, " {\n");
fprintf (stream1, " int level1[%zu];\n", t.level1_size);
fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
fprintf (stream1, " }\n");
- fprintf (stream1, "lbrkprop_t;\n");
- fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
-
- fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
+ fprintf (stream1, "decomp_index_table_t;\n");
+ fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
+ fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
fprintf (stream2, "{\n");
fprintf (stream2, " {");
if (t.level1_size > 8)
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream2, "\n ");
+ fprintf (stream2, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream2, " %5d", -1);
+ fprintf (stream2, " %5d", -1);
else
- fprintf (stream2, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream2, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream2, ",");
+ fprintf (stream2, ",");
}
if (t.level1_size > 8)
fprintf (stream2, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream2, "\n ");
+ fprintf (stream2, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream2, " %5d", -1);
+ fprintf (stream2, " %5d", -1);
else
- fprintf (stream2, " %5zu",
- (offset - level3_offset) / sizeof (unsigned char));
+ fprintf (stream2, " %5zu",
+ (offset - level3_offset) / sizeof (uint16_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream2, ",");
+ fprintf (stream2, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream2, "\n ");
fprintf (stream2, "\n ");
for (i = 0; i < t.level3_size << t.p; i++)
{
- unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
- const char *value_string;
- switch (value)
- {
-#define CASE(x) case x: value_string = #x; break;
- CASE(LBP_BK);
- CASE(LBP_CM);
- CASE(LBP_WJ);
- CASE(LBP_ZW);
- CASE(LBP_GL);
- CASE(LBP_SP);
- CASE(LBP_B2);
- CASE(LBP_BA);
- CASE(LBP_BB);
- CASE(LBP_HY);
- CASE(LBP_CB);
- CASE(LBP_CL);
- CASE(LBP_EX);
- CASE(LBP_IN);
- CASE(LBP_NS);
- CASE(LBP_OP);
- CASE(LBP_QU);
- CASE(LBP_IS);
- CASE(LBP_NU);
- CASE(LBP_PO);
- CASE(LBP_PR);
- CASE(LBP_SY);
- CASE(LBP_AI);
- CASE(LBP_AL);
- CASE(LBP_H2);
- CASE(LBP_H3);
- CASE(LBP_ID);
- CASE(LBP_JL);
- CASE(LBP_JV);
- CASE(LBP_JT);
- CASE(LBP_SA);
- CASE(LBP_XX);
-#undef CASE
- default:
- abort ();
- }
+ uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
if (i > 0 && (i % 8) == 0)
- fprintf (stream2, "\n ");
- fprintf (stream2, " %s%s", value_string,
- (i+1 < t.level3_size << t.p ? "," : ""));
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
+ if (i+1 < t.level3_size << t.p)
+ fprintf (stream2, ",");
}
if (t.level3_size << t.p > 8)
fprintf (stream2, "\n ");
}
static void
-output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
+output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
const char *filenames[2];
FILE *streams[2];
{
streams[i] = fopen (filenames[i], "w");
if (streams[i] == NULL)
- {
- fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
- exit (1);
- }
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
+ exit (1);
+ }
}
for (i = 0; i < 2; i++)
FILE *stream = streams[i];
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
- version);
- fprintf (stream, "\n");
-
- /* Put a GPL header on it. The gnulib module is under LGPL (although it
- still carries the GPL header), and it's gnulib-tool which replaces the
- GPL header with an LGPL header. */
- fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
- fprintf (stream, "\n");
- fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
- fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
- fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
- fprintf (stream, " (at your option) any later version.\n");
- fprintf (stream, "\n");
- fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
- fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
- fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
- fprintf (stream, " GNU General Public License for more details.\n");
- fprintf (stream, "\n");
- fprintf (stream, " You should have received a copy of the GNU General Public License\n");
- fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "/* Decomposition of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
fprintf (stream, "\n");
}
- output_lbp (streams[0], streams[1]);
+ output_decomposition (streams[0], streams[1]);
for (i = 0; i < 2; i++)
{
if (ferror (streams[i]) || fclose (streams[i]))
- {
- fprintf (stderr, "error writing to '%s'\n", filenames[i]);
- exit (1);
- }
- }
-}
-
-/* ========================================================================= */
-
-/* Word break property. */
-
-/* Possible values of the Word_Break property. */
-enum
-{
- WBP_OTHER = 0,
- WBP_CR = 11,
- WBP_LF = 12,
- WBP_NEWLINE = 10,
- WBP_EXTEND = 8,
- WBP_FORMAT = 9,
- WBP_KATAKANA = 1,
- WBP_ALETTER = 2,
- WBP_MIDNUMLET = 3,
- WBP_MIDLETTER = 4,
- WBP_MIDNUM = 5,
- WBP_NUMERIC = 6,
- WBP_EXTENDNUMLET = 7
-};
-
-/* Returns the word breaking property for ch, as a bit mask. */
-static int
-get_wbp (unsigned int ch)
-{
- int attr = 0;
-
- if (unicode_attributes[ch].name != NULL)
- {
- if (ch == 0x000D)
- attr |= 1 << WBP_CR;
-
- if (ch == 0x000A)
- attr |= 1 << WBP_LF;
-
- if (ch == 0x000B || ch == 0x000C
- || ch == 0x0085
- || ch == 0x2028 || ch == 0x2029)
- attr |= 1 << WBP_NEWLINE;
-
- if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
- || (unicode_attributes[ch].category != NULL
- && strcmp (unicode_attributes[ch].category, "Mc") == 0))
- attr |= 1 << WBP_EXTEND;
-
- if (unicode_attributes[ch].category != NULL
- && strcmp (unicode_attributes[ch].category, "Cf") == 0
- && ch != 0x200C && ch != 0x200D)
- attr |= 1 << WBP_FORMAT;
-
- if ((unicode_scripts[ch] < numscripts
- && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
- || (ch >= 0x3031 && ch <= 0x3035)
- || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
- || ch == 0xFF70)
- attr |= 1 << WBP_KATAKANA;
-
- if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
- || ch == 0x05F3)
- && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
- && (attr & (1 << WBP_KATAKANA)) == 0
- && ((get_lbp (ch) >> LBP_SA) & 1) == 0
- && !(unicode_scripts[ch] < numscripts
- && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
- && (attr & (1 << WBP_EXTEND)) == 0)
- attr |= 1 << WBP_ALETTER;
-
- if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
- || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
- attr |= 1 << WBP_MIDNUMLET;
-
- if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
- || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
- attr |= 1 << WBP_MIDLETTER;
-
- if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
- || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
- || ch == 0xFF1B)
- && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
- attr |= 1 << WBP_MIDNUM;
-
- if (((get_lbp (ch) >> LBP_NU) & 1) != 0
- && ch != 0x066C)
- attr |= 1 << WBP_NUMERIC;
-
- if (unicode_attributes[ch].category != NULL
- && strcmp (unicode_attributes[ch].category, "Pc") == 0)
- attr |= 1 << WBP_EXTENDNUMLET;
+ {
+ fprintf (stderr, "error writing to '%s'\n", filenames[i]);
+ exit (1);
+ }
}
-
- if (attr == 0)
- /* other */
- attr |= 1 << WBP_OTHER;
-
- return attr;
}
-/* Output the word break property in a human readable format. */
-static void
-debug_output_wbp (FILE *stream)
-{
- unsigned int i;
-
- for (i = 0; i < 0x110000; i++)
- {
- int attr = get_wbp (i);
- if (attr != 1 << WBP_OTHER)
- {
- fprintf (stream, "0x%04X", i);
- if (attr & (1 << WBP_CR))
- fprintf (stream, " CR");
- if (attr & (1 << WBP_LF))
- fprintf (stream, " LF");
- if (attr & (1 << WBP_NEWLINE))
- fprintf (stream, " Newline");
- if (attr & (1 << WBP_EXTEND))
- fprintf (stream, " Extend");
- if (attr & (1 << WBP_FORMAT))
- fprintf (stream, " Format");
- if (attr & (1 << WBP_KATAKANA))
- fprintf (stream, " Katakana");
- if (attr & (1 << WBP_ALETTER))
- fprintf (stream, " ALetter");
- if (attr & (1 << WBP_MIDNUMLET))
- fprintf (stream, " MidNumLet");
- if (attr & (1 << WBP_MIDLETTER))
- fprintf (stream, " MidLetter");
- if (attr & (1 << WBP_MIDNUM))
- fprintf (stream, " MidNum");
- if (attr & (1 << WBP_NUMERIC))
- fprintf (stream, " Numeric");
- if (attr & (1 << WBP_EXTENDNUMLET))
- fprintf (stream, " ExtendNumLet");
- fprintf (stream, "\n");
- }
- }
-}
+/* The "excluded from composition" property from the CompositionExclusions.txt file.
+ Indexed by code point; fill_composition_exclusions resets every entry to 0
+ and then sets the entry to 1 for each code point listed in the file. */
+char unicode_composition_exclusions[0x110000];
static void
-debug_output_wbrk_tables (const char *filename)
+fill_composition_exclusions (const char *compositionexclusions_filename)
{
FILE *stream;
-
- stream = fopen (filename, "w");
- if (stream == NULL)
- {
- fprintf (stderr, "cannot open '%s' for writing\n", filename);
- exit (1);
- }
-
- debug_output_wbp (stream);
-
- if (ferror (stream) || fclose (stream))
- {
- fprintf (stderr, "error writing to '%s'\n", filename);
- exit (1);
- }
-}
-
-/* The word break property from the WordBreakProperty.txt file. */
-int unicode_org_wbp[0x110000];
-
-/* Stores in unicode_org_wbp[] the word break property from the
- WordBreakProperty.txt file. */
-static void
-fill_org_wbp (const char *wordbreakproperty_filename)
-{
unsigned int i;
- FILE *stream;
-
- for (i = 0; i < 0x110000; i++)
- unicode_org_wbp[i] = WBP_OTHER;
- stream = fopen (wordbreakproperty_filename, "r");
+ stream = fopen (compositionexclusions_filename, "r");
if (stream == NULL)
{
- fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
- exit (1);
- }
-
- for (;;)
- {
- char buf[200+1];
- unsigned int i1, i2;
- char padding[200+1];
- char propname[200+1];
- int propvalue;
-
- if (fscanf (stream, "%200[^\n]\n", buf) < 1)
- break;
-
- if (buf[0] == '\0' || buf[0] == '#')
- continue;
-
- if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
- {
- if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
- {
- fprintf (stderr, "parse error in '%s'\n",
- wordbreakproperty_filename);
- exit (1);
- }
- i2 = i1;
- }
-#define PROP(name,value) \
- if (strcmp (propname, name) == 0) propvalue = value; else
- PROP ("CR", WBP_CR)
- PROP ("LF", WBP_LF)
- PROP ("Newline", WBP_NEWLINE)
- PROP ("Extend", WBP_EXTEND)
- PROP ("Format", WBP_FORMAT)
- PROP ("Katakana", WBP_KATAKANA)
- PROP ("ALetter", WBP_ALETTER)
- PROP ("MidNumLet", WBP_MIDNUMLET)
- PROP ("MidLetter", WBP_MIDLETTER)
- PROP ("MidNum", WBP_MIDNUM)
- PROP ("Numeric", WBP_NUMERIC)
- PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
-#undef PROP
- {
- fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
- wordbreakproperty_filename);
- exit (1);
- }
- if (!(i1 <= i2 && i2 < 0x110000))
- abort ();
-
- for (i = i1; i <= i2; i++)
- unicode_org_wbp[i] = propvalue;
- }
-
- if (ferror (stream) || fclose (stream))
- {
- fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
+ fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
exit (1);
}
-}
-
-/* Output the word break property in a human readable format. */
-static void
-debug_output_org_wbp (FILE *stream)
-{
- unsigned int i;
for (i = 0; i < 0x110000; i++)
- {
- int propvalue = unicode_org_wbp[i];
- if (propvalue != WBP_OTHER)
- {
- fprintf (stream, "0x%04X", i);
-#define PROP(name,value) \
- if (propvalue == value) fprintf (stream, " " name); else
- PROP ("CR", WBP_CR)
- PROP ("LF", WBP_LF)
- PROP ("Newline", WBP_NEWLINE)
- PROP ("Extend", WBP_EXTEND)
- PROP ("Format", WBP_FORMAT)
- PROP ("Katakana", WBP_KATAKANA)
- PROP ("ALetter", WBP_ALETTER)
- PROP ("MidNumLet", WBP_MIDNUMLET)
- PROP ("MidLetter", WBP_MIDLETTER)
- PROP ("MidNum", WBP_MIDNUM)
- PROP ("Numeric", WBP_NUMERIC)
- PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
-#undef PROP
- fprintf (stream, " ??");
- fprintf (stream, "\n");
- }
- }
-}
-
-static void
-debug_output_org_wbrk_tables (const char *filename)
-{
- FILE *stream;
-
- stream = fopen (filename, "w");
- if (stream == NULL)
- {
- fprintf (stderr, "cannot open '%s' for writing\n", filename);
- exit (1);
- }
-
- debug_output_org_wbp (stream);
-
- if (ferror (stream) || fclose (stream))
- {
- fprintf (stderr, "error writing to '%s'\n", filename);
- exit (1);
- }
-}
-
-/* Construction of sparse 3-level tables. */
-#define TABLE wbp_table
-#define ELEMENT unsigned char
-#define DEFAULT WBP_OTHER
-#define xmalloc malloc
-#define xrealloc realloc
-#include "3level.h"
-
-static void
-output_wbp (FILE *stream)
-{
- unsigned int i;
- struct wbp_table t;
- unsigned int level1_offset, level2_offset, level3_offset;
-
- t.p = 7;
- t.q = 9;
- wbp_table_init (&t);
+ unicode_composition_exclusions[i] = 0;
- for (i = 0; i < 0x110000; i++)
+ for (;;)
{
- int attr = get_wbp (i);
+ char buf[200+1];
+ unsigned int i;
- /* Now attr should contain exactly one bit. */
- if (attr == 0 || ((attr & (attr - 1)) != 0))
- abort ();
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
- if (attr != 1 << WBP_OTHER)
- {
- unsigned int log2_attr;
- for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
- wbp_table_add (&t, i, log2_attr);
- }
+ if (sscanf (buf, "%X", &i) != 1)
+ {
+ fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
+ exit (1);
+ }
+ if (!(i < 0x110000))
+ abort ();
+
+ unicode_composition_exclusions[i] = 1;
}
- wbp_table_finalize (&t);
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
+ exit (1);
+ }
+}
- level1_offset =
- 5 * sizeof (uint32_t);
- level2_offset =
- 5 * sizeof (uint32_t)
- + t.level1_size * sizeof (uint32_t);
- level3_offset =
- 5 * sizeof (uint32_t)
- + t.level1_size * sizeof (uint32_t)
- + (t.level2_size << t.q) * sizeof (uint32_t);
+static void
+debug_output_composition_tables (const char *filename)
+{
+ FILE *stream;
+ unsigned int ch;
- for (i = 0; i < 5; i++)
- fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
- fprintf (stream, "\n");
- fprintf (stream, "typedef struct\n");
- fprintf (stream, " {\n");
- fprintf (stream, " int level1[%zu];\n", t.level1_size);
- fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
- fprintf (stream, " }\n");
- fprintf (stream, "wbrkprop_t;\n");
- fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
- fprintf (stream, "{\n");
- fprintf (stream, " {");
- if (t.level1_size > 8)
- fprintf (stream, "\n ");
- for (i = 0; i < t.level1_size; i++)
+ stream = fopen (filename, "w");
+ if (stream == NULL)
{
- uint32_t offset;
- if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- offset = ((uint32_t *) (t.result + level1_offset))[i];
- if (offset == 0)
- fprintf (stream, " %5d", -1);
- else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
- if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
}
- if (t.level1_size > 8)
- fprintf (stream, "\n ");
- fprintf (stream, " },\n");
- fprintf (stream, " {");
- if (t.level2_size << t.q > 8)
- fprintf (stream, "\n ");
- for (i = 0; i < t.level2_size << t.q; i++)
+
+ for (ch = 0; ch < 0x110000; ch++)
{
- uint32_t offset;
- if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
- offset = ((uint32_t *) (t.result + level2_offset))[i];
- if (offset == 0)
- fprintf (stream, " %5d", -1);
- else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (unsigned char));
- if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type == UC_DECOMP_CANONICAL
+ /* Consider only binary decompositions.
+ Exclude singleton decompositions. */
+ && length == 2)
+ {
+ unsigned int code1 = decomposed[0];
+ unsigned int code2 = decomposed[1];
+ unsigned int combined = ch;
+
+ /* Exclude decompositions where the first part is not a starter,
+ i.e. is not of canonical combining class 0. */
+ if (strcmp (unicode_attributes[code1].combining, "0") == 0
+ /* Exclude characters listed in CompositionExclusions.txt. */
+ && !unicode_composition_exclusions[combined])
+ {
+ /* The combined character must now also be a starter.
+ Verify this. */
+ if (strcmp (unicode_attributes[combined].combining, "0") != 0)
+ abort ();
+
+ fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
+ code1,
+ code2,
+ combined,
+ unicode_attributes[code2].combining);
+ }
+ }
}
- if (t.level2_size << t.q > 8)
- fprintf (stream, "\n ");
- fprintf (stream, " },\n");
- fprintf (stream, " {");
- if (t.level3_size << t.p > 4)
- fprintf (stream, "\n ");
- for (i = 0; i < t.level3_size << t.p; i++)
+
+ if (ferror (stream) || fclose (stream))
{
- unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
- const char *value_string;
- switch (value)
- {
-#define CASE(x) case x: value_string = #x; break;
- CASE(WBP_OTHER);
- CASE(WBP_CR);
- CASE(WBP_LF);
- CASE(WBP_NEWLINE);
- CASE(WBP_EXTEND);
- CASE(WBP_FORMAT);
- CASE(WBP_KATAKANA);
- CASE(WBP_ALETTER);
- CASE(WBP_MIDNUMLET);
- CASE(WBP_MIDLETTER);
- CASE(WBP_MIDNUM);
- CASE(WBP_NUMERIC);
- CASE(WBP_EXTENDNUMLET);
-#undef CASE
- default:
- abort ();
- }
- if (i > 0 && (i % 4) == 0)
- fprintf (stream, "\n ");
- fprintf (stream, " %s%s", value_string,
- (i+1 < t.level3_size << t.p ? "," : ""));
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
}
- if (t.level3_size << t.p > 4)
- fprintf (stream, "\n ");
- fprintf (stream, " }\n");
- fprintf (stream, "};\n");
}
static void
-output_wbrk_tables (const char *filename, const char *version)
+output_composition_tables (const char *filename, const char *version)
{
FILE *stream;
+ unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
fprintf (stream, "\n");
/* Put a GPL header on it. The gnulib module is under LGPL (although it
still carries the GPL header), and it's gnulib-tool which replaces the
GPL header with an LGPL header. */
- fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
+ fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
fprintf (stream, "\n");
fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
fprintf (stream, "\n");
- output_wbp (stream);
+ /* The composition table is a set of mappings (code1, code2) -> combined,
+ with 928 entries,
+ 367 values for code1 (from 0x003C to 0x30FD),
+ 54 values for code2 (from 0x0300 to 0x309A).
+ For a fixed code1, there are from 1 to 19 possible values for code2.
+ For a fixed code2, there are from 1 to 117 possible values for code1.
+ This is a very sparse matrix.
+
+ We want an O(1) hash lookup.
+
+ We could implement the hash lookup by mapping (code1, code2) to a linear
+ combination mul1*code1 + mul2*code2, which is then used as an index into
+ a 3-level table. But this leads to a table of size 37 KB.
+
+ We use gperf to implement the hash lookup, giving it the 928 sets of
+ 6 bytes (code1, code2; 3 bytes each) as input. gperf generates a hash table of size
+ 1527, which is quite good (60% filled). It requires an auxiliary table
+ lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
+
+ fprintf (stream, "struct composition_rule { char codes[6]; };\n");
+ fprintf (stream, "%%struct-type\n");
+ fprintf (stream, "%%language=ANSI-C\n");
+ fprintf (stream, "%%define slot-name codes\n");
+ fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
+ fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
+ fprintf (stream, "%%compare-lengths\n");
+ fprintf (stream, "%%compare-strncmp\n");
+ fprintf (stream, "%%readonly-tables\n");
+ fprintf (stream, "%%omit-struct-type\n");
+ fprintf (stream, "%%%%\n");
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type == UC_DECOMP_CANONICAL
+ /* Consider only binary decompositions.
+ Exclude singleton decompositions. */
+ && length == 2)
+ {
+ unsigned int code1 = decomposed[0];
+ unsigned int code2 = decomposed[1];
+ unsigned int combined = ch;
+
+ /* Exclude decompositions where the first part is not a starter,
+ i.e. is not of canonical combining class 0. */
+ if (strcmp (unicode_attributes[code1].combining, "0") == 0
+ /* Exclude characters listed in CompositionExclusions.txt. */
+ && !unicode_composition_exclusions[combined])
+ {
+ /* The combined character must now also be a starter.
+ Verify this. */
+ if (strcmp (unicode_attributes[combined].combining, "0") != 0)
+ abort ();
+
+ fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
+ (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
+ (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
+ combined);
+ }
+ }
+ }
if (ferror (stream) || fclose (stream))
{
static void
output_simple_mapping_test (const char *filename,
- const char *function_name,
- unsigned int (*func) (unsigned int),
- const char *version)
+ const char *function_name,
+ unsigned int (*func) (unsigned int),
+ const char *version)
{
FILE *stream;
bool need_comma;
fprintf (stream, " You should have received a copy of the GNU General Public License\n");
fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
fprintf (stream, "\n");
- fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
fprintf (stream, "\n");
fprintf (stream, "#include \"test-mapping-part1.h\"\n");
fprintf (stream, "\n");
unsigned int value = func (ch);
if (value != ch)
- {
- if (need_comma)
- fprintf (stream, ",\n");
- fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
- need_comma = true;
- }
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
+ need_comma = true;
+ }
}
if (need_comma)
fprintf (stream, "\n");
static void
output_simple_mapping (const char *filename,
- unsigned int (*func) (unsigned int),
- const char *version)
+ unsigned int (*func) (unsigned int),
+ const char *version)
{
FILE *stream;
unsigned int ch, i;
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
- fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
- version);
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
t.p = 7;
t.q = 9;
for (i = 0; i < 5; i++)
fprintf (stream, "#define mapping_header_%d %d\n", i,
- ((uint32_t *) t.result)[i]);
+ ((uint32_t *) t.result)[i]);
fprintf (stream, "static const\n");
fprintf (stream, "struct\n");
fprintf (stream, " {\n");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level2_offset) / sizeof (uint32_t));
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
{
uint32_t offset;
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
if (offset == 0)
- fprintf (stream, " %5d", -1);
+ fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (int32_t));
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (int32_t));
if (i+1 < t.level2_size << t.q)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
for (i = 0; i < t.level3_size << t.p; i++)
{
if (i > 0 && (i % 8) == 0)
- fprintf (stream, "\n ");
+ fprintf (stream, "\n ");
fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
if (i+1 < t.level3_size << t.p)
- fprintf (stream, ",");
+ fprintf (stream, ",");
}
if (t.level3_size << t.p > 8)
fprintf (stream, "\n ");
/* ========================================================================= */
+/* A special casing context, as scanned by fill_casing_rules from the
+ context word of a special casing rule.
+ A context is negated through x -> -x: a context word with a "Not_" prefix
+ is stored as the negative of the corresponding value. Since SCC_ALWAYS
+ is 0, it has no distinct negated form. */
+enum
+{
+ SCC_ALWAYS = 0, /* default when the rule carries no context word */
+ SCC_FINAL_SIGMA, /* context word "Final_Sigma" */
+ SCC_AFTER_SOFT_DOTTED, /* context word "After_Soft_Dotted" */
+ SCC_MORE_ABOVE, /* context word "More_Above" */
+ SCC_BEFORE_DOT, /* context word "Before_Dot" */
+ SCC_AFTER_I /* context word "After_I" */
+};
+
+/* A special casing rule.
+ One such record is built by fill_casing_rules for every non-comment line
+ of the special casing file. Each mapping holds up to 3 code points;
+ the scanner zero-fills the array first, so unused slots are 0. */
+struct special_casing_rule
+{
+ unsigned int code; /* first field of the line, parsed as hexadecimal */
+ unsigned int lower_mapping[3]; /* second field: lower mapping */
+ unsigned int title_mapping[3]; /* third field: title mapping */
+ unsigned int upper_mapping[3]; /* fourth field: upper mapping */
+ unsigned int casefold_mapping[3]; /* NOTE(review): not scanned from the line in the visible code; presumably filled elsewhere - confirm */
+ const char *language; /* malloc'ed 2-character language word, or NULL if the line has none */
+ int context; /* an SCC_* value, negated when the context word has a "Not_" prefix */
+};
+
+/* The special casing rules: a growable array of pointers to rules.
+ num_casing_rules is the number of entries in use and
+ allocated_casing_rules is the number of slots currently allocated;
+ add_casing_rule grows the array when the two are equal. */
+struct special_casing_rule **casing_rules;
+unsigned int num_casing_rules;
+unsigned int allocated_casing_rules;
+
+/* Appends new_rule to the casing_rules array, growing the array when it
+   is full.  Only the pointer is stored; the rule itself is not copied.
+   Exits with an error message if the array cannot be enlarged.  */
+static void
+add_casing_rule (struct special_casing_rule *new_rule)
+{
+  if (num_casing_rules == allocated_casing_rules)
+    {
+      /* Double the capacity, with a minimum of 16 slots.  */
+      unsigned int new_allocated = 2 * allocated_casing_rules;
+      struct special_casing_rule **new_rules;
+
+      if (new_allocated < 16)
+        new_allocated = 16;
+      /* Keep the old pointer until realloc is known to have succeeded, so
+         that a failure is reported instead of dereferencing NULL below.  */
+      new_rules =
+        (struct special_casing_rule **)
+        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
+      if (new_rules == NULL)
+        {
+          fprintf (stderr, "memory exhausted\n");
+          exit (1);
+        }
+      casing_rules = new_rules;
+      allocated_casing_rules = new_allocated;
+    }
+  casing_rules[num_casing_rules++] = new_rule;
+}
+
+/* Stores in casing_rules the special casing rules found in
+ specialcasing_filename. */
+static void
+fill_casing_rules (const char *specialcasing_filename)
+{
+ FILE *stream;
+
+ stream = fopen (specialcasing_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+
+ casing_rules = NULL;
+ num_casing_rules = 0;
+ allocated_casing_rules = 0;
+
+ for (;;)
+ {
+ char buf[200+1];
+ char *scanptr;
+ char *endptr;
+ int i;
+
+ unsigned int code;
+ unsigned int lower_mapping[3];
+ unsigned int title_mapping[3];
+ unsigned int upper_mapping[3];
+ char *language;
+ int context;
+
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
+
+ /* Scan code. */
+ scanptr = buf;
+ code = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan lower mapping. */
+ for (i = 0; i < 3; i++)
+ lower_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ lower_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan title mapping. */
+ for (i = 0; i < 3; i++)
+ title_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ title_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan upper mapping. */
+ for (i = 0; i < 3; i++)
+ upper_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ upper_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan language and context. */
+ language = NULL;
+ context = SCC_ALWAYS;
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr != '\0' && *scanptr != '#')
+ {
+ const char *word_begin = scanptr;
+ const char *word_end;
+
+ while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+ scanptr++;
+ word_end = scanptr;
+
+ while (*scanptr == ' ')
+ scanptr++;
+
+ if (word_end - word_begin == 2)
+ {
+ language = (char *) malloc ((word_end - word_begin) + 1);
+ memcpy (language, word_begin, 2);
+ language[word_end - word_begin] = '\0';
+ word_begin = word_end = NULL;
+
+ if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
+ {
+ word_begin = scanptr;
+ while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+ scanptr++;
+ word_end = scanptr;
+ }
+ }
+
+ if (word_end > word_begin)
+ {
+ bool negate = false;
+
+ if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
+ {
+ word_begin += 4;
+ negate = true;
+ }
+ if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
+ context = SCC_FINAL_SIGMA;
+ else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
+ context = SCC_AFTER_SOFT_DOTTED;
+ else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
+ context = SCC_MORE_ABOVE;
+ else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
+ context = SCC_BEFORE_DOT;
+ else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
+ context = SCC_AFTER_I;
+ else
+ {
+ fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ if (negate)
+ context = - context;
+ }
+
+ if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ }
+
+ /* Store the rule. */
+ {
+ struct special_casing_rule *new_rule =
+ (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+ new_rule->code = code;
+ new_rule->language = language;
+ new_rule->context = context;
+ memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
+ memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
+ memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
+
+ add_casing_rule (new_rule);
+ }
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+}
+
+/* A casefolding rule.
+   One such rule is created by fill_casefolding_rules for each retained
+   mapping line of the case folding file (twice for rules of type 'T').  */
+struct casefold_rule
+{
+ unsigned int code; /* the code point being folded, parsed as hexadecimal */
+ unsigned int mapping[3]; /* the folded sequence; unused trailing elements are 0 */
+ const char *language; /* NULL for a rule without language restriction, otherwise "tr" or "az" */
+};
+
+/* The casefolding rules.
+   casefolding_rules is a growable array of pointers to heap-allocated rules,
+   num_casefolding_rules is the number of elements in use, and
+   allocated_casefolding_rules is the number of elements the array currently
+   has room for.  All three are reset by fill_casefolding_rules.  */
+struct casefold_rule **casefolding_rules;
+unsigned int num_casefolding_rules;
+unsigned int allocated_casefolding_rules;
+
+/* Stores in casefolding_rules the case folding rules found in
+   casefolding_filename.
+   Empty lines and lines starting with '#' are skipped.  Every other line
+   must have the form
+     <code>; <type>; <mapping>; ...
+   where <code> is a hexadecimal code point, <type> is one of the letters
+   C, F, S, T, and <mapping> consists of at most three hexadecimal code
+   points separated by spaces.  Whatever follows the third ';' is ignored.
+   Rules of type 'S' are dropped.  A rule of type 'T' is stored twice, once
+   for each of the languages "tr" and "az"; a rule of type 'C' or 'F' is
+   stored once, with a NULL language.
+   Exits with status 1 if the file cannot be opened or read, if a line
+   cannot be parsed, or if memory allocation fails.  */
+static void
+fill_casefolding_rules (const char *casefolding_filename)
+{
+  FILE *stream;
+
+  stream = fopen (casefolding_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
+      exit (1);
+    }
+
+  casefolding_rules = NULL;
+  num_casefolding_rules = 0;
+  allocated_casefolding_rules = 0;
+
+  for (;;)
+    {
+      char buf[200+1];
+      char *scanptr;
+      char *endptr;
+      /* Unsigned, so that the comparison with languages_count below does not
+         mix signed and unsigned operands.  */
+      unsigned int i;
+
+      unsigned int code;
+      char type;
+      unsigned int mapping[3];
+
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+        break;
+
+      if (buf[0] == '\0' || buf[0] == '#')
+        continue;
+
+      /* Scan code.  */
+      scanptr = buf;
+      code = strtoul (scanptr, &endptr, 16);
+      if (endptr == scanptr)
+        {
+          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+          exit (1);
+        }
+      scanptr = endptr;
+      if (*scanptr != ';')
+        {
+          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+          exit (1);
+        }
+      scanptr++;
+
+      /* Scan type.  */
+      while (*scanptr == ' ')
+        scanptr++;
+
+      switch (*scanptr)
+        {
+        case 'C': case 'F': case 'S': case 'T':
+          type = *scanptr;
+          break;
+        default:
+          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+          exit (1);
+        }
+      scanptr++;
+      if (*scanptr != ';')
+        {
+          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+          exit (1);
+        }
+      scanptr++;
+
+      /* Scan casefold mapping.  Elements that are not present stay 0.  */
+      for (i = 0; i < 3; i++)
+        mapping[i] = 0;
+      for (i = 0; i < 3; i++)
+        {
+          while (*scanptr == ' ')
+            scanptr++;
+          if (*scanptr == ';')
+            break;
+          mapping[i] = strtoul (scanptr, &endptr, 16);
+          if (endptr == scanptr)
+            {
+              fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+              exit (1);
+            }
+          scanptr = endptr;
+        }
+      /* A fourth code point, or any other junk, ends up here.  */
+      if (*scanptr != ';')
+        {
+          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+          exit (1);
+        }
+      scanptr++;
+
+      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
+      if (type != 'S')
+        {
+          const char * const *languages;
+          unsigned int languages_count;
+
+          /* Type 'T' indicates that the rule is applicable to Turkish
+             languages only.  */
+          if (type == 'T')
+            {
+              static const char * const turkish_languages[] = { "tr", "az" };
+              languages = turkish_languages;
+              languages_count = 2;
+            }
+          else
+            {
+              static const char * const all_languages[] = { NULL };
+              languages = all_languages;
+              languages_count = 1;
+            }
+
+          for (i = 0; i < languages_count; i++)
+            {
+              /* Store a new rule.  */
+              struct casefold_rule *new_rule =
+                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
+              if (new_rule == NULL)
+                {
+                  fprintf (stderr, "out of memory\n");
+                  exit (1);
+                }
+              new_rule->code = code;
+              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
+              new_rule->language = languages[i];
+
+              if (num_casefolding_rules == allocated_casefolding_rules)
+                {
+                  /* Grow the array geometrically, starting at 16 elements.
+                     The globals are updated only after realloc succeeded.  */
+                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
+                  struct casefold_rule **new_rules;
+
+                  if (new_allocated < 16)
+                    new_allocated = 16;
+                  new_rules =
+                    (struct casefold_rule **)
+                    realloc (casefolding_rules,
+                             new_allocated * sizeof (struct casefold_rule *));
+                  if (new_rules == NULL)
+                    {
+                      fprintf (stderr, "out of memory\n");
+                      exit (1);
+                    }
+                  casefolding_rules = new_rules;
+                  allocated_casefolding_rules = new_allocated;
+                }
+              casefolding_rules[num_casefolding_rules++] = new_rule;
+            }
+        }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
+      exit (1);
+    }
+}
+
+/* Casefold mapping, when it maps to a single character.
+   Indexed by code point; filled by redistribute_casefolding_rules.  */
+unsigned int unicode_casefold[0x110000];
+
+/* Returns the single character casefold mapping of CH, as stored in
+   unicode_casefold[].
+   Aborts if CH is not a valid index into that table, rather than reading
+   beyond its end.  */
+static unsigned int
+to_casefold (unsigned int ch)
+{
+  if (!(ch < 0x110000))
+    abort ();
+  return unicode_casefold[ch];
+}
+
+/* Redistribute the casefolding_rules:
+   - Rules that map to a single character, language independently, are stored
+     in unicode_casefold.
+   - Other rules are merged into casing_rules.
+   Aborts if the code of a rule is not below 0x110000.  Exits with status 1
+   if memory for a new casing rule cannot be allocated.  */
+static void
+redistribute_casefolding_rules (void)
+{
+  unsigned int ch, i, j;
+
+  /* Fill unicode_casefold[], starting from the identity mapping.  */
+  for (ch = 0; ch < 0x110000; ch++)
+    unicode_casefold[ch] = ch;
+  for (i = 0; i < num_casefolding_rules; i++)
+    {
+      struct casefold_rule *cfrule = casefolding_rules[i];
+
+      if (cfrule->language == NULL && cfrule->mapping[1] == 0)
+        {
+          ch = cfrule->code;
+          if (!(ch < 0x110000))
+            abort ();
+          unicode_casefold[ch] = cfrule->mapping[0];
+        }
+    }
+
+  /* Extend the special casing rules by filling in their casefold_mapping[]
+     field.  */
+  for (j = 0; j < num_casing_rules; j++)
+    {
+      struct special_casing_rule *rule = casing_rules[j];
+      unsigned int k;
+
+      /* The code is used as a table index; reject values outside the
+         table, like for the casefolding rules above.  */
+      if (!(rule->code < 0x110000))
+        abort ();
+      rule->casefold_mapping[0] = to_casefold (rule->code);
+      for (k = 1; k < 3; k++)
+        rule->casefold_mapping[k] = 0;
+    }
+
+  /* Now merge the other casefolding rules into casing_rules.  */
+  for (i = 0; i < num_casefolding_rules; i++)
+    {
+      struct casefold_rule *cfrule = casefolding_rules[i];
+
+      if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
+        {
+          /* Find a rule that applies to the same code, same language, and it
+             has context SCC_ALWAYS.  At the same time, update all rules that
+             have the same code and same or more specific language.  */
+          struct special_casing_rule *found_rule = NULL;
+
+          /* These rules were not range checked by the first loop.  */
+          if (!(cfrule->code < 0x110000))
+            abort ();
+
+          for (j = 0; j < num_casing_rules; j++)
+            {
+              struct special_casing_rule *rule = casing_rules[j];
+
+              if (rule->code == cfrule->code
+                  && (cfrule->language == NULL
+                      || (rule->language != NULL
+                          && strcmp (rule->language, cfrule->language) == 0)))
+                {
+                  memcpy (rule->casefold_mapping, cfrule->mapping,
+                          sizeof (rule->casefold_mapping));
+
+                  if ((cfrule->language == NULL
+                       ? rule->language == NULL
+                       : rule->language != NULL
+                         && strcmp (rule->language, cfrule->language) == 0)
+                      && rule->context == SCC_ALWAYS)
+                    {
+                      /* Found it.  */
+                      found_rule = rule;
+                    }
+                }
+            }
+
+          if (found_rule == NULL)
+            {
+              /* Create a new rule.  */
+              struct special_casing_rule *new_rule =
+                (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+              if (new_rule == NULL)
+                {
+                  fprintf (stderr, "out of memory\n");
+                  exit (1);
+                }
+
+              /* Try to find a rule that applies to the same code, no language
+                 restriction, and with context SCC_ALWAYS.  Its lower, title
+                 and upper mappings serve as template for the new rule.  */
+              for (j = 0; j < num_casing_rules; j++)
+                {
+                  struct special_casing_rule *rule = casing_rules[j];
+
+                  if (rule->code == cfrule->code
+                      && rule->context == SCC_ALWAYS
+                      && rule->language == NULL)
+                    {
+                      /* Found it.  */
+                      found_rule = rule;
+                      break;
+                    }
+                }
+
+              new_rule->code = cfrule->code;
+              new_rule->language = cfrule->language;
+              new_rule->context = SCC_ALWAYS;
+              if (found_rule != NULL)
+                {
+                  memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
+                          sizeof (new_rule->lower_mapping));
+                  memcpy (new_rule->title_mapping, found_rule->title_mapping,
+                          sizeof (new_rule->title_mapping));
+                  memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
+                          sizeof (new_rule->upper_mapping));
+                }
+              else
+                {
+                  /* No template: fall back to the simple case mappings.  */
+                  unsigned int k;
+
+                  new_rule->lower_mapping[0] = to_lower (cfrule->code);
+                  for (k = 1; k < 3; k++)
+                    new_rule->lower_mapping[k] = 0;
+                  new_rule->title_mapping[0] = to_title (cfrule->code);
+                  for (k = 1; k < 3; k++)
+                    new_rule->title_mapping[k] = 0;
+                  new_rule->upper_mapping[0] = to_upper (cfrule->code);
+                  for (k = 1; k < 3; k++)
+                    new_rule->upper_mapping[k] = 0;
+                }
+              memcpy (new_rule->casefold_mapping, cfrule->mapping,
+                      sizeof (new_rule->casefold_mapping));
+
+              add_casing_rule (new_rule);
+            }
+        }
+    }
+}
+
+/* Comparison function for qsort on an array of pointers to
+   'struct special_casing_rule'.
+   Rules are ordered by ascending code.  Among rules with the same code, a
+   rule counts one point for having a language and one point for having a
+   context other than SCC_ALWAYS; rules with more points come first.  */
+static int
+compare_casing_rules (const void *a, const void *b)
+{
+  struct special_casing_rule *lhs = *(struct special_casing_rule **) a;
+  struct special_casing_rule *rhs = *(struct special_casing_rule **) b;
+  unsigned int lhs_code = lhs->code;
+  unsigned int rhs_code = rhs->code;
+  int lhs_specificity;
+  int rhs_specificity;
+
+  if (lhs_code != rhs_code)
+    return (lhs_code < rhs_code ? -1 : 1);
+
+  /* Sort the more specific rules before the more general ones.  */
+  lhs_specificity = 0;
+  if (lhs->language != NULL)
+    lhs_specificity++;
+  if (lhs->context != SCC_ALWAYS)
+    lhs_specificity++;
+
+  rhs_specificity = 0;
+  if (rhs->language != NULL)
+    rhs_specificity++;
+  if (rhs->context != SCC_ALWAYS)
+    rhs_specificity++;
+
+  return rhs_specificity - lhs_specificity;
+}
+
+/* Sorts casing_rules in place, in the order defined by
+   compare_casing_rules: 1. by code, 2. by specificity.  */
+static void
+sort_casing_rules (void)
+{
+  /* With fewer than two rules there is nothing to reorder.  */
+  if (!(num_casing_rules > 1))
+    return;
+
+  qsort (casing_rules,
+         num_casing_rules,
+         sizeof (struct special_casing_rule *),
+         compare_casing_rules);
+}
+
+/* Outputs the three elements of MAPPING to STREAM, separated by ", ".
+   KIND names the mapping ("upper", "lower", "title" or "casefold");
+   together with RULE_INDEX and CODE it is only used in the error message
+   that is printed, before exiting with status 1, when an element is not
+   below 0x10000.  */
+static void
+output_casing_rule_mapping (FILE *stream, const unsigned int *mapping,
+                            const char *kind,
+                            unsigned int rule_index, unsigned int code)
+{
+  unsigned int j;
+
+  for (j = 0; j < 3; j++)
+    {
+      if (j > 0)
+        fprintf (stream, ", ");
+      if (!(mapping[j] < 0x10000))
+        {
+          fprintf (stderr, "special rule #%u: %s mapping of code %u out of range\n",
+                   rule_index, kind, code);
+          exit (1);
+        }
+      if (mapping[j] != 0)
+        fprintf (stream, "0x%04X", mapping[j]);
+      else
+        fprintf (stream, " 0");
+    }
+}
+
+/* Output the special casing rules.
+   Writes to FILENAME a gperf input file: a declarations section that
+   mentions VERSION, followed by one keyword line per element of
+   casing_rules.  The key of a line consists of the high byte and the low
+   byte of the rule's code, followed by a counter that numbers the rules
+   with the same code in array order, starting at 0.
+   Exits with status 1 if the file cannot be written, or if a code or a
+   mapping element is not below 0x10000.  */
+static void
+output_casing_rules (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int i;
+  unsigned int minor;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+  fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
+  fprintf (stream, "%%struct-type\n");
+  fprintf (stream, "%%language=ANSI-C\n");
+  fprintf (stream, "%%define slot-name code\n");
+  fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
+  fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
+  fprintf (stream, "%%compare-lengths\n");
+  fprintf (stream, "%%compare-strncmp\n");
+  fprintf (stream, "%%readonly-tables\n");
+  fprintf (stream, "%%omit-struct-type\n");
+  fprintf (stream, "%%%%\n");
+
+  minor = 0;
+  for (i = 0; i < num_casing_rules; i++)
+    {
+      struct special_casing_rule *rule = casing_rules[i];
+      int context;
+
+      /* Number the rules that share a code.  */
+      if (i > 0 && rule->code == casing_rules[i - 1]->code)
+        minor += 1;
+      else
+        minor = 0;
+
+      if (!(rule->code < 0x10000))
+        {
+          fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
+          exit (1);
+        }
+
+      fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
+               (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
+
+      /* Flag: 1 if the next rule has the same code, 0 otherwise.  */
+      fprintf (stream, "%d, ",
+               i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
+
+      /* A negative context stands for the negated condition; it is output
+         as a minus sign followed by the name of the positive context.  */
+      context = rule->context;
+      if (context < 0)
+        {
+          fprintf (stream, "-");
+          context = - context;
+        }
+      else
+        fprintf (stream, " ");
+      switch (context)
+        {
+        case SCC_ALWAYS:
+          fprintf (stream, "SCC_ALWAYS ");
+          break;
+        case SCC_FINAL_SIGMA:
+          fprintf (stream, "SCC_FINAL_SIGMA ");
+          break;
+        case SCC_AFTER_SOFT_DOTTED:
+          fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
+          break;
+        case SCC_MORE_ABOVE:
+          fprintf (stream, "SCC_MORE_ABOVE ");
+          break;
+        case SCC_BEFORE_DOT:
+          fprintf (stream, "SCC_BEFORE_DOT ");
+          break;
+        case SCC_AFTER_I:
+          fprintf (stream, "SCC_AFTER_I ");
+          break;
+        default:
+          abort ();
+        }
+      fprintf (stream, ", ");
+
+      if (rule->language != NULL)
+        {
+          if (strlen (rule->language) != 2)
+            abort ();
+          fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
+        }
+      else
+        fprintf (stream, "{ '\\0', '\\0' }, ");
+
+      /* The four mappings, in the order upper, lower, title, casefold.  */
+      fprintf (stream, "{ ");
+      output_casing_rule_mapping (stream, rule->upper_mapping, "upper",
+                                  i, rule->code);
+      fprintf (stream, " }, { ");
+      output_casing_rule_mapping (stream, rule->lower_mapping, "lower",
+                                  i, rule->code);
+      fprintf (stream, " }, { ");
+      output_casing_rule_mapping (stream, rule->title_mapping, "title",
+                                  i, rule->code);
+      fprintf (stream, " }, { ");
+      output_casing_rule_mapping (stream, rule->casefold_mapping, "casefold",
+                                  i, rule->code);
+      fprintf (stream, " }\n");
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Returns true if CH is "cased".
+   Quoting the Unicode standard:
+     Definition: A character is defined to be "cased" if it has the Lowercase
+     or Uppercase property or has a General_Category value of
+     Titlecase_Letter.  */
+static bool
+is_cased (unsigned int ch)
+{
+  if (is_property_lowercase (ch))
+    return true;
+  if (is_property_uppercase (ch))
+    return true;
+  if (is_category_Lt (ch))
+    return true;
+  return false;
+}
+
+/* Returns true if CH is "case-ignorable" and not "cased".
+   Quoting the Unicode standard:
+     Definition: A character is defined to be "case-ignorable" if it has the
+     value MidLetter {or the value MidNumLet} for the Word_Break property or
+     its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
+     Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
+   The text marked in braces was added in Unicode 5.1.0, see
+   <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
+   Definition of case-ignorable".  */
+/* The "cased" characters are excluded here, because this predicate is only
+   used for the "Before C" and "After C" conditions of FINAL_SIGMA.
+   The exclusion simplifies the evaluation of the regular expressions
+     \p{cased} (\p{case-ignorable})* C
+   and
+     C (\p{case-ignorable})* \p{cased}
+ */
+static bool
+is_case_ignorable (unsigned int ch)
+{
+  bool ignorable_by_definition =
+    (unicode_org_wbp[ch] == WBP_MIDLETTER
+     || unicode_org_wbp[ch] == WBP_MIDNUMLET
+     || is_category_Mn (ch)
+     || is_category_Me (ch)
+     || is_category_Cf (ch)
+     || is_category_Lm (ch)
+     || is_category_Sk (ch));
+
+  if (!ignorable_by_definition)
+    return false;
+
+  /* Only now look at the "cased" exclusion.  */
+  return !is_cased (ch);
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* Output all case related properties.
+   For each of the predicates is_cased and is_case_ignorable, three output
+   functions are invoked: debug_output_predicate, output_predicate_test and
+   output_predicate.  VERSION is passed on to output_predicate.  */
+static void
+output_casing_properties (const char *version)
+{
+  /* Property "cased".  */
+  debug_output_predicate ("unicase/cased.txt", is_cased);
+  output_predicate_test ("../tests/unicase/test-cased.c", is_cased,
+                         "uc_is_cased (c)");
+  output_predicate ("unicase/cased.h", is_cased,
+                    "u_casing_property_cased", "Casing Properties", version);
+
+  /* Property "case_ignorable", stored in files named "ignorable".  */
+  debug_output_predicate ("unicase/ignorable.txt", is_case_ignorable);
+  output_predicate_test ("../tests/unicase/test-ignorable.c", is_case_ignorable,
+                         "uc_is_case_ignorable (c)");
+  output_predicate ("unicase/ignorable.h", is_case_ignorable,
+                    "u_casing_property_case_ignorable", "Casing Properties",
+                    version);
+}
+
+/* ========================================================================= */
+
int
main (int argc, char * argv[])
{
const char *unicodedata_filename;
const char *proplist_filename;
const char *derivedproplist_filename;
+ const char *arabicshaping_filename;
const char *scripts_filename;
const char *blocks_filename;
const char *proplist30_filename;
const char *eastasianwidth_filename;
const char *linebreak_filename;
const char *wordbreakproperty_filename;
+ const char *graphemebreakproperty_filename;
+ const char *compositionexclusions_filename;
+ const char *specialcasing_filename;
+ const char *casefolding_filename;
const char *version;
- if (argc != 11)
+ if (argc != 16)
{
- fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt version\n",
- argv[0]);
+ fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
+ argv[0]);
exit (1);
}
unicodedata_filename = argv[1];
proplist_filename = argv[2];
derivedproplist_filename = argv[3];
- scripts_filename = argv[4];
- blocks_filename = argv[5];
- proplist30_filename = argv[6];
- eastasianwidth_filename = argv[7];
- linebreak_filename = argv[8];
- wordbreakproperty_filename = argv[9];
- version = argv[10];
+ arabicshaping_filename = argv[4];
+ scripts_filename = argv[5];
+ blocks_filename = argv[6];
+ proplist30_filename = argv[7];
+ eastasianwidth_filename = argv[8];
+ linebreak_filename = argv[9];
+ wordbreakproperty_filename = argv[10];
+ graphemebreakproperty_filename = argv[11];
+ compositionexclusions_filename = argv[12];
+ specialcasing_filename = argv[13];
+ casefolding_filename = argv[14];
+ version = argv[15];
fill_attributes (unicodedata_filename);
clear_properties ();
fill_properties (proplist_filename);
fill_properties (derivedproplist_filename);
fill_properties30 (proplist30_filename);
+ fill_arabicshaping (arabicshaping_filename);
fill_scripts (scripts_filename);
fill_blocks (blocks_filename);
fill_width (eastasianwidth_filename);
fill_org_lbp (linebreak_filename);
fill_org_wbp (wordbreakproperty_filename);
+ fill_org_gbp (graphemebreakproperty_filename);
+ fill_composition_exclusions (compositionexclusions_filename);
+ fill_casing_rules (specialcasing_filename);
+ fill_casefolding_rules (casefolding_filename);
+ redistribute_casefolding_rules ();
+ sort_casing_rules ();
output_categories (version);
output_category ("unictype/categ_of.h", version);
- output_combclass ("unictype/combining.h", version);
+ output_combclass ("unictype/combiningclass.h", version);
output_bidi_category ("unictype/bidi_of.h", version);
output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
output_decimal_digit ("unictype/decdigit.h", version);
output_numeric ("unictype/numeric.h", version);
output_mirror ("unictype/mirror.h", version);
output_properties (version);
+ output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
+ output_joining_type ("unictype/joiningtype_of.h", version);
+ output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
+ output_joining_group ("unictype/joininggroup_of.h", version);
+
output_scripts (version);
output_scripts_byname (version);
output_blocks (version);
output_ident_properties (version);
+ output_nonspacing_property ("uniwidth/width.c.part");
+ output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
output_old_ctype (version);
debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
+ output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
+ output_gbp_table ("unigbrk/gbrkprop.h", version);
+
+ output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
+ debug_output_composition_tables ("uninorm/composition.txt");
+ output_composition_tables ("uninorm/composition-table.gperf", version);
+
output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
output_simple_mapping ("unicase/toupper.h", to_upper, version);
output_simple_mapping ("unicase/tolower.h", to_lower, version);
output_simple_mapping ("unicase/totitle.h", to_title, version);
+ output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
+ output_casing_rules ("unicase/special-casing-table.gperf", version);
+ output_casing_properties (version);
return 0;
}
* compile-command: "
gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
./gen-uni-tables \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
- /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
- 5.1.0
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
+ 6.0.0 \
+ && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
+ && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
"
* End:
*/