/* Generate Unicode conforming character classification tables and
- Line Break Properties tables from a UnicodeData file.
+ line break properties tables and word break property tables and
+ decomposition/composition and case mapping tables from a UnicodeData file.
Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
/usr/local/share/Unidata/EastAsianWidth.txt \
/usr/local/share/Unidata/LineBreak.txt \
/usr/local/share/Unidata/WordBreakProperty.txt \
+ /usr/local/share/Unidata/CompositionExclusions.txt \
+ /usr/local/share/Unidata/SpecialCasing.txt \
+ /usr/local/share/Unidata/CaseFolding.txt \
5.1.0
*/
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
+ fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
+ fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (int32_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level2_offset) / sizeof (uint32_t));
if (i+1 < t.level1_size)
fprintf (stream, ",");
if (offset == 0)
fprintf (stream, " %5d", -1);
else
- fprintf (stream, " %5zd",
+ fprintf (stream, " %5zu",
(offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
if (i > 0 && (i % 8) == 0)
fprintf (stream2, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
- fprintf (stream2, " %5zd%s",
- offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
- (i+1 < t.level1_size ? "," : ""));
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1);
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream2, ",");
}
if (t.level1_size > 8)
fprintf (stream2, "\n ");
if (i > 0 && (i % 8) == 0)
fprintf (stream2, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
- fprintf (stream2, " %5zd%s",
- offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
- (i+1 < t.level2_size << t.q ? "," : ""));
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1);
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level3_offset) / sizeof (unsigned char));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream2, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream2, "\n ");
if (i > 0 && (i % 8) == 0)
fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level1_offset))[i];
- fprintf (stream, " %5zd%s",
- offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
- (i+1 < t.level1_size ? "," : ""));
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
}
if (t.level1_size > 8)
fprintf (stream, "\n ");
if (i > 0 && (i % 8) == 0)
fprintf (stream, "\n ");
offset = ((uint32_t *) (t.result + level2_offset))[i];
- fprintf (stream, " %5zd%s",
- offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
- (i+1 < t.level2_size << t.q ? "," : ""));
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (unsigned char));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
}
if (t.level2_size << t.q > 8)
fprintf (stream, "\n ");
/* ========================================================================= */
-int
-main (int argc, char * argv[])
+/* Maximum number of characters into which a single Unicode character can be
+ decomposed. */
+#define MAX_DECOMP_LENGTH 18
+
+enum
{
- const char *unicodedata_filename;
- const char *proplist_filename;
- const char *derivedproplist_filename;
- const char *scripts_filename;
- const char *blocks_filename;
- const char *proplist30_filename;
- const char *eastasianwidth_filename;
- const char *linebreak_filename;
- const char *wordbreakproperty_filename;
- const char *version;
+ UC_DECOMP_CANONICAL,/* Canonical decomposition. */
+ UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
+ UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
+ UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
+ UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
+ UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
+ UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
+ UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
+ UC_DECOMP_SUPER, /* <super> A superscript form. */
+ UC_DECOMP_SUB, /* <sub> A subscript form. */
+ UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
+ UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
+ UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
+ UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
+ UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
+ UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
+ UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
+};
+
+/* Return the decomposition for a Unicode character (ignoring Hangul Jamo
+ decompositions). Return the type, or -1 for none. */
+static int
+get_decomposition (unsigned int ch,
+ unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
+{
+ const char *decomposition = unicode_attributes[ch].decomposition;
- if (argc != 11)
+ if (decomposition != NULL && decomposition[0] != '\0')
{
- fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt version\n",
- argv[0]);
+ int type = UC_DECOMP_CANONICAL;
+ unsigned int length;
+ char *endptr;
+
+ if (decomposition[0] == '<')
+ {
+ const char *rangle;
+ size_t typelen;
+
+ rangle = strchr (decomposition + 1, '>');
+ if (rangle == NULL)
+ abort ();
+ typelen = rangle + 1 - decomposition;
+#define TYPE(t1,t2) \
+ if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
+ type = t2; \
+ else
+ TYPE ("<font>", UC_DECOMP_FONT)
+ TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
+ TYPE ("<initial>", UC_DECOMP_INITIAL)
+ TYPE ("<medial>", UC_DECOMP_MEDIAL)
+ TYPE ("<final>", UC_DECOMP_FINAL)
+ TYPE ("<isolated>", UC_DECOMP_ISOLATED)
+ TYPE ("<circle>", UC_DECOMP_CIRCLE)
+ TYPE ("<super>", UC_DECOMP_SUPER)
+ TYPE ("<sub>", UC_DECOMP_SUB)
+ TYPE ("<vertical>", UC_DECOMP_VERTICAL)
+ TYPE ("<wide>", UC_DECOMP_WIDE)
+ TYPE ("<narrow>", UC_DECOMP_NARROW)
+ TYPE ("<small>", UC_DECOMP_SMALL)
+ TYPE ("<square>", UC_DECOMP_SQUARE)
+ TYPE ("<fraction>", UC_DECOMP_FRACTION)
+ TYPE ("<compat>", UC_DECOMP_COMPAT)
+ {
+ fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
+ exit (1);
+ }
+#undef TYPE
+ decomposition = rangle + 1;
+ if (decomposition[0] == ' ')
+ decomposition++;
+ }
+ for (length = 0; length < MAX_DECOMP_LENGTH; length++)
+ {
+ decomposed[length] = strtoul (decomposition, &endptr, 16);
+ if (endptr == decomposition)
+ break;
+ decomposition = endptr;
+ if (decomposition[0] == ' ')
+ decomposition++;
+ }
+ if (*decomposition != '\0')
+ /* MAX_DECOMP_LENGTH is too small. */
+ abort ();
+
+ *lengthp = length;
+ return type;
+ }
+ else
+ return -1;
+}
+
+/* Construction of sparse 3-level tables. */
+#define TABLE decomp_table
+#define ELEMENT uint16_t
+#define DEFAULT (uint16_t)(-1)
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+static void
+output_decomposition (FILE *stream1, FILE *stream2)
+{
+ struct decomp_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+ unsigned int offset;
+ unsigned int ch;
+ unsigned int i;
+
+ t.p = 5;
+ t.q = 5;
+ decomp_table_init (&t);
+
+ fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
+ fprintf (stream1, "\n");
+ fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
+ offset = 0;
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type >= 0)
+ {
+ if (!(offset < (1 << 15)))
+ abort ();
+ decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
+
+ /* Produce length 3-bytes entries. */
+ if (length == 0)
+ /* We would need a special representation of zero-length entries. */
+ abort ();
+ for (i = 0; i < length; i++)
+ {
+ if (offset > 0)
+ fprintf (stream2, ",");
+ if ((offset % 4) == 0)
+ fprintf (stream2, "\n ");
+ if (!(decomposed[i] < (1 << 18)))
+ abort ();
+ fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
+ (((i+1 < length ? (1 << 23) : 0)
+ | (i == 0 ? (type << 18) : 0)
+ | decomposed[i]) >> 16) & 0xff,
+ (decomposed[i] >> 8) & 0xff,
+ decomposed[i] & 0xff);
+ offset++;
+ }
+ }
+ }
+
+ fprintf (stream2, "\n};\n");
+ fprintf (stream2, "\n");
+
+ decomp_table_finalize (&t);
+
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++)
+ fprintf (stream1, "#define decomp_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream1, "\n");
+ fprintf (stream1, "typedef struct\n");
+ fprintf (stream1, " {\n");
+ fprintf (stream1, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream1, " }\n");
+ fprintf (stream1, "decomp_index_table_t;\n");
+ fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
+ fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
+ fprintf (stream2, "{\n");
+ fprintf (stream2, " {");
+ if (t.level1_size > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1);
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream2, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " },\n");
+ fprintf (stream2, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream2, " %5d", -1);
+ else
+ fprintf (stream2, " %5zu",
+ (offset - level3_offset) / sizeof (uint16_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream2, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " },\n");
+ fprintf (stream2, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream2, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
+ if (i+1 < t.level3_size << t.p)
+ fprintf (stream2, ",");
+ }
+ if (t.level3_size << t.p > 8)
+ fprintf (stream2, "\n ");
+ fprintf (stream2, " }\n");
+ fprintf (stream2, "};\n");
+}
+
+static void
+output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
+{
+ const char *filenames[2];
+ FILE *streams[2];
+ size_t i;
+
+ filenames[0] = filename1;
+ filenames[1] = filename2;
+
+ for (i = 0; i < 2; i++)
+ {
+ streams[i] = fopen (filenames[i], "w");
+ if (streams[i] == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
+ exit (1);
+ }
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ FILE *stream = streams[i];
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Decomposition of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+ fprintf (stream, "\n");
+ }
+
+ output_decomposition (streams[0], streams[1]);
+
+ for (i = 0; i < 2; i++)
+ {
+ if (ferror (streams[i]) || fclose (streams[i]))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filenames[i]);
+ exit (1);
+ }
+ }
+}
+
+/* The "excluded from composition" property from the CompositionExclusions.txt file. */
+char unicode_composition_exclusions[0x110000];
+
+static void
+fill_composition_exclusions (const char *compositionexclusions_filename)
+{
+ FILE *stream;
+ unsigned int i;
+
+ stream = fopen (compositionexclusions_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
exit (1);
}
- unicodedata_filename = argv[1];
- proplist_filename = argv[2];
- derivedproplist_filename = argv[3];
- scripts_filename = argv[4];
- blocks_filename = argv[5];
- proplist30_filename = argv[6];
- eastasianwidth_filename = argv[7];
- linebreak_filename = argv[8];
- wordbreakproperty_filename = argv[9];
- version = argv[10];
+ for (i = 0; i < 0x110000; i++)
+ unicode_composition_exclusions[i] = 0;
- fill_attributes (unicodedata_filename);
- clear_properties ();
- fill_properties (proplist_filename);
- fill_properties (derivedproplist_filename);
- fill_properties30 (proplist30_filename);
- fill_scripts (scripts_filename);
- fill_blocks (blocks_filename);
- fill_width (eastasianwidth_filename);
- fill_org_lbp (linebreak_filename);
- fill_org_wbp (wordbreakproperty_filename);
+ for (;;)
+ {
+ char buf[200+1];
+ unsigned int i;
- output_categories (version);
- output_category ("unictype/categ_of.h", version);
- output_combclass ("unictype/combining.h", version);
- output_bidi_category ("unictype/bidi_of.h", version);
- output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
- output_decimal_digit ("unictype/decdigit.h", version);
- output_digit_test ("../tests/unictype/test-digit.h", version);
- output_digit ("unictype/digit.h", version);
- output_numeric_test ("../tests/unictype/test-numeric.h", version);
- output_numeric ("unictype/numeric.h", version);
- output_mirror ("unictype/mirror.h", version);
- output_properties (version);
- output_scripts (version);
- output_scripts_byname (version);
- output_blocks (version);
- output_ident_properties (version);
- output_old_ctype (version);
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
- debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
- debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
- output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
- debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
- debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
- output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
+ if (sscanf (buf, "%X", &i) != 1)
+ {
+ fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
+ exit (1);
+ }
+ if (!(i < 0x110000))
+ abort ();
+
+ unicode_composition_exclusions[i] = 1;
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
+ exit (1);
+ }
+}
+
+static void
+debug_output_composition_tables (const char *filename)
+{
+ FILE *stream;
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type == UC_DECOMP_CANONICAL
+ /* Consider only binary decompositions.
+ Exclude singleton decompositions. */
+ && length == 2)
+ {
+ unsigned int code1 = decomposed[0];
+ unsigned int code2 = decomposed[1];
+ unsigned int combined = ch;
+
+ /* Exclude decompositions where the first part is not a starter,
+ i.e. is not of canonical combining class 0. */
+ if (strcmp (unicode_attributes[code1].combining, "0") == 0
+ /* Exclude characters listed in CompositionExclusions.txt. */
+ && !unicode_composition_exclusions[combined])
+ {
+ /* The combined character must now also be a starter.
+ Verify this. */
+ if (strcmp (unicode_attributes[combined].combining, "0") != 0)
+ abort ();
+
+ fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
+ code1,
+ code2,
+ combined,
+ unicode_attributes[code2].combining);
+ }
+ }
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+static void
+output_composition_tables (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
+ version);
+ fprintf (stream, "\n");
+
+ /* Put a GPL header on it. The gnulib module is under LGPL (although it
+ still carries the GPL header), and it's gnulib-tool which replaces the
+ GPL header with an LGPL header. */
+ fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
+
+ /* The composition table is a set of mappings (code1, code2) -> combined,
+ with 928 entries,
+ 367 values for code1 (from 0x003C to 0x30FD),
+ 54 values for code2 (from 0x0300 to 0x309A).
+ For a fixed code1, there are from 1 to 19 possible values for code2.
+ For a fixed code2, there are from 1 to 117 possible values for code1.
+ This is a very sparse matrix.
+
+ We want an O(1) hash lookup.
+
+ We could implement the hash lookup by mapping (code1, code2) to a linear
+ combination mul1*code1 + mul2*code2, which is then used as an index into
+ a 3-level table. But this leads to a table of size 37 KB.
+
+ We use gperf to implement the hash lookup, giving it the 928 sets of
+ 4 bytes (code1, code2) as input. gperf generates a hash table of size
+ 1527, which is quite good (60% filled). It requires an auxiliary table
+ lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
+
+ fprintf (stream, "struct composition_rule { char codes[4]; };\n");
+ fprintf (stream, "%%struct-type\n");
+ fprintf (stream, "%%language=ANSI-C\n");
+ fprintf (stream, "%%define slot-name codes\n");
+ fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
+ fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
+ fprintf (stream, "%%compare-lengths\n");
+ fprintf (stream, "%%compare-strncmp\n");
+ fprintf (stream, "%%readonly-tables\n");
+ fprintf (stream, "%%omit-struct-type\n");
+ fprintf (stream, "%%%%\n");
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int length;
+ unsigned int decomposed[MAX_DECOMP_LENGTH];
+ int type = get_decomposition (ch, &length, decomposed);
+
+ if (type == UC_DECOMP_CANONICAL
+ /* Consider only binary decompositions.
+ Exclude singleton decompositions. */
+ && length == 2)
+ {
+ unsigned int code1 = decomposed[0];
+ unsigned int code2 = decomposed[1];
+ unsigned int combined = ch;
+
+ /* Exclude decompositions where the first part is not a starter,
+ i.e. is not of canonical combining class 0. */
+ if (strcmp (unicode_attributes[code1].combining, "0") == 0
+ /* Exclude characters listed in CompositionExclusions.txt. */
+ && !unicode_composition_exclusions[combined])
+ {
+ /* The combined character must now also be a starter.
+ Verify this. */
+ if (strcmp (unicode_attributes[combined].combining, "0") != 0)
+ abort ();
+
+ if (!(code1 < 0x10000))
+ abort ();
+ if (!(code2 < 0x10000))
+ abort ();
+ if (!(combined < 0x10000))
+ abort ();
+
+ fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
+ (code1 >> 8) & 0xff, code1 & 0xff,
+ (code2 >> 8) & 0xff, code2 & 0xff,
+ combined);
+ }
+ }
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* Output the test for a simple character mapping table to the given file. */
+
+static void
+output_simple_mapping_test (const char *filename,
+ const char *function_name,
+ unsigned int (*func) (unsigned int),
+ const char *version)
+{
+ FILE *stream;
+ bool need_comma;
+ unsigned int ch;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Test the Unicode character mapping functions.\n");
+ fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
+ fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
+ fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
+ fprintf (stream, " (at your option) any later version.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
+ fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+ fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
+ fprintf (stream, " GNU General Public License for more details.\n");
+ fprintf (stream, "\n");
+ fprintf (stream, " You should have received a copy of the GNU General Public License\n");
+ fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
+ fprintf (stream, "\n");
+ fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
+ version);
+ fprintf (stream, "\n");
+ fprintf (stream, "#include \"test-mapping-part1.h\"\n");
+ fprintf (stream, "\n");
+
+ need_comma = false;
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int value = func (ch);
+
+ if (value != ch)
+ {
+ if (need_comma)
+ fprintf (stream, ",\n");
+ fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
+ need_comma = true;
+ }
+ }
+ if (need_comma)
+ fprintf (stream, "\n");
+
+ fprintf (stream, "\n");
+ fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
+ fprintf (stream, "#include \"test-mapping-part2.h\"\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Construction of sparse 3-level tables. */
+#define TABLE mapping_table
+#define ELEMENT int32_t
+#define DEFAULT 0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+/* Output a simple character mapping table to the given file. */
+
+static void
+output_simple_mapping (const char *filename,
+ unsigned int (*func) (unsigned int),
+ const char *version)
+{
+ FILE *stream;
+ unsigned int ch, i;
+ struct mapping_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
+ version);
+
+ t.p = 7;
+ t.q = 9;
+ mapping_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ int value = (int) func (ch) - (int) ch;
+
+ mapping_table_add (&t, ch, value);
+ }
+
+ mapping_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define mapping_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_mapping =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (int32_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
+ if (i+1 < t.level3_size << t.p)
+ fprintf (stream, ",");
+ }
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+/* A special casing context.
+ A context is negated through x -> -x. */
+enum
+{
+ SCC_ALWAYS = 0,
+ SCC_FINAL_SIGMA,
+ SCC_AFTER_SOFT_DOTTED,
+ SCC_MORE_ABOVE,
+ SCC_BEFORE_DOT,
+ SCC_AFTER_I
+};
+
+/* A special casing rule. */
+struct special_casing_rule
+{
+ unsigned int code;
+ unsigned int lower_mapping[3];
+ unsigned int title_mapping[3];
+ unsigned int upper_mapping[3];
+ unsigned int casefold_mapping[3];
+ const char *language;
+ int context;
+};
+
+/* The special casing rules. */
+struct special_casing_rule **casing_rules;
+unsigned int num_casing_rules;
+unsigned int allocated_casing_rules;
+
+static void
+add_casing_rule (struct special_casing_rule *new_rule)
+{
+ if (num_casing_rules == allocated_casing_rules)
+ {
+ allocated_casing_rules = 2 * allocated_casing_rules;
+ if (allocated_casing_rules < 16)
+ allocated_casing_rules = 16;
+ casing_rules =
+ (struct special_casing_rule **)
+ realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *));
+ }
+ casing_rules[num_casing_rules++] = new_rule;
+}
+
+/* Stores in casing_rules the special casing rules found in
+ specialcasing_filename. */
+static void
+fill_casing_rules (const char *specialcasing_filename)
+{
+ FILE *stream;
+
+ stream = fopen (specialcasing_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+
+ casing_rules = NULL;
+ num_casing_rules = 0;
+ allocated_casing_rules = 0;
+
+ for (;;)
+ {
+ char buf[200+1];
+ char *scanptr;
+ char *endptr;
+ int i;
+
+ unsigned int code;
+ unsigned int lower_mapping[3];
+ unsigned int title_mapping[3];
+ unsigned int upper_mapping[3];
+ char *language;
+ int context;
+
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
+
+ /* Scan code. */
+ scanptr = buf;
+ code = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan lower mapping. */
+ for (i = 0; i < 3; i++)
+ lower_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ lower_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan title mapping. */
+ for (i = 0; i < 3; i++)
+ title_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ title_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan upper mapping. */
+ for (i = 0; i < 3; i++)
+ upper_mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ upper_mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan language and context. */
+ language = NULL;
+ context = SCC_ALWAYS;
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr != '\0' && *scanptr != '#')
+ {
+ const char *word_begin = scanptr;
+ const char *word_end;
+
+ while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+ scanptr++;
+ word_end = scanptr;
+
+ while (*scanptr == ' ')
+ scanptr++;
+
+ if (word_end - word_begin == 2)
+ {
+ language = (char *) malloc ((word_end - word_begin) + 1);
+ memcpy (language, word_begin, 2);
+ language[word_end - word_begin] = '\0';
+ word_begin = word_end = NULL;
+
+ if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
+ {
+ word_begin = scanptr;
+ while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+ scanptr++;
+ word_end = scanptr;
+ }
+ }
+
+ if (word_end > word_begin)
+ {
+ bool negate = false;
+
+ if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
+ {
+ word_begin += 4;
+ negate = true;
+ }
+ if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
+ context = SCC_FINAL_SIGMA;
+ else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
+ context = SCC_AFTER_SOFT_DOTTED;
+ else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
+ context = SCC_MORE_ABOVE;
+ else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
+ context = SCC_BEFORE_DOT;
+ else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
+ context = SCC_AFTER_I;
+ else
+ {
+ fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ if (negate)
+ context = - context;
+ }
+
+ if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+ }
+
+ /* Store the rule. */
+ {
+ struct special_casing_rule *new_rule =
+ (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+ new_rule->code = code;
+ new_rule->language = language;
+ new_rule->context = context;
+ memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
+ memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
+ memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
+
+ add_casing_rule (new_rule);
+ }
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
+ exit (1);
+ }
+}
+
+/* A casefolding rule. */
+struct casefold_rule
+{
+ unsigned int code;
+ unsigned int mapping[3];
+ const char *language;
+};
+
+/* The casefolding rules. */
+struct casefold_rule **casefolding_rules;
+unsigned int num_casefolding_rules;
+unsigned int allocated_casefolding_rules;
+
+/* Stores in casefolding_rules the case folding rules found in
+ casefolding_filename. */
+static void
+fill_casefolding_rules (const char *casefolding_filename)
+{
+ FILE *stream;
+
+ stream = fopen (casefolding_filename, "r");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
+ exit (1);
+ }
+
+ casefolding_rules = NULL;
+ num_casefolding_rules = 0;
+ allocated_casefolding_rules = 0;
+
+ for (;;)
+ {
+ char buf[200+1];
+ char *scanptr;
+ char *endptr;
+ int i;
+
+ unsigned int code;
+ char type;
+ unsigned int mapping[3];
+
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+ break;
+
+ if (buf[0] == '\0' || buf[0] == '#')
+ continue;
+
+ /* Scan code. */
+ scanptr = buf;
+ code = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan type. */
+ while (*scanptr == ' ')
+ scanptr++;
+
+ switch (*scanptr)
+ {
+ case 'C': case 'F': case 'S': case 'T':
+ type = *scanptr;
+ break;
+ default:
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr++;
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Scan casefold mapping. */
+ for (i = 0; i < 3; i++)
+ mapping[i] = 0;
+ for (i = 0; i < 3; i++)
+ {
+ while (*scanptr == ' ')
+ scanptr++;
+ if (*scanptr == ';')
+ break;
+ mapping[i] = strtoul (scanptr, &endptr, 16);
+ if (endptr == scanptr)
+ {
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr = endptr;
+ }
+ if (*scanptr != ';')
+ {
+ fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+ exit (1);
+ }
+ scanptr++;
+
+ /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
+ if (type != 'S')
+ {
+ const char * const *languages;
+ unsigned int languages_count;
+
+ /* Type 'T' indicates that the rule is applicable to Turkish
+ languages only. */
+ if (type == 'T')
+ {
+ static const char * const turkish_languages[] = { "tr", "az" };
+ languages = turkish_languages;
+ languages_count = 2;
+ }
+ else
+ {
+ static const char * const all_languages[] = { NULL };
+ languages = all_languages;
+ languages_count = 1;
+ }
+
+ for (i = 0; i < languages_count; i++)
+ {
+ /* Store a new rule. */
+ struct casefold_rule *new_rule =
+ (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
+ new_rule->code = code;
+ memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
+ new_rule->language = languages[i];
+
+ if (num_casefolding_rules == allocated_casefolding_rules)
+ {
+ allocated_casefolding_rules = 2 * allocated_casefolding_rules;
+ if (allocated_casefolding_rules < 16)
+ allocated_casefolding_rules = 16;
+ casefolding_rules =
+ (struct casefold_rule **)
+ realloc (casefolding_rules,
+ allocated_casefolding_rules * sizeof (struct casefold_rule *));
+ }
+ casefolding_rules[num_casefolding_rules++] = new_rule;
+ }
+ }
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
+ exit (1);
+ }
+}
+
+/* Casefold mapping, when it maps to a single character. */
+unsigned int unicode_casefold[0x110000];
+
+static unsigned int
+to_casefold (unsigned int ch)
+{
+ return unicode_casefold[ch];
+}
+
+/* Redistribute the casefolding_rules:
+ - Rules that map to a single character, language independently, are stored
+ in unicode_casefold.
+ - Other rules are merged into casing_rules. */
+static void
+redistribute_casefolding_rules (void)
+{
+ unsigned int ch, i, j;
+
+ /* Fill unicode_casefold[]. */
+ for (ch = 0; ch < 0x110000; ch++)
+ unicode_casefold[ch] = ch;
+ for (i = 0; i < num_casefolding_rules; i++)
+ {
+ struct casefold_rule *cfrule = casefolding_rules[i];
+
+ if (cfrule->language == NULL && cfrule->mapping[1] == 0)
+ {
+ ch = cfrule->code;
+ if (!(ch < 0x110000))
+ abort ();
+ unicode_casefold[ch] = cfrule->mapping[0];
+ }
+ }
+
+ /* Extend the special casing rules by filling in their casefold_mapping[]
+ field. */
+ for (j = 0; j < num_casing_rules; j++)
+ {
+ struct special_casing_rule *rule = casing_rules[j];
+ unsigned int k;
+
+ rule->casefold_mapping[0] = to_casefold (rule->code);
+ for (k = 1; k < 3; k++)
+ rule->casefold_mapping[k] = 0;
+ }
+
+ /* Now merge the other casefolding rules into casing_rules. */
+ for (i = 0; i < num_casefolding_rules; i++)
+ {
+ struct casefold_rule *cfrule = casefolding_rules[i];
+
+ if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
+ {
+ /* Find a rule that applies to the same code, same language, and it
+ has context SCC_ALWAYS. At the same time, update all rules that
+ have the same code and same or more specific language. */
+ struct special_casing_rule *found_rule = NULL;
+
+ for (j = 0; j < num_casing_rules; j++)
+ {
+ struct special_casing_rule *rule = casing_rules[j];
+
+ if (rule->code == cfrule->code
+ && (cfrule->language == NULL
+ || (rule->language != NULL
+ && strcmp (rule->language, cfrule->language) == 0)))
+ {
+ memcpy (rule->casefold_mapping, cfrule->mapping,
+ sizeof (rule->casefold_mapping));
+
+ if ((cfrule->language == NULL
+ ? rule->language == NULL
+ : rule->language != NULL
+ && strcmp (rule->language, cfrule->language) == 0)
+ && rule->context == SCC_ALWAYS)
+ {
+ /* Found it. */
+ found_rule = rule;
+ }
+ }
+ }
+
+ if (found_rule == NULL)
+ {
+ /* Create a new rule. */
+ struct special_casing_rule *new_rule =
+ (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+
+ /* Try to find a rule that applies to the same code, no language
+ restriction, and with context SCC_ALWAYS. */
+ for (j = 0; j < num_casing_rules; j++)
+ {
+ struct special_casing_rule *rule = casing_rules[j];
+
+ if (rule->code == cfrule->code
+ && rule->context == SCC_ALWAYS
+ && rule->language == NULL)
+ {
+ /* Found it. */
+ found_rule = rule;
+ break;
+ }
+ }
+
+ new_rule->code = cfrule->code;
+ new_rule->language = cfrule->language;
+ new_rule->context = SCC_ALWAYS;
+ if (found_rule != NULL)
+ {
+ memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
+ sizeof (new_rule->lower_mapping));
+ memcpy (new_rule->title_mapping, found_rule->title_mapping,
+ sizeof (new_rule->title_mapping));
+ memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
+ sizeof (new_rule->upper_mapping));
+ }
+ else
+ {
+ unsigned int k;
+
+ new_rule->lower_mapping[0] = to_lower (cfrule->code);
+ for (k = 1; k < 3; k++)
+ new_rule->lower_mapping[k] = 0;
+ new_rule->title_mapping[0] = to_title (cfrule->code);
+ for (k = 1; k < 3; k++)
+ new_rule->title_mapping[k] = 0;
+ new_rule->upper_mapping[0] = to_upper (cfrule->code);
+ for (k = 1; k < 3; k++)
+ new_rule->upper_mapping[k] = 0;
+ }
+ memcpy (new_rule->casefold_mapping, cfrule->mapping,
+ sizeof (new_rule->casefold_mapping));
+
+ add_casing_rule (new_rule);
+ }
+ }
+ }
+}
+
+static int
+compare_casing_rules (const void *a, const void *b)
+{
+ struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
+ struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
+ unsigned int a_code = a_rule->code;
+ unsigned int b_code = b_rule->code;
+
+ if (a_code < b_code)
+ return -1;
+ if (a_code > b_code)
+ return 1;
+
+ /* Sort the more specific rules before the more general ones. */
+ return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
+ + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
+}
+
+static void
+sort_casing_rules (void)
+{
+ /* Sort the rules 1. by code, 2. by specificity. */
+ if (num_casing_rules > 1)
+ qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
+ compare_casing_rules);
+}
+
+/* Output the special casing rules. */
+static void
+output_casing_rules (const char *filename, const char *version)
+{
+ FILE *stream;
+ unsigned int i, j;
+ unsigned int minor;
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+ fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
+ fprintf (stream, "%%struct-type\n");
+ fprintf (stream, "%%language=ANSI-C\n");
+ fprintf (stream, "%%define slot-name code\n");
+ fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
+ fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
+ fprintf (stream, "%%compare-lengths\n");
+ fprintf (stream, "%%compare-strncmp\n");
+ fprintf (stream, "%%readonly-tables\n");
+ fprintf (stream, "%%omit-struct-type\n");
+ fprintf (stream, "%%%%\n");
+
+ minor = 0;
+ for (i = 0; i < num_casing_rules; i++)
+ {
+ struct special_casing_rule *rule = casing_rules[i];
+ int context;
+
+ if (i > 0 && rule->code == casing_rules[i - 1]->code)
+ minor += 1;
+ else
+ minor = 0;
+
+ if (!(rule->code < 0x10000))
+ {
+ fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
+ exit (1);
+ }
+
+ fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
+ (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
+
+ fprintf (stream, "%d, ",
+ i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
+
+ context = rule->context;
+ if (context < 0)
+ {
+ fprintf (stream, "-");
+ context = - context;
+ }
+ else
+ fprintf (stream, " ");
+ switch (context)
+ {
+ case SCC_ALWAYS:
+ fprintf (stream, "SCC_ALWAYS ");
+ break;
+ case SCC_FINAL_SIGMA:
+ fprintf (stream, "SCC_FINAL_SIGMA ");
+ break;
+ case SCC_AFTER_SOFT_DOTTED:
+ fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
+ break;
+ case SCC_MORE_ABOVE:
+ fprintf (stream, "SCC_MORE_ABOVE ");
+ break;
+ case SCC_BEFORE_DOT:
+ fprintf (stream, "SCC_BEFORE_DOT ");
+ break;
+ case SCC_AFTER_I:
+ fprintf (stream, "SCC_AFTER_I ");
+ break;
+ default:
+ abort ();
+ }
+ fprintf (stream, ", ");
+
+ if (rule->language != NULL)
+ {
+ if (strlen (rule->language) != 2)
+ abort ();
+ fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
+ }
+ else
+ fprintf (stream, "{ '\\0', '\\0' }, ");
+
+ fprintf (stream, "{ ");
+ for (j = 0; j < 3; j++)
+ {
+ if (j > 0)
+ fprintf (stream, ", ");
+ if (!(rule->upper_mapping[j] < 0x10000))
+ {
+ fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
+ exit (1);
+ }
+ if (rule->upper_mapping[j] != 0)
+ fprintf (stream, "0x%04X", rule->upper_mapping[j]);
+ else
+ fprintf (stream, " 0");
+ }
+ fprintf (stream, " }, { ");
+ for (j = 0; j < 3; j++)
+ {
+ if (j > 0)
+ fprintf (stream, ", ");
+ if (!(rule->lower_mapping[j] < 0x10000))
+ {
+ fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
+ exit (1);
+ }
+ if (rule->lower_mapping[j] != 0)
+ fprintf (stream, "0x%04X", rule->lower_mapping[j]);
+ else
+ fprintf (stream, " 0");
+ }
+ fprintf (stream, " }, { ");
+ for (j = 0; j < 3; j++)
+ {
+ if (j > 0)
+ fprintf (stream, ", ");
+ if (!(rule->title_mapping[j] < 0x10000))
+ {
+ fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
+ exit (1);
+ }
+ if (rule->title_mapping[j] != 0)
+ fprintf (stream, "0x%04X", rule->title_mapping[j]);
+ else
+ fprintf (stream, " 0");
+ }
+ fprintf (stream, " }, { ");
+ for (j = 0; j < 3; j++)
+ {
+ if (j > 0)
+ fprintf (stream, ", ");
+ if (!(rule->casefold_mapping[j] < 0x10000))
+ {
+ fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
+ exit (1);
+ }
+ if (rule->casefold_mapping[j] != 0)
+ fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
+ else
+ fprintf (stream, " 0");
+ }
+ fprintf (stream, " }\n");
+ }
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* ========================================================================= */
+
+int
+main (int argc, char * argv[])
+{
+ const char *unicodedata_filename;
+ const char *proplist_filename;
+ const char *derivedproplist_filename;
+ const char *scripts_filename;
+ const char *blocks_filename;
+ const char *proplist30_filename;
+ const char *eastasianwidth_filename;
+ const char *linebreak_filename;
+ const char *wordbreakproperty_filename;
+ const char *compositionexclusions_filename;
+ const char *specialcasing_filename;
+ const char *casefolding_filename;
+ const char *version;
+
+ if (argc != 14)
+ {
+ fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
+ argv[0]);
+ exit (1);
+ }
+
+ unicodedata_filename = argv[1];
+ proplist_filename = argv[2];
+ derivedproplist_filename = argv[3];
+ scripts_filename = argv[4];
+ blocks_filename = argv[5];
+ proplist30_filename = argv[6];
+ eastasianwidth_filename = argv[7];
+ linebreak_filename = argv[8];
+ wordbreakproperty_filename = argv[9];
+ compositionexclusions_filename = argv[10];
+ specialcasing_filename = argv[11];
+ casefolding_filename = argv[12];
+ version = argv[13];
+
+ fill_attributes (unicodedata_filename);
+ clear_properties ();
+ fill_properties (proplist_filename);
+ fill_properties (derivedproplist_filename);
+ fill_properties30 (proplist30_filename);
+ fill_scripts (scripts_filename);
+ fill_blocks (blocks_filename);
+ fill_width (eastasianwidth_filename);
+ fill_org_lbp (linebreak_filename);
+ fill_org_wbp (wordbreakproperty_filename);
+ fill_composition_exclusions (compositionexclusions_filename);
+ fill_casing_rules (specialcasing_filename);
+ fill_casefolding_rules (casefolding_filename);
+ redistribute_casefolding_rules ();
+ sort_casing_rules ();
+
+ output_categories (version);
+ output_category ("unictype/categ_of.h", version);
+ output_combclass ("unictype/combining.h", version);
+ output_bidi_category ("unictype/bidi_of.h", version);
+ output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
+ output_decimal_digit ("unictype/decdigit.h", version);
+ output_digit_test ("../tests/unictype/test-digit.h", version);
+ output_digit ("unictype/digit.h", version);
+ output_numeric_test ("../tests/unictype/test-numeric.h", version);
+ output_numeric ("unictype/numeric.h", version);
+ output_mirror ("unictype/mirror.h", version);
+ output_properties (version);
+ output_scripts (version);
+ output_scripts_byname (version);
+ output_blocks (version);
+ output_ident_properties (version);
+ output_old_ctype (version);
+
+ debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
+ debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
+ output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
+
+ debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
+ debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
+ output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
+
+ output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
+ debug_output_composition_tables ("uninorm/composition.txt");
+ output_composition_tables ("uninorm/composition-table.gperf", version);
+
+ output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
+ output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
+ output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
+ output_simple_mapping ("unicase/toupper.h", to_upper, version);
+ output_simple_mapping ("unicase/tolower.h", to_lower, version);
+ output_simple_mapping ("unicase/totitle.h", to_title, version);
+ output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
+ output_casing_rules ("unicase/special-casing-table.gperf", version);
return 0;
}
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
+ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
5.1.0
"
* End: