+ unsigned int j = (i * 4) / 8;
+ unsigned int k = (i * 4) % 8;
+ uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
+ level3_packed[j] |= (value << k);
+ }
+ fprintf (stream, " {");
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%02x", level3_packed[i]);
+ if (i+1 < (t.level3_size << t.p) * 4 / 8)
+ fprintf (stream, ",");
+ }
+ if ((t.level3_size << t.p) * 4 / 8 > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ free (level3_packed);
+ fprintf (stream, "};\n");
+
+ if (ferror (stream) || fclose (stream))
+ {
+ fprintf (stderr, "error writing to '%s'\n", filename);
+ exit (1);
+ }
+}
+
+/* Return the spelling of the UC_JOINING_GROUP_* constant whose value equals
+   JOINING_GROUP, for emission into generated C source.
+   Aborts if JOINING_GROUP matches none of the constants listed here.  */
+static const char *
+joining_group_as_c_identifier (int joining_group)
+{
+  /* Each entry pairs a constant with its own source spelling.  The entries
+     are searched in order, so the first matching constant wins.  */
+#define ENTRY(id) { id, #id }
+  static const struct { int value; const char *name; } known_groups[] =
+    {
+      ENTRY (UC_JOINING_GROUP_NONE),
+      ENTRY (UC_JOINING_GROUP_AIN),
+      ENTRY (UC_JOINING_GROUP_ALAPH),
+      ENTRY (UC_JOINING_GROUP_ALEF),
+      ENTRY (UC_JOINING_GROUP_BEH),
+      ENTRY (UC_JOINING_GROUP_BETH),
+      ENTRY (UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE),
+      ENTRY (UC_JOINING_GROUP_DAL),
+      ENTRY (UC_JOINING_GROUP_DALATH_RISH),
+      ENTRY (UC_JOINING_GROUP_E),
+      ENTRY (UC_JOINING_GROUP_FARSI_YEH),
+      ENTRY (UC_JOINING_GROUP_FE),
+      ENTRY (UC_JOINING_GROUP_FEH),
+      ENTRY (UC_JOINING_GROUP_FINAL_SEMKATH),
+      ENTRY (UC_JOINING_GROUP_GAF),
+      ENTRY (UC_JOINING_GROUP_GAMAL),
+      ENTRY (UC_JOINING_GROUP_HAH),
+      ENTRY (UC_JOINING_GROUP_HE),
+      ENTRY (UC_JOINING_GROUP_HEH),
+      ENTRY (UC_JOINING_GROUP_HEH_GOAL),
+      ENTRY (UC_JOINING_GROUP_HETH),
+      ENTRY (UC_JOINING_GROUP_KAF),
+      ENTRY (UC_JOINING_GROUP_KAPH),
+      ENTRY (UC_JOINING_GROUP_KHAPH),
+      ENTRY (UC_JOINING_GROUP_KNOTTED_HEH),
+      ENTRY (UC_JOINING_GROUP_LAM),
+      ENTRY (UC_JOINING_GROUP_LAMADH),
+      ENTRY (UC_JOINING_GROUP_MEEM),
+      ENTRY (UC_JOINING_GROUP_MIM),
+      ENTRY (UC_JOINING_GROUP_NOON),
+      ENTRY (UC_JOINING_GROUP_NUN),
+      ENTRY (UC_JOINING_GROUP_NYA),
+      ENTRY (UC_JOINING_GROUP_PE),
+      ENTRY (UC_JOINING_GROUP_QAF),
+      ENTRY (UC_JOINING_GROUP_QAPH),
+      ENTRY (UC_JOINING_GROUP_REH),
+      ENTRY (UC_JOINING_GROUP_REVERSED_PE),
+      ENTRY (UC_JOINING_GROUP_SAD),
+      ENTRY (UC_JOINING_GROUP_SADHE),
+      ENTRY (UC_JOINING_GROUP_SEEN),
+      ENTRY (UC_JOINING_GROUP_SEMKATH),
+      ENTRY (UC_JOINING_GROUP_SHIN),
+      ENTRY (UC_JOINING_GROUP_SWASH_KAF),
+      ENTRY (UC_JOINING_GROUP_SYRIAC_WAW),
+      ENTRY (UC_JOINING_GROUP_TAH),
+      ENTRY (UC_JOINING_GROUP_TAW),
+      ENTRY (UC_JOINING_GROUP_TEH_MARBUTA),
+      ENTRY (UC_JOINING_GROUP_TEH_MARBUTA_GOAL),
+      ENTRY (UC_JOINING_GROUP_TETH),
+      ENTRY (UC_JOINING_GROUP_WAW),
+      ENTRY (UC_JOINING_GROUP_YEH),
+      ENTRY (UC_JOINING_GROUP_YEH_BARREE),
+      ENTRY (UC_JOINING_GROUP_YEH_WITH_TAIL),
+      ENTRY (UC_JOINING_GROUP_YUDH),
+      ENTRY (UC_JOINING_GROUP_YUDH_HE),
+      ENTRY (UC_JOINING_GROUP_ZAIN),
+      ENTRY (UC_JOINING_GROUP_ZHAIN)
+    };
+#undef ENTRY
+  unsigned int k;
+
+  for (k = 0; k < sizeof (known_groups) / sizeof (known_groups[0]); k++)
+    if (known_groups[k].value == joining_group)
+      return known_groups[k].name;
+
+  /* No constant matched: the caller passed a value this table does not
+     know about.  */
+  abort ();
+}
+
+/* Write to FILENAME one "{ code point, identifier }" initializer for each
+   character whose entry in unicode_joining_group[] is not
+   UC_JOINING_GROUP_NONE.  Entries are separated by ",\n" and the list ends
+   with a newline if it is non-empty.  VERSION is the Unicode version
+   mentioned in the generated header comment.
+   Exits with status 1 if the file cannot be opened or written.  */
+static void
+output_joining_group_test (const char *filename, const char *version)
+{
+  FILE *out;
+  unsigned int emitted = 0;
+  unsigned int uc;
+
+  out = fopen (filename, "w");
+  if (out == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
+  fputs ("/* Arabic joining group of Unicode characters. */\n", out);
+  fprintf (out,
+           "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  for (uc = 0; uc < 0x110000; uc++)
+    {
+      int group = unicode_joining_group[uc];
+
+      if (group == UC_JOINING_GROUP_NONE)
+        continue;
+
+      /* Entries are separated, not terminated, by a comma.  */
+      if (emitted > 0)
+        fputs (",\n", out);
+      fprintf (out, " { 0x%04X, %s }", uc,
+               joining_group_as_c_identifier (group));
+      emitted++;
+    }
+  if (emitted > 0)
+    fputs ("\n", out);
+
+  if (ferror (out) || fclose (out))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Write to FILENAME a C array, u_joining_group, holding the joining group
+   identifier of every character in the smallest interval [ch_min, ch_max]
+   that contains all characters whose joining group is not
+   UC_JOINING_GROUP_NONE, plus a joining_group_header_0 macro giving ch_min.
+   VERSION is the Unicode version mentioned in the generated header comment.
+   Aborts if no character has a joining group or if the interval covers
+   0x200 or more characters; exits with status 1 on any I/O failure.  */
+static void
+output_joining_group (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int ch_min, ch_max, ch, i;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  /* This file holds the joining group table, so say so in its header.  */
+  fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+           version);
+
+  /* Lowest character that has a joining group.  */
+  ch_min = 0x10FFFF;
+  for (ch = 0; ch < 0x110000; ch++)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_min = ch;
+        break;
+      }
+
+  /* Highest character that has a joining group.  The loop stops before
+     ch == 0; that case is covered by the initial value of ch_max.  */
+  ch_max = 0;
+  for (ch = 0x10FFFF; ch > 0; ch--)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_max = ch;
+        break;
+      }
+
+  if (!(ch_min <= ch_max))
+    abort ();
+
+  /* If the interval [ch_min, ch_max] is too large, we had better use a
+     3-level table.  */
+  if (!(ch_max - ch_min < 0x200))
+    abort ();
+
+  fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
+  fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
+           ch_max + 1, ch_min);
+  fprintf (stream, "{");
+  for (i = 0; i <= ch_max - ch_min; i++)
+    {
+      const char *s;
+
+      ch = ch_min + i;
+      /* Two entries per output line.  */
+      if ((i % 2) == 0)
+        fprintf (stream, "\n ");
+      s = joining_group_as_c_identifier (unicode_joining_group[ch]);
+      fprintf (stream, " %s", s);
+      if (i+1 <= ch_max - ch_min)
+        {
+          fprintf (stream, ",");
+          /* After the first entry of a line, pad so that the second
+             entry starts at a fixed column.  */
+          if (((i+1) % 2) != 0)
+            fprintf (stream, "%*s", 38 - (int) strlen (s), "");
+        }
+    }
+  fprintf (stream, "\n");
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Scripts. */
+
+static const char *scripts[256]; /* script names, indexed by script number */
+static unsigned int numscripts; /* number of entries in use in scripts[] */
+
+static uint8_t unicode_scripts[0x110000]; /* script number of each character, or (uint8_t)~(uint8_t)0 if none */
+
+/* Read SCRIPTS_FILENAME and fill scripts[], numscripts and unicode_scripts[].
+   Each data line has the form "XXXX..YYYY ; Name" or "XXXX ; Name", where
+   XXXX and YYYY are hexadecimal code points; lines starting with '#' are
+   skipped.  Every distinct Name gets the next free script number, and that
+   number is stored in unicode_scripts[] for each character of the range.
+   Characters not mentioned keep the value (uint8_t)~(uint8_t)0.
+   A character assigned more than once produces a warning on stderr and
+   keeps the last assignment.
+   Exits with status 1 on I/O, parse or memory allocation errors; aborts on
+   an invalid range or when the script table is full.  */
+static void
+fill_scripts (const char *scripts_filename)
+{
+  FILE *stream;
+  unsigned int i;
+
+  stream = fopen (scripts_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+      exit (1);
+    }
+
+  numscripts = 0;
+
+  /* Mark every character as having no script yet.  */
+  for (i = 0; i < 0x110000; i++)
+    unicode_scripts[i] = (uint8_t)~(uint8_t)0;
+
+  for (;;)
+    {
+      char buf[200+1];
+      unsigned int i1, i2;
+      char padding[200+1];
+      char scriptname[200+1];
+      int script;
+
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+        break;
+
+      if (buf[0] == '\0' || buf[0] == '#')
+        continue;
+
+      /* Try the range form first, then the single character form.
+         buf holds at most 200 characters, so the unbounded scansets cannot
+         overflow padding[] or scriptname[].  */
+      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
+        {
+          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
+            {
+              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
+              exit (1);
+            }
+          i2 = i1;
+        }
+      if (i2 < i1)
+        abort ();
+      if (i2 >= 0x110000)
+        abort ();
+
+      /* Look the name up among the scripts seen so far, most recent
+         first.  */
+      for (script = numscripts - 1; script >= 0; script--)
+        if (strcmp (scripts[script], scriptname) == 0)
+          break;
+      if (script < 0)
+        {
+          /* New script: register a private copy of its name.  A failed
+             allocation must not be stored, since later lookups pass the
+             entries to strcmp.  */
+          char *name_copy = strdup (scriptname);
+
+          if (name_copy == NULL)
+            {
+              fprintf (stderr, "out of memory\n");
+              exit (1);
+            }
+          scripts[numscripts] = name_copy;
+          script = numscripts;
+          numscripts++;
+          if (numscripts == 256)
+            abort ();
+        }
+
+      for (i = i1; i <= i2; i++)
+        {
+          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
+            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
+          unicode_scripts[i] = script;
+        }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+      exit (1);
+    }
+}
+
+/* Construction of sparse 3-level tables.
+   The macros below are set up before including "3level.h"; presumably that
+   header expands them into struct script_table and the script_table_init,
+   script_table_add and script_table_finalize functions used by
+   output_scripts (TODO confirm against 3level.h).  DEFAULT is the same
+   all-bits-set value that fill_scripts stores in unicode_scripts[] for
+   characters without a script.  xmalloc and xrealloc are mapped onto plain
+   malloc and realloc.  */
+#define TABLE script_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+static void
+output_scripts (const char *version)
+{
+ const char *filename = "unictype/scripts.h";
+ FILE *stream;
+ unsigned int ch, s, i;
+ struct script_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+
+ typedef struct
+ {
+ const char *lowercase_name;
+ }
+ scriptinfo_t;
+ scriptinfo_t scriptinfo[256];
+
+ stream = fopen (filename, "w");
+ if (stream == NULL)
+ {
+ fprintf (stderr, "cannot open '%s' for writing\n", filename);
+ exit (1);
+ }
+
+ fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+ fprintf (stream, "/* Unicode scripts. */\n");
+ fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
+ version);
+
+ for (s = 0; s < numscripts; s++)
+ {
+ char *lcp = strdup (scripts[s]);
+ char *cp;
+
+ for (cp = lcp; *cp != '\0'; cp++)
+ if (*cp >= 'A' && *cp <= 'Z')
+ *cp += 'a' - 'A';
+
+ scriptinfo[s].lowercase_name = lcp;
+ }
+
+ for (s = 0; s < numscripts; s++)
+ {
+ fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, "{\n");
+ i = 0;
+ for (ch = 0; ch < 0x110000; ch++)
+ if (unicode_scripts[ch] == s)
+ {
+ unsigned int start;
+ unsigned int end;
+
+ start = ch;
+ while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
+ ch++;
+ end = ch;
+
+ if (i > 0)
+ fprintf (stream, ",\n");
+ if (start == end)
+ fprintf (stream, " { 0x%04X, 1, 1 }", start);
+ else
+ fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
+ start, end);
+ i++;
+ }
+ fprintf (stream, "\n");
+ fprintf (stream, "};\n");
+ }
+
+ fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
+ fprintf (stream, "{\n");
+ for (s = 0; s < numscripts; s++)
+ {
+ fprintf (stream, " {\n");
+ fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " script_%s_intervals,\n",
+ scriptinfo[s].lowercase_name);
+ fprintf (stream, " \"%s\"\n", scripts[s]);
+ fprintf (stream, " }");
+ if (s+1 < numscripts)
+ fprintf (stream, ",");
+ fprintf (stream, "\n");
+ }
+ fprintf (stream, "};\n");
+
+ t.p = 7;
+ t.q = 9;
+ script_table_init (&t);
+
+ for (ch = 0; ch < 0x110000; ch++)
+ {
+ unsigned int s = unicode_scripts[ch];
+ if (s != (uint8_t)~(uint8_t)0)
+ script_table_add (&t, ch, s);
+ }
+
+ script_table_finalize (&t);
+
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
+
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define script_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_script =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
+ }
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level3_size << t.p > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ if (i > 0 && (i % 8) == 0)