NEWS.stable: log cherry-pick [8b18afa]->[c948e19] sys_select: Avoid a syntax error...

[gnulib.git] / lib / gen-uni-tables.c
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c

index 502b9ff..0eddbb1 100644 (file)
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -1,7 +1,7 @@
  /* Generate Unicode conforming character classification tables and
     line break properties tables and word break property tables and
     decomposition/composition and case mapping tables from a UnicodeData file.
-   Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.
+   Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
     Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
  
     This program is free software: you can redistribute it and/or modify
@@ -21,16 +21,18 @@
       $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                        /usr/local/share/Unidata/PropList.txt \
                        /usr/local/share/Unidata/DerivedCoreProperties.txt \
+                      /usr/local/share/Unidata/ArabicShaping.txt \
                        /usr/local/share/Unidata/Scripts.txt \
                        /usr/local/share/Unidata/Blocks.txt \
                        /usr/local/share/Unidata/PropList-3.0.1.txt \
                        /usr/local/share/Unidata/EastAsianWidth.txt \
                        /usr/local/share/Unidata/LineBreak.txt \
                        /usr/local/share/Unidata/WordBreakProperty.txt \
+                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                        /usr/local/share/Unidata/CompositionExclusions.txt \
                        /usr/local/share/Unidata/SpecialCasing.txt \
                        /usr/local/share/Unidata/CaseFolding.txt \
-                      5.1.0
+                      6.0.0
   */
  
  #include <stdbool.h>
@@ -210,7 +212,7 @@ fill_attributes (const char *unicodedata_filename)
        i = strtoul (field0, NULL, 16);
        if (field1[0] == '<'
            && strlen (field1) >= 9
-          && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
+          && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
          {
            /* Deal with a range. */
            lineno++;
@@ -258,6 +260,7 @@ fill_attributes (const char *unicodedata_filename)
                               field11, field12, field13, field14);
          }
      }
+
    if (ferror (stream) || fclose (stream))
      {
        fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
@@ -279,6 +282,17 @@ is_category_L (unsigned int ch)
  }
  
  static bool
+is_category_LC (unsigned int ch)
+{
+  /* See PropertyValueAliases.txt.  */
+  return (unicode_attributes[ch].name != NULL
+          && unicode_attributes[ch].category[0] == 'L'
+          && (unicode_attributes[ch].category[1] == 'u'
+              || unicode_attributes[ch].category[1] == 'l'
+              || unicode_attributes[ch].category[1] == 't'));
+}
+
+static bool
  is_category_Lu (unsigned int ch)
  {
    return (unicode_attributes[ch].name != NULL
@@ -689,7 +703,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* %s of Unicode characters.  */\n", comment);
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 4; /* or: 5 */
@@ -802,6 +816,7 @@ output_categories (const char *version)
    output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
    output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
    CATEGORY (L)
+  CATEGORY (LC)
    CATEGORY (Lu)
    CATEGORY (Ll)
    CATEGORY (Lt)
@@ -844,6 +859,7 @@ output_categories (const char *version)
  enum
  {
    UC_CATEGORY_MASK_L  = 0x0000001f,
+  UC_CATEGORY_MASK_LC = 0x00000007,
    UC_CATEGORY_MASK_Lu = 0x00000001,
    UC_CATEGORY_MASK_Ll = 0x00000002,
    UC_CATEGORY_MASK_Lt = 0x00000004,
@@ -893,6 +909,7 @@ general_category_byname (const char *category_name)
          switch (category_name[1])
            {
            case '\0': return UC_CATEGORY_MASK_L;
+          case 'C': return UC_CATEGORY_MASK_LC;
            case 'u': return UC_CATEGORY_MASK_Lu;
            case 'l': return UC_CATEGORY_MASK_Ll;
            case 't': return UC_CATEGORY_MASK_Lt;
@@ -993,7 +1010,7 @@ output_category (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Categories of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -1157,7 +1174,7 @@ output_combclass (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Combining class of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -1497,7 +1514,7 @@ output_bidi_category (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Bidi categories of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -1654,7 +1671,7 @@ output_decimal_digit_test (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Decimal digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    need_comma = false;
@@ -1701,7 +1718,7 @@ output_decimal_digit (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Decimal digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -1841,7 +1858,7 @@ output_digit_test (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    need_comma = false;
@@ -1888,7 +1905,7 @@ output_digit (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -2045,7 +2062,7 @@ output_numeric_test (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    need_comma = false;
@@ -2101,7 +2118,7 @@ output_numeric (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    /* Create table of occurring fractions.  */
@@ -2386,7 +2403,7 @@ output_mirror (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Mirrored Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -2490,6 +2507,24 @@ output_mirror (const char *filename, const char *version)
  
  /* ========================================================================= */
  
+/* Particular values of the word break property.  */
+
+static bool
+is_WBP_MIDNUMLET (unsigned int ch)
+{
+  return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
+          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
+}
+
+static bool
+is_WBP_MIDLETTER (unsigned int ch)
+{
+  return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
+          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
+}
+
+/* ========================================================================= */
+
  /* Properties.  */
  
  /* Reading PropList.txt and DerivedCoreProperties.txt.  */
@@ -2533,6 +2568,13 @@ enum
    PROP_ALPHABETIC,
    PROP_LOWERCASE,
    PROP_UPPERCASE,
+  PROP_CASED,
+  PROP_CASE_IGNORABLE,
+  PROP_CHANGES_WHEN_LOWERCASED,
+  PROP_CHANGES_WHEN_UPPERCASED,
+  PROP_CHANGES_WHEN_TITLECASED,
+  PROP_CHANGES_WHEN_CASEFOLDED,
+  PROP_CHANGES_WHEN_CASEMAPPED,
    PROP_ID_START,
    PROP_ID_CONTINUE,
    PROP_XID_START,
@@ -2631,6 +2673,13 @@ fill_properties (const char *proplist_filename)
        PROP ("Alphabetic", PROP_ALPHABETIC)
        PROP ("Lowercase", PROP_LOWERCASE)
        PROP ("Uppercase", PROP_UPPERCASE)
+      PROP ("Cased", PROP_CASED)
+      PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
+      PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
+      PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
+      PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
+      PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
+      PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
        PROP ("ID_Start", PROP_ID_START)
        PROP ("ID_Continue", PROP_ID_CONTINUE)
        PROP ("XID_Start", PROP_XID_START)
@@ -2727,6 +2776,7 @@ fill_property30 (char array[0x110000], const char *proplist_filename, const char
        for (i = i1; i <= i2; i++)
          array[i] = 1;
      }
+
    if (ferror (stream) || fclose (stream))
      {
        fprintf (stderr, "error reading from '%s'\n", proplist_filename);
@@ -2776,6 +2826,7 @@ is_property_alphabetic (unsigned int ch)
      || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
      || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
      || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
+    || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
      || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
      || (ch == 0x10341) /* GOTHIC LETTER NINETY */
      || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
@@ -2811,7 +2862,10 @@ is_property_default_ignorable_code_point (unsigned int ch)
    bool result1 =
      (is_category_Cf (ch)
       && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
-     && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
+     && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
+     /* For some reason, the following are not listed as having property
+        Default_Ignorable_Code_Point.  */
+     && !(ch == 0x110BD))
      || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
      || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
    bool result2 =
@@ -2920,6 +2974,79 @@ is_property_titlecase (unsigned int ch)
    return is_category_Lt (ch);
  }
  
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_cased (unsigned int ch)
+{
+  bool result1 = (is_property_lowercase (ch)
+                  || is_property_uppercase (ch)
+                  || is_category_Lt (ch));
+  bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_case_ignorable (unsigned int ch)
+{
+  bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+                  || is_category_Mn (ch)
+                  || is_category_Me (ch)
+                  || is_category_Cf (ch)
+                  || is_category_Lm (ch)
+                  || is_category_Sk (ch));
+  bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_lowercased (unsigned int ch)
+{
+  bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
+  bool result2 = (unicode_attributes[ch].name != NULL
+                  && unicode_attributes[ch].lower != NONE
+                  && unicode_attributes[ch].lower != ch);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_uppercased (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_titlecased (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_casefolded (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_casemapped (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
+}
+
  /* See PropList.txt, UCD.html.  */
  static bool
  is_property_soft_dotted (unsigned int ch)
@@ -3466,6 +3593,13 @@ output_properties (const char *version)
    PROPERTY(lowercase)
    PROPERTY(other_lowercase)
    PROPERTY(titlecase)
+  PROPERTY(cased)
+  PROPERTY(case_ignorable)
+  PROPERTY(changes_when_lowercased)
+  PROPERTY(changes_when_uppercased)
+  PROPERTY(changes_when_titlecased)
+  PROPERTY(changes_when_casefolded)
+  PROPERTY(changes_when_casemapped)
    PROPERTY(soft_dotted)
    PROPERTY(id_start)
    PROPERTY(other_id_start)
@@ -3534,109 +3668,258 @@ output_properties (const char *version)
  
  /* ========================================================================= */
  
-/* Scripts.  */
+/* Arabic Shaping.  */
  
-static const char *scripts[256];
-static unsigned int numscripts;
+enum
+{
+  UC_JOINING_TYPE_U, /* Non_Joining */
+  UC_JOINING_TYPE_T, /* Transparent */
+  UC_JOINING_TYPE_C, /* Join_Causing */
+  UC_JOINING_TYPE_L, /* Left_Joining */
+  UC_JOINING_TYPE_R, /* Right_Joining */
+  UC_JOINING_TYPE_D  /* Dual_Joining */
+};
  
-static uint8_t unicode_scripts[0x110000];
+static uint8_t unicode_joining_type[0x110000];
+
+enum
+{
+  UC_JOINING_GROUP_NONE,                  /* No_Joining_Group */
+  UC_JOINING_GROUP_AIN,                   /* Ain */
+  UC_JOINING_GROUP_ALAPH,                 /* Alaph */
+  UC_JOINING_GROUP_ALEF,                  /* Alef */
+  UC_JOINING_GROUP_BEH,                   /* Beh */
+  UC_JOINING_GROUP_BETH,                  /* Beth */
+  UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
+  UC_JOINING_GROUP_DAL,                   /* Dal */
+  UC_JOINING_GROUP_DALATH_RISH,           /* Dalath_Rish */
+  UC_JOINING_GROUP_E,                     /* E */
+  UC_JOINING_GROUP_FARSI_YEH,             /* Farsi_Yeh */
+  UC_JOINING_GROUP_FE,                    /* Fe */
+  UC_JOINING_GROUP_FEH,                   /* Feh */
+  UC_JOINING_GROUP_FINAL_SEMKATH,         /* Final_Semkath */
+  UC_JOINING_GROUP_GAF,                   /* Gaf */
+  UC_JOINING_GROUP_GAMAL,                 /* Gamal */
+  UC_JOINING_GROUP_HAH,                   /* Hah */
+  UC_JOINING_GROUP_HE,                    /* He */
+  UC_JOINING_GROUP_HEH,                   /* Heh */
+  UC_JOINING_GROUP_HEH_GOAL,              /* Heh_Goal */
+  UC_JOINING_GROUP_HETH,                  /* Heth */
+  UC_JOINING_GROUP_KAF,                   /* Kaf */
+  UC_JOINING_GROUP_KAPH,                  /* Kaph */
+  UC_JOINING_GROUP_KHAPH,                 /* Khaph */
+  UC_JOINING_GROUP_KNOTTED_HEH,           /* Knotted_Heh */
+  UC_JOINING_GROUP_LAM,                   /* Lam */
+  UC_JOINING_GROUP_LAMADH,                /* Lamadh */
+  UC_JOINING_GROUP_MEEM,                  /* Meem */
+  UC_JOINING_GROUP_MIM,                   /* Mim */
+  UC_JOINING_GROUP_NOON,                  /* Noon */
+  UC_JOINING_GROUP_NUN,                   /* Nun */
+  UC_JOINING_GROUP_NYA,                   /* Nya */
+  UC_JOINING_GROUP_PE,                    /* Pe */
+  UC_JOINING_GROUP_QAF,                   /* Qaf */
+  UC_JOINING_GROUP_QAPH,                  /* Qaph */
+  UC_JOINING_GROUP_REH,                   /* Reh */
+  UC_JOINING_GROUP_REVERSED_PE,           /* Reversed_Pe */
+  UC_JOINING_GROUP_SAD,                   /* Sad */
+  UC_JOINING_GROUP_SADHE,                 /* Sadhe */
+  UC_JOINING_GROUP_SEEN,                  /* Seen */
+  UC_JOINING_GROUP_SEMKATH,               /* Semkath */
+  UC_JOINING_GROUP_SHIN,                  /* Shin */
+  UC_JOINING_GROUP_SWASH_KAF,             /* Swash_Kaf */
+  UC_JOINING_GROUP_SYRIAC_WAW,            /* Syriac_Waw */
+  UC_JOINING_GROUP_TAH,                   /* Tah */
+  UC_JOINING_GROUP_TAW,                   /* Taw */
+  UC_JOINING_GROUP_TEH_MARBUTA,           /* Teh_Marbuta */
+  UC_JOINING_GROUP_TEH_MARBUTA_GOAL,      /* Teh_Marbuta_Goal */
+  UC_JOINING_GROUP_TETH,                  /* Teth */
+  UC_JOINING_GROUP_WAW,                   /* Waw */
+  UC_JOINING_GROUP_YEH,                   /* Yeh */
+  UC_JOINING_GROUP_YEH_BARREE,            /* Yeh_Barree */
+  UC_JOINING_GROUP_YEH_WITH_TAIL,         /* Yeh_With_Tail */
+  UC_JOINING_GROUP_YUDH,                  /* Yudh */
+  UC_JOINING_GROUP_YUDH_HE,               /* Yudh_He */
+  UC_JOINING_GROUP_ZAIN,                  /* Zain */
+  UC_JOINING_GROUP_ZHAIN                  /* Zhain */
+};
+
+static uint8_t unicode_joining_group[0x110000];
  
  static void
-fill_scripts (const char *scripts_filename)
+fill_arabicshaping (const char *arabicshaping_filename)
  {
    FILE *stream;
    unsigned int i;
+  int lineno;
  
-  stream = fopen (scripts_filename, "r");
+  stream = fopen (arabicshaping_filename, "r");
    if (stream == NULL)
      {
-      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+      fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
        exit (1);
      }
  
-  numscripts = 0;
-
    for (i = 0; i < 0x110000; i++)
-    unicode_scripts[i] = (uint8_t)~(uint8_t)0;
+    {
+      unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
+      unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
+    }
  
+  lineno = 0;
    for (;;)
      {
-      char buf[200+1];
-      unsigned int i1, i2;
-      char padding[200+1];
-      char scriptname[200+1];
-      int script;
+      char buf[100+1];
+      char separator1[100+1];
+      char padding1[100+1];
+      char schematic_name[100+1];
+      char separator2[100+1];
+      char padding2[100+1];
+      char joining_type_name[100+1];
+      char separator3[100+1];
+      char padding3[100+1];
+      char joining_group_name[100+1];
+      int joining_type;
+      int joining_group;
  
-      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
+      lineno++;
+      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
          break;
  
        if (buf[0] == '\0' || buf[0] == '#')
          continue;
  
-      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
+      if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
+                  &i, separator1, padding1, schematic_name, separator2,
+                  padding2, joining_type_name, separator3, padding3,
+                  joining_group_name) != 10)
          {
-          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
-            {
-              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
-              exit (1);
-            }
-          i2 = i1;
+          fprintf (stderr, "parse error in '%s':%d\n",
+                   arabicshaping_filename, lineno);
+          exit (1);
          }
-      if (i2 < i1)
-        abort ();
-      if (i2 >= 0x110000)
+      if (i >= 0x110000)
          abort ();
  
-      for (script = numscripts - 1; script >= 0; script--)
-        if (strcmp (scripts[script], scriptname) == 0)
-          break;
-      if (script < 0)
+#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
+      if (false) {}
+      TRY(UC_JOINING_TYPE_U)
+      TRY(UC_JOINING_TYPE_T)
+      TRY(UC_JOINING_TYPE_C)
+      TRY(UC_JOINING_TYPE_L)
+      TRY(UC_JOINING_TYPE_R)
+      TRY(UC_JOINING_TYPE_D)
+#undef TRY
+      else
          {
-          scripts[numscripts] = strdup (scriptname);
-          script = numscripts;
-          numscripts++;
-          if (numscripts == 256)
-            abort ();
+          fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
+                   joining_type_name, arabicshaping_filename, lineno);
+          exit (1);
          }
  
-      for (i = i1; i <= i2; i++)
+      /* Remove trailing spaces.  */
+      while (joining_group_name[0] != '\0'
+             && joining_group_name[strlen (joining_group_name) - 1] == ' ')
+        joining_group_name[strlen (joining_group_name) - 1] = '\0';
+
+#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
+      if (false) {}
+      TRY(UC_JOINING_GROUP_NONE,                  "No_Joining_Group")
+      TRY(UC_JOINING_GROUP_AIN,                   "AIN")
+      TRY(UC_JOINING_GROUP_ALAPH,                 "ALAPH")
+      TRY(UC_JOINING_GROUP_ALEF,                  "ALEF")
+      TRY(UC_JOINING_GROUP_BEH,                   "BEH")
+      TRY(UC_JOINING_GROUP_BETH,                  "BETH")
+      TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
+      TRY(UC_JOINING_GROUP_DAL,                   "DAL")
+      TRY(UC_JOINING_GROUP_DALATH_RISH,           "DALATH RISH")
+      TRY(UC_JOINING_GROUP_E,                     "E")
+      TRY(UC_JOINING_GROUP_FARSI_YEH,             "FARSI YEH")
+      TRY(UC_JOINING_GROUP_FE,                    "FE")
+      TRY(UC_JOINING_GROUP_FEH,                   "FEH")
+      TRY(UC_JOINING_GROUP_FINAL_SEMKATH,         "FINAL SEMKATH")
+      TRY(UC_JOINING_GROUP_GAF,                   "GAF")
+      TRY(UC_JOINING_GROUP_GAMAL,                 "GAMAL")
+      TRY(UC_JOINING_GROUP_HAH,                   "HAH")
+      TRY(UC_JOINING_GROUP_HE,                    "HE")
+      TRY(UC_JOINING_GROUP_HEH,                   "HEH")
+      TRY(UC_JOINING_GROUP_HEH_GOAL,              "HEH GOAL")
+      TRY(UC_JOINING_GROUP_HETH,                  "HETH")
+      TRY(UC_JOINING_GROUP_KAF,                   "KAF")
+      TRY(UC_JOINING_GROUP_KAPH,                  "KAPH")
+      TRY(UC_JOINING_GROUP_KHAPH,                 "KHAPH")
+      TRY(UC_JOINING_GROUP_KNOTTED_HEH,           "KNOTTED HEH")
+      TRY(UC_JOINING_GROUP_LAM,                   "LAM")
+      TRY(UC_JOINING_GROUP_LAMADH,                "LAMADH")
+      TRY(UC_JOINING_GROUP_MEEM,                  "MEEM")
+      TRY(UC_JOINING_GROUP_MIM,                   "MIM")
+      TRY(UC_JOINING_GROUP_NOON,                  "NOON")
+      TRY(UC_JOINING_GROUP_NUN,                   "NUN")
+      TRY(UC_JOINING_GROUP_NYA,                   "NYA")
+      TRY(UC_JOINING_GROUP_PE,                    "PE")
+      TRY(UC_JOINING_GROUP_QAF,                   "QAF")
+      TRY(UC_JOINING_GROUP_QAPH,                  "QAPH")
+      TRY(UC_JOINING_GROUP_REH,                   "REH")
+      TRY(UC_JOINING_GROUP_REVERSED_PE,           "REVERSED PE")
+      TRY(UC_JOINING_GROUP_SAD,                   "SAD")
+      TRY(UC_JOINING_GROUP_SADHE,                 "SADHE")
+      TRY(UC_JOINING_GROUP_SEEN,                  "SEEN")
+      TRY(UC_JOINING_GROUP_SEMKATH,               "SEMKATH")
+      TRY(UC_JOINING_GROUP_SHIN,                  "SHIN")
+      TRY(UC_JOINING_GROUP_SWASH_KAF,             "SWASH KAF")
+      TRY(UC_JOINING_GROUP_SYRIAC_WAW,            "SYRIAC WAW")
+      TRY(UC_JOINING_GROUP_TAH,                   "TAH")
+      TRY(UC_JOINING_GROUP_TAW,                   "TAW")
+      TRY(UC_JOINING_GROUP_TEH_MARBUTA,           "TEH MARBUTA")
+      TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL,      "TEH MARBUTA GOAL")
+      TRY(UC_JOINING_GROUP_TETH,                  "TETH")
+      TRY(UC_JOINING_GROUP_WAW,                   "WAW")
+      TRY(UC_JOINING_GROUP_YEH,                   "YEH")
+      TRY(UC_JOINING_GROUP_YEH_BARREE,            "YEH BARREE")
+      TRY(UC_JOINING_GROUP_YEH_WITH_TAIL,         "YEH WITH TAIL")
+      TRY(UC_JOINING_GROUP_YUDH,                  "YUDH")
+      TRY(UC_JOINING_GROUP_YUDH_HE,               "YUDH HE")
+      TRY(UC_JOINING_GROUP_ZAIN,                  "ZAIN")
+      TRY(UC_JOINING_GROUP_ZHAIN,                 "ZHAIN")
+#undef TRY
+      else
          {
-          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
-            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
-          unicode_scripts[i] = script;
+          fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
+                   joining_group_name, arabicshaping_filename, lineno);
+          exit (1);
          }
+
+      unicode_joining_type[i] = joining_type;
+      unicode_joining_group[i] = joining_group;
      }
  
    if (ferror (stream) || fclose (stream))
      {
-      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+      fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
        exit (1);
      }
  }
  
-/* Construction of sparse 3-level tables.  */
-#define TABLE script_table
-#define ELEMENT uint8_t
-#define DEFAULT (uint8_t)~(uint8_t)0
-#define xmalloc malloc
-#define xrealloc realloc
-#include "3level.h"
+/* Convert a Joining_Type value to a C identifier.  */
+static const char *
+joining_type_as_c_identifier (int joining_type)
+{
+#define TRY(value) if (joining_type == value) return #value;
+  TRY(UC_JOINING_TYPE_U)
+  TRY(UC_JOINING_TYPE_T)
+  TRY(UC_JOINING_TYPE_C)
+  TRY(UC_JOINING_TYPE_L)
+  TRY(UC_JOINING_TYPE_R)
+  TRY(UC_JOINING_TYPE_D)
+#undef TRY
+  abort ();
+}
  
  static void
-output_scripts (const char *version)
+output_joining_type_test (const char *filename, const char *version)
  {
-  const char *filename = "unictype/scripts.h";
    FILE *stream;
-  unsigned int ch, s, i;
-  struct script_table t;
-  unsigned int level1_offset, level2_offset, level3_offset;
-
-  typedef struct
-  {
-    const char *lowercase_name;
-  }
-  scriptinfo_t;
-  scriptinfo_t scriptinfo[256];
+  bool need_comma;
+  unsigned int ch;
  
    stream = fopen (filename, "w");
    if (stream == NULL)
@@ -3646,81 +3929,74 @@ output_scripts (const char *version)
      }
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
-  fprintf (stream, "/* Unicode scripts.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Arabic joining type of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
-  for (s = 0; s < numscripts; s++)
+  need_comma = false;
+  for (ch = 0; ch < 0x110000; ch++)
      {
-      char *lcp = strdup (scripts[s]);
-      char *cp;
-
-      for (cp = lcp; *cp != '\0'; cp++)
-        if (*cp >= 'A' && *cp <= 'Z')
-          *cp += 'a' - 'A';
+      int value = unicode_joining_type[ch];
  
-      scriptinfo[s].lowercase_name = lcp;
+      if (value != (uint8_t)~(uint8_t)0)
+        {
+          if (need_comma)
+            fprintf (stream, ",\n");
+          fprintf (stream, "    { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
+          need_comma = true;
+        }
      }
+  if (need_comma)
+    fprintf (stream, "\n");
  
-  for (s = 0; s < numscripts; s++)
+  if (ferror (stream) || fclose (stream))
      {
-      fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
-               scriptinfo[s].lowercase_name);
-      fprintf (stream, "{\n");
-      i = 0;
-      for (ch = 0; ch < 0x110000; ch++)
-        if (unicode_scripts[ch] == s)
-          {
-            unsigned int start;
-            unsigned int end;
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
  
-            start = ch;
-            while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
-              ch++;
-            end = ch;
+/* Construction of sparse 3-level tables.  */
+#define TABLE joining_type_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
  
-            if (i > 0)
-              fprintf (stream, ",\n");
-            if (start == end)
-              fprintf (stream, "  { 0x%04X, 1, 1 }", start);
-            else
-              fprintf (stream, "  { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
-                       start, end);
-            i++;
-          }
-      fprintf (stream, "\n");
-      fprintf (stream, "};\n");
-    }
+static void
+output_joining_type (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int ch, i;
+  struct joining_type_table t;
+  unsigned int level1_offset, level2_offset, level3_offset;
+  uint8_t *level3_packed;
  
-  fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
-  fprintf (stream, "{\n");
-  for (s = 0; s < numscripts; s++)
+  stream = fopen (filename, "w");
+  if (stream == NULL)
      {
-      fprintf (stream, "  {\n");
-      fprintf (stream, "    sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
-               scriptinfo[s].lowercase_name);
-      fprintf (stream, "    script_%s_intervals,\n",
-               scriptinfo[s].lowercase_name);
-      fprintf (stream, "    \"%s\"\n", scripts[s]);
-      fprintf (stream, "  }");
-      if (s+1 < numscripts)
-        fprintf (stream, ",");
-      fprintf (stream, "\n");
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
      }
-  fprintf (stream, "};\n");
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Arabic joining type of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+           version);
  
    t.p = 7;
    t.q = 9;
-  script_table_init (&t);
+  joining_type_table_init (&t);
  
    for (ch = 0; ch < 0x110000; ch++)
      {
-      unsigned int s = unicode_scripts[ch];
-      if (s != (uint8_t)~(uint8_t)0)
-        script_table_add (&t, ch, s);
+      uint8_t value = unicode_joining_type[ch];
+
+      joining_type_table_add (&t, ch, value);
      }
  
-  script_table_finalize (&t);
+  joining_type_table_finalize (&t);
  
    /* Offsets in t.result, in memory of this process.  */
    level1_offset =
@@ -3734,16 +4010,17 @@ output_scripts (const char *version)
      + (t.level2_size << t.q) * sizeof (uint32_t);
  
    for (i = 0; i < 5; i++)
-    fprintf (stream, "#define script_header_%d %d\n", i,
+    fprintf (stream, "#define joining_type_header_%d %d\n", i,
               ((uint32_t *) t.result)[i]);
    fprintf (stream, "static const\n");
    fprintf (stream, "struct\n");
    fprintf (stream, "  {\n");
    fprintf (stream, "    int level1[%zu];\n", t.level1_size);
    fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
-  fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+  fprintf (stream, "    unsigned char level3[%zu * %d];\n", t.level3_size,
+           (1 << t.p) * 4 / 8);
    fprintf (stream, "  }\n");
-  fprintf (stream, "u_script =\n");
+  fprintf (stream, "u_joining_type =\n");
    fprintf (stream, "{\n");
    fprintf (stream, "  {");
    if (t.level1_size > 8)
@@ -3785,12 +4062,479 @@ output_scripts (const char *version)
    if (t.level2_size << t.q > 8)
      fprintf (stream, "\n ");
    fprintf (stream, " },\n");
-  fprintf (stream, "  {");
-  if (t.level3_size << t.p > 8)
-    fprintf (stream, "\n   ");
+  /* Pack the level3 array.  Each entry needs 4 bits only.  */
+  level3_packed =
+    (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
    for (i = 0; i < t.level3_size << t.p; i++)
      {
-      if (i > 0 && (i % 8) == 0)
+      unsigned int j = (i * 4) / 8;
+      unsigned int k = (i * 4) % 8;
+      uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
+      level3_packed[j] |= (value << k);
+    }
+  fprintf (stream, "  {");
+  if ((t.level3_size << t.p) * 4 / 8 > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
+    {
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      fprintf (stream, " 0x%02x", level3_packed[i]);
+      if (i+1 < (t.level3_size << t.p) * 4 / 8)
+        fprintf (stream, ",");
+    }
+  if ((t.level3_size << t.p) * 4 / 8 > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " }\n");
+  free (level3_packed);
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Convert a Joining_Group value to the name of its UC_JOINING_GROUP_* constant; abort if none matches.  */
+static const char *
+joining_group_as_c_identifier (int joining_group)
+{
+#define TRY(value) if (joining_group == value) return #value; /* #value: the constant's name as a string literal */
+  TRY(UC_JOINING_GROUP_NONE)
+  TRY(UC_JOINING_GROUP_AIN)
+  TRY(UC_JOINING_GROUP_ALAPH)
+  TRY(UC_JOINING_GROUP_ALEF)
+  TRY(UC_JOINING_GROUP_BEH)
+  TRY(UC_JOINING_GROUP_BETH)
+  TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
+  TRY(UC_JOINING_GROUP_DAL)
+  TRY(UC_JOINING_GROUP_DALATH_RISH)
+  TRY(UC_JOINING_GROUP_E)
+  TRY(UC_JOINING_GROUP_FARSI_YEH)
+  TRY(UC_JOINING_GROUP_FE)
+  TRY(UC_JOINING_GROUP_FEH)
+  TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
+  TRY(UC_JOINING_GROUP_GAF)
+  TRY(UC_JOINING_GROUP_GAMAL)
+  TRY(UC_JOINING_GROUP_HAH)
+  TRY(UC_JOINING_GROUP_HE)
+  TRY(UC_JOINING_GROUP_HEH)
+  TRY(UC_JOINING_GROUP_HEH_GOAL)
+  TRY(UC_JOINING_GROUP_HETH)
+  TRY(UC_JOINING_GROUP_KAF)
+  TRY(UC_JOINING_GROUP_KAPH)
+  TRY(UC_JOINING_GROUP_KHAPH)
+  TRY(UC_JOINING_GROUP_KNOTTED_HEH)
+  TRY(UC_JOINING_GROUP_LAM)
+  TRY(UC_JOINING_GROUP_LAMADH)
+  TRY(UC_JOINING_GROUP_MEEM)
+  TRY(UC_JOINING_GROUP_MIM)
+  TRY(UC_JOINING_GROUP_NOON)
+  TRY(UC_JOINING_GROUP_NUN)
+  TRY(UC_JOINING_GROUP_NYA)
+  TRY(UC_JOINING_GROUP_PE)
+  TRY(UC_JOINING_GROUP_QAF)
+  TRY(UC_JOINING_GROUP_QAPH)
+  TRY(UC_JOINING_GROUP_REH)
+  TRY(UC_JOINING_GROUP_REVERSED_PE)
+  TRY(UC_JOINING_GROUP_SAD)
+  TRY(UC_JOINING_GROUP_SADHE)
+  TRY(UC_JOINING_GROUP_SEEN)
+  TRY(UC_JOINING_GROUP_SEMKATH)
+  TRY(UC_JOINING_GROUP_SHIN)
+  TRY(UC_JOINING_GROUP_SWASH_KAF)
+  TRY(UC_JOINING_GROUP_SYRIAC_WAW)
+  TRY(UC_JOINING_GROUP_TAH)
+  TRY(UC_JOINING_GROUP_TAW)
+  TRY(UC_JOINING_GROUP_TEH_MARBUTA)
+  TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
+  TRY(UC_JOINING_GROUP_TETH)
+  TRY(UC_JOINING_GROUP_WAW)
+  TRY(UC_JOINING_GROUP_YEH)
+  TRY(UC_JOINING_GROUP_YEH_BARREE)
+  TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
+  TRY(UC_JOINING_GROUP_YUDH)
+  TRY(UC_JOINING_GROUP_YUDH_HE)
+  TRY(UC_JOINING_GROUP_ZAIN)
+  TRY(UC_JOINING_GROUP_ZHAIN)
+#undef TRY
+  abort (); /* JOINING_GROUP matched none of the constants above */
+}
+
+static void
+output_joining_group_test (const char *filename, const char *version)
+{
+  FILE *stream;
+  bool need_comma; /* true once an entry has been written */
+  unsigned int ch;
+  /* Writes to FILENAME one "{ 0xXXXX, UC_JOINING_GROUP_... }" entry per code point whose group is not NONE.  */
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Arabic joining group of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+           version);
+
+  need_comma = false;
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      int value = unicode_joining_group[ch];
+
+      if (value != UC_JOINING_GROUP_NONE)
+        {
+          if (need_comma) /* separate from the previous entry */
+            fprintf (stream, ",\n");
+          fprintf (stream, "    { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
+          need_comma = true;
+        }
+    }
+  if (need_comma) /* end the last entry's line; no trailing comma is written */
+    fprintf (stream, "\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+static void
+output_joining_group (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int ch_min, ch_max, ch, i;
+  /* Writes to FILENAME the flat table u_joining_group[], covering the code points ch_min..ch_max.  */
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Arabic joining group of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+           version);
+
+  ch_min = 0x10FFFF; /* becomes the lowest code point that has a joining group */
+  for (ch = 0; ch < 0x110000; ch++)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_min = ch;
+        break;
+      }
+
+  ch_max = 0; /* becomes the highest code point that has a joining group */
+  for (ch = 0x10FFFF; ch > 0; ch--)
+    if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
+      {
+        ch_max = ch;
+        break;
+      }
+
+  if (!(ch_min <= ch_max)) /* no code point has a joining group */
+    abort ();
+
+  /* If the interval [ch_min, ch_max] is too large, we should better use a
+     3-level table.  */
+  if (!(ch_max - ch_min < 0x200))
+    abort ();
+
+  fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
+  fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
+           ch_max + 1, ch_min);
+  fprintf (stream, "{");
+  for (i = 0; i <= ch_max - ch_min; i++)
+    {
+      const char *s;
+
+      ch = ch_min + i;
+      if ((i % 2) == 0) /* two entries per output line */
+        fprintf (stream, "\n ");
+      s = joining_group_as_c_identifier (unicode_joining_group[ch]);
+      fprintf (stream, " %s", s);
+      if (i+1 <= ch_max - ch_min)
+        {
+          fprintf (stream, ",");
+          if (((i+1) % 2) != 0) /* pad the first of the two names to 38 columns */
+            fprintf (stream, "%*s", 38 - (int) strlen (s), "");
+        }
+    }
+  fprintf (stream, "\n");
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Scripts.  */
+
+static const char *scripts[256];
+static unsigned int numscripts;
+
+static uint8_t unicode_scripts[0x110000];
+
+static void
+fill_scripts (const char *scripts_filename)
+{
+  FILE *stream;
+  unsigned int i;
+  /* Reads SCRIPTS_FILENAME and fills scripts[], numscripts and unicode_scripts[].  */
+  stream = fopen (scripts_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
+      exit (1);
+    }
+
+  numscripts = 0;
+
+  for (i = 0; i < 0x110000; i++)
+    unicode_scripts[i] = (uint8_t)~(uint8_t)0; /* marker: no script assigned */
+
+  for (;;)
+    {
+      char buf[200+1];
+      unsigned int i1, i2;
+      char padding[200+1];
+      char scriptname[200+1];
+      int script;
+
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1) /* up to 200 characters of the next line */
+        break;
+
+      if (buf[0] == '\0' || buf[0] == '#') /* skip comment lines */
+        continue;
+
+      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) /* range form */
+        {
+          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) /* single code point */
+            {
+              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
+              exit (1);
+            }
+          i2 = i1;
+        }
+      if (i2 < i1)
+        abort ();
+      if (i2 >= 0x110000)
+        abort ();
+
+      for (script = numscripts - 1; script >= 0; script--) /* already known? */
+        if (strcmp (scripts[script], scriptname) == 0)
+          break;
+      if (script < 0) /* first occurrence: register the name */
+        {
+          scripts[numscripts] = strdup (scriptname);
+          script = numscripts;
+          numscripts++;
+          if (numscripts == 256) /* index 255 would equal the "no script" marker */
+            abort ();
+        }
+
+      for (i = i1; i <= i2; i++)
+        {
+          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) /* warn only; the later entry wins */
+            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
+          unicode_scripts[i] = script;
+        }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
+      exit (1);
+    }
+}
+
+/* Construction of sparse 3-level tables.  */
+#define TABLE script_table
+#define ELEMENT uint8_t
+#define DEFAULT (uint8_t)~(uint8_t)0
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
+static void
+output_scripts (const char *version)
+{
+  const char *filename = "unictype/scripts.h";
+  FILE *stream;
+  unsigned int ch, s, i;
+  struct script_table t;
+  unsigned int level1_offset, level2_offset, level3_offset;
+
+  typedef struct
+  {
+    const char *lowercase_name;
+  }
+  scriptinfo_t;
+  scriptinfo_t scriptinfo[256];
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Unicode scripts.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+           version);
+
+  for (s = 0; s < numscripts; s++)
+    {
+      char *lcp = strdup (scripts[s]);
+      char *cp;
+
+      for (cp = lcp; *cp != '\0'; cp++)
+        if (*cp >= 'A' && *cp <= 'Z')
+          *cp += 'a' - 'A';
+
+      scriptinfo[s].lowercase_name = lcp;
+    }
+
+  for (s = 0; s < numscripts; s++)
+    {
+      fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
+               scriptinfo[s].lowercase_name);
+      fprintf (stream, "{\n");
+      i = 0;
+      for (ch = 0; ch < 0x110000; ch++)
+        if (unicode_scripts[ch] == s)
+          {
+            unsigned int start;
+            unsigned int end;
+
+            start = ch;
+            while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
+              ch++;
+            end = ch;
+
+            if (i > 0)
+              fprintf (stream, ",\n");
+            if (start == end)
+              fprintf (stream, "  { 0x%04X, 1, 1 }", start);
+            else
+              fprintf (stream, "  { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
+                       start, end);
+            i++;
+          }
+      fprintf (stream, "\n");
+      fprintf (stream, "};\n");
+    }
+
+  fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
+  fprintf (stream, "{\n");
+  for (s = 0; s < numscripts; s++)
+    {
+      fprintf (stream, "  {\n");
+      fprintf (stream, "    sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
+               scriptinfo[s].lowercase_name);
+      fprintf (stream, "    script_%s_intervals,\n",
+               scriptinfo[s].lowercase_name);
+      fprintf (stream, "    \"%s\"\n", scripts[s]);
+      fprintf (stream, "  }");
+      if (s+1 < numscripts)
+        fprintf (stream, ",");
+      fprintf (stream, "\n");
+    }
+  fprintf (stream, "};\n");
+
+  t.p = 7;
+  t.q = 9;
+  script_table_init (&t);
+
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      unsigned int s = unicode_scripts[ch];
+      if (s != (uint8_t)~(uint8_t)0)
+        script_table_add (&t, ch, s);
+    }
+
+  script_table_finalize (&t);
+
+  /* Offsets in t.result, in memory of this process.  */
+  level1_offset =
+    5 * sizeof (uint32_t);
+  level2_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t);
+  level3_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t)
+    + (t.level2_size << t.q) * sizeof (uint32_t);
+
+  for (i = 0; i < 5; i++)
+    fprintf (stream, "#define script_header_%d %d\n", i,
+             ((uint32_t *) t.result)[i]);
+  fprintf (stream, "static const\n");
+  fprintf (stream, "struct\n");
+  fprintf (stream, "  {\n");
+  fprintf (stream, "    int level1[%zu];\n", t.level1_size);
+  fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
+  fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
+  fprintf (stream, "  }\n");
+  fprintf (stream, "u_script =\n");
+  fprintf (stream, "{\n");
+  fprintf (stream, "  {");
+  if (t.level1_size > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < t.level1_size; i++)
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      offset = ((uint32_t *) (t.result + level1_offset))[i];
+      if (offset == 0)
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level2_offset) / sizeof (uint32_t));
+      if (i+1 < t.level1_size)
+        fprintf (stream, ",");
+    }
+  if (t.level1_size > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+  fprintf (stream, "  {");
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < t.level2_size << t.q; i++)
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      offset = ((uint32_t *) (t.result + level2_offset))[i];
+      if (offset == 0)
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level3_offset) / sizeof (uint8_t));
+      if (i+1 < t.level2_size << t.q)
+        fprintf (stream, ",");
+    }
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+  fprintf (stream, "  {");
+  if (t.level3_size << t.p > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < t.level3_size << t.p; i++)
+    {
+      if (i > 0 && (i % 8) == 0)
          fprintf (stream, "\n   ");
        fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
        if (i+1 < t.level3_size << t.p)
@@ -3824,9 +4568,9 @@ output_scripts_byname (const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Unicode scripts.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
-  fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
+  fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
    fprintf (stream, "%%struct-type\n");
    fprintf (stream, "%%language=ANSI-C\n");
    fprintf (stream, "%%define hash-function-name scripts_hash\n");
@@ -3834,6 +4578,8 @@ output_scripts_byname (const char *version)
    fprintf (stream, "%%readonly-tables\n");
    fprintf (stream, "%%global-table\n");
    fprintf (stream, "%%define word-array-name script_names\n");
+  fprintf (stream, "%%pic\n");
+  fprintf (stream, "%%define string-pool-name script_stringpool\n");
    fprintf (stream, "%%%%\n");
    for (s = 0; s < numscripts; s++)
      fprintf (stream, "%s, %u\n", scripts[s], s);
@@ -3964,7 +4710,7 @@ output_blocks (const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Unicode blocks.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    fprintf (stream, "static const uc_block_t blocks[] =\n");
@@ -4381,7 +5127,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Language syntax properties of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7; /* or 8 */
@@ -4926,7 +5672,7 @@ output_tables (const char *filename, const char *version)
    fprintf (stream, "escape_char /\n");
    fprintf (stream, "comment_char %%\n");
    fprintf (stream, "\n");
-  fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
+  fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
             version);
    fprintf (stream, "\n");
  
@@ -5134,61 +5880,291 @@ fill_width (const char *width_filename)
            unicode_width[i] = strdup (field1);
          }
      }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", width_filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Non-spacing attribute and width.  */
+
+/* The non-spacing attribute table consists of:
+   - Non-spacing characters; generated from PropList.txt or
+     "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
+   - Control and format control characters; generated from
+     "grep '^[^;]*;[^;]*;C[cf];' UnicodeData.txt"
+   - Zero width characters; generated from
+     "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+ */
+
+static bool
+is_nonspacing (unsigned int ch)
+{
+  const char *name = unicode_attributes[ch].name;
+  return (name != NULL /* code points without a name never qualify */
+          && (get_bidi_category (ch) == UC_BIDI_NSM || is_category_Cc (ch)
+              || is_category_Cf (ch) || strncmp (name, "ZERO WIDTH ", 11) == 0));
+}
+
+static void
+output_nonspacing_property (const char *filename)
+{
+  FILE *stream;
+  int ind[0x110000 / 0x200]; /* per 0x200-block: its slot in the data table, or -1 */
+  unsigned int i;
+  unsigned int i_max; /* highest block index that got a slot */
+  int next_ind; /* number of slots assigned so far */
+  /* Writes to FILENAME the bitmap nonspacing_table_data[] and its block index nonspacing_table_ind[].  */
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  next_ind = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = false;
+      unsigned int ch;
+
+      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
+        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
+          if (is_nonspacing (ch))
+            {
+              nontrivial = true;
+              break;
+            }
+      if (nontrivial)
+        ind[i] = next_ind++;
+      else
+        ind[i] = -1;
+    }
+
+  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
+           next_ind);
+  i_max = 0;
+  for (i = 0; i < 0x110000 / 0x200; i++)
+    {
+      bool nontrivial = (ind[i] >= 0);
+
+      if (nontrivial)
+        {
+          unsigned int j;
+
+          fprintf (stream, "  /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
+          for (j = 0; j < 8; j++) /* 8 rows of 8 bytes: 64 bytes = 512 bits per block */
+            {
+              unsigned int k;
+
+              fprintf (stream, " ");
+              for (k = 0; k < 8; k++)
+                {
+                  unsigned int l;
+                  unsigned char bits = 0;
+
+                  for (l = 0; l < 8; l++)
+                    {
+                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
+
+                      if (is_nonspacing (ch))
+                        bits |= 1 << l; /* bit l stands for the l-th code point of this byte */
+                    }
+                  fprintf (stream, " 0x%02x%c", bits,
+                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
+                }
+              fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
+            }
+          i_max = i;
+        }
+    }
+  fprintf (stream, "};\n");
+
+  i_max = (i_max / 8 + 1) * 8; /* entries 0..i_max inclusive, rounded up to a multiple of 8 */
+  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
+           i_max);
+  {
+    unsigned int j;
+
+    for (j = 0; j < i_max / 8; j++)
+      {
+        unsigned int k;
+
+        fprintf (stream, " ");
+        for (k = 0; k < 8; k++)
+          {
+            i = j * 8 + k;
+            fprintf (stream, " %2d%c", ind[i],
+                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
+          }
+        fprintf (stream, " /* 0x%04x-0x%04x */\n",
+                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
+      }
+  }
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Returns the width of ch as one of 0, '0', '1', '2', 'A'; 0 marks code points that output_width_property_test skips.  */
+static char
+symbolic_width (unsigned int ch)
+{
+  /* Test for unassigned character.  */
+  if (is_property_unassigned_code_value (ch))
+    {
+      /* Unicode TR#11 section "Unassigned and Private-Use Characters".  */
+      if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
+        return 'A';
+      if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
+          || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
+          || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
+          || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
+          || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
+        return '2';
+      return 0; /* any other unassigned code point */
+    }
+  else
+    {
+      /* Test for non-spacing or control character.  */
+      if (is_category_Cc (ch) && ch < 0x00A0)
+        return 0; /* checked first: is_nonspacing would also accept these Cc characters */
+      if (is_nonspacing (ch))
+        return '0';
+      /* Test for double-width character.  */
+      if (unicode_width[ch] != NULL
+          && (strcmp (unicode_width[ch], "W") == 0
+              || strcmp (unicode_width[ch], "F") == 0))
+        return '2';
+      /* Test for half-width character.  */
+      if (unicode_width[ch] != NULL
+          && strcmp (unicode_width[ch], "H") == 0)
+        return '1';
+    }
+  /* In ancient CJK encodings, Cyrillic and most other characters are
+     double-width as well.  */
+  if (ch >= 0x00A1 && ch < 0x10000) /* only assigned characters get here */
+    return 'A';
+  return '1'; /* default for everything else */
+}
+
+static void
+output_width_property_test (const char *filename)
+{
+  FILE *stream;
+  unsigned int interval_start, interval_end, ch;
+  char interval_value; /* width of the pending interval; 0 while there is none */
+  /* Writes to FILENAME runs of equal symbolic_width as "XXXX..YYYY<TAB>W" lines; skipped code points do not end a run.  */
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  interval_value = 0;
+  interval_start = interval_end = 0; /* avoid GCC warning */
+  for (ch = 0; ch < 0x110000; ch++)
+    {
+      char value = symbolic_width (ch);
+      if (value != 0) /* skip Cc control characters and unassigned characters */
+        {
+          if (value == interval_value)
+            /* Extend the interval.  */
+            interval_end = ch;
+          else
+            {
+              /* Terminate the interval.  */
+              if (interval_value != 0)
+                {
+                  if (interval_end == interval_start) /* single code point: no ".." range */
+                    fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+                  else
+                    fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+                }
+              /* Start a new interval.  */
+              interval_start = interval_end = ch;
+              interval_value = value;
+            }
+        }
+    }
+  /* Terminate the last interval.  */
+  if (interval_value != 0)
+    {
+      if (interval_end == interval_start)
+        fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
+      else
+        fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
+    }
+
    if (ferror (stream) || fclose (stream))
      {
-      fprintf (stderr, "error reading from '%s'\n", width_filename);
+      fprintf (stderr, "error writing to '%s'\n", filename);
        exit (1);
      }
  }
  
-/* Line breaking classification.  */
+/* ========================================================================= */
+
+/* Line breaking classification.
+   Updated for Unicode TR #14 revision 26.  */
  
  enum
  {
-  /* Values >= 24 are resolved at run time. */
-  LBP_BK = 24, /* mandatory break */
+  /* Values >= 25 are resolved at run time. */
+  LBP_BK = 25, /* mandatory break */
  /*LBP_CR,         carriage return - not used here because it's a DOSism */
  /*LBP_LF,         line feed - not used here because it's a DOSism */
-  LBP_CM = 25, /* attached characters and combining marks */
+  LBP_CM = 26, /* attached characters and combining marks */
  /*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
  /*LBP_SG,         surrogates - not used here because they are not characters */
    LBP_WJ =  0, /* word joiner */
-  LBP_ZW = 26, /* zero width space */
+  LBP_ZW = 27, /* zero width space */
    LBP_GL =  1, /* non-breaking (glue) */
-  LBP_SP = 27, /* space */
+  LBP_SP = 28, /* space */
    LBP_B2 =  2, /* break opportunity before and after */
    LBP_BA =  3, /* break opportunity after */
    LBP_BB =  4, /* break opportunity before */
    LBP_HY =  5, /* hyphen */
-  LBP_CB = 28, /* contingent break opportunity */
+  LBP_CB = 29, /* contingent break opportunity */
    LBP_CL =  6, /* closing punctuation */
-  LBP_EX =  7, /* exclamation/interrogation */
-  LBP_IN =  8, /* inseparable */
-  LBP_NS =  9, /* non starter */
-  LBP_OP = 10, /* opening punctuation */
-  LBP_QU = 11, /* ambiguous quotation */
-  LBP_IS = 12, /* infix separator (numeric) */
-  LBP_NU = 13, /* numeric */
-  LBP_PO = 14, /* postfix (numeric) */
-  LBP_PR = 15, /* prefix (numeric) */
-  LBP_SY = 16, /* symbols allowing breaks */
-  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
-  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
-  LBP_H2 = 18, /* Hangul LV syllable */
-  LBP_H3 = 19, /* Hangul LVT syllable */
-  LBP_ID = 20, /* ideographic */
-  LBP_JL = 21, /* Hangul L Jamo */
-  LBP_JV = 22, /* Hangul V Jamo */
-  LBP_JT = 23, /* Hangul T Jamo */
-  LBP_SA = 30, /* complex context (South East Asian) */
-  LBP_XX = 31  /* unknown */
+  LBP_CP =  7, /* closing parenthesis */
+  LBP_EX =  8, /* exclamation/interrogation */
+  LBP_IN =  9, /* inseparable */
+  LBP_NS = 10, /* non starter */
+  LBP_OP = 11, /* opening punctuation */
+  LBP_QU = 12, /* ambiguous quotation */
+  LBP_IS = 13, /* infix separator (numeric) */
+  LBP_NU = 14, /* numeric */
+  LBP_PO = 15, /* postfix (numeric) */
+  LBP_PR = 16, /* prefix (numeric) */
+  LBP_SY = 17, /* symbols allowing breaks */
+  LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+  LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+  LBP_H2 = 19, /* Hangul LV syllable */
+  LBP_H3 = 20, /* Hangul LVT syllable */
+  LBP_ID = 21, /* ideographic */
+  LBP_JL = 22, /* Hangul L Jamo */
+  LBP_JV = 23, /* Hangul V Jamo */
+  LBP_JT = 24, /* Hangul T Jamo */
+  LBP_SA = 31, /* complex context (South East Asian) */
+  LBP_XX = 32  /* unknown; 32 exceeds the bits of a 32-bit mask, cf. the int64_t mask in get_lbp */
  };
  
  /* Returns the line breaking classification for ch, as a bit mask.  */
-static int
+static int64_t
  get_lbp (unsigned int ch)
  {
-  int attr = 0;
+  int64_t attr = 0;
  
    if (unicode_attributes[ch].name != NULL)
      {
@@ -5198,15 +6174,15 @@ get_lbp (unsigned int ch)
            || ch == 0x000B /* line tabulation */
            || ch == 0x2028 /* LINE SEPARATOR */
            || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
-        attr |= 1 << LBP_BK;
+        attr |= (int64_t) 1 << LBP_BK;
  
        if (ch == 0x2060 /* WORD JOINER */
            || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
-        attr |= 1 << LBP_WJ;
+        attr |= (int64_t) 1 << LBP_WJ;
  
        /* zero width space */
        if (ch == 0x200B /* ZERO WIDTH SPACE */)
-        attr |= 1 << LBP_ZW;
+        attr |= (int64_t) 1 << LBP_ZW;
  
        /* non-breaking (glue) */
        if (ch == 0x00A0 /* NO-BREAK SPACE */
@@ -5218,19 +6194,23 @@ get_lbp (unsigned int ch)
            || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
            || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
            || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
-          || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
-        attr |= 1 << LBP_GL;
+          || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
+          || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
+        attr |= (int64_t) 1 << LBP_GL;
  
        /* space */
        if (ch == 0x0020 /* SPACE */)
-        attr |= 1 << LBP_SP;
+        attr |= (int64_t) 1 << LBP_SP;
  
        /* break opportunity before and after */
        if (ch == 0x2014 /* EM DASH */)
-        attr |= 1 << LBP_B2;
+        attr |= (int64_t) 1 << LBP_B2;
  
        /* break opportunity after */
-      if (ch == 0x1680 /* OGHAM SPACE MARK */
+      if (/* Breaking Spaces */
+          ch == 0x1680 /* OGHAM SPACE MARK */
            || ch == 0x2000 /* EN QUAD */
            || ch == 0x2001 /* EM QUAD */
            || ch == 0x2002 /* EN SPACE */
@@ -5242,12 +6222,17 @@ get_lbp (unsigned int ch)
            || ch == 0x2009 /* THIN SPACE */
            || ch == 0x200A /* HAIR SPACE */
            || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
+          /* Tabs */
            || ch == 0x0009 /* tab */
+          /* Conditional Hyphens */
            || ch == 0x00AD /* SOFT HYPHEN */
+          /* Breaking Hyphens */
            || ch == 0x058A /* ARMENIAN HYPHEN */
+          || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
            || ch == 0x2010 /* HYPHEN */
            || ch == 0x2012 /* FIGURE DASH */
            || ch == 0x2013 /* EN DASH */
+          /* Visible Word Dividers */
            || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
            || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
            || ch == 0x1361 /* ETHIOPIC WORDSPACE */
@@ -5255,6 +6240,7 @@ get_lbp (unsigned int ch)
            || ch == 0x17DA /* KHMER SIGN KOOMUUT */
            || ch == 0x2027 /* HYPHENATION POINT */
            || ch == 0x007C /* VERTICAL LINE */
+          /* Historic Word Separators */
            || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
            || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
            || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
@@ -5271,6 +6257,7 @@ get_lbp (unsigned int ch)
            || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
            || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
            || ch == 0x2E30 /* RING POINT */
+          || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
            || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
            || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
            || ch == 0x10102 /* AEGEAN CHECK MARK */
@@ -5278,6 +6265,7 @@ get_lbp (unsigned int ch)
            || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
            || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
            || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
+          /* Dandas */
            || ch == 0x0964 /* DEVANAGARI DANDA */
            || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
            || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
@@ -5297,17 +6285,18 @@ get_lbp (unsigned int ch)
            || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
            || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
            || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
+          /* Tibetan */
            || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
            || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
            || ch == 0x0F85 /* TIBETAN MARK PALUTA */
            || ch == 0x0FBE /* TIBETAN KU RU KHA */
            || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
            || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
+          /* Other Terminating Punctuation */
            || ch == 0x1804 /* MONGOLIAN COLON */
            || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
            || ch == 0x1B5A /* BALINESE PANTI */
            || ch == 0x1B5B /* BALINESE PAMADA */
-          || ch == 0x1B5C /* BALINESE WINDU */
            || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
            || ch == 0x1B60 /* BALINESE PAMENENG */
            || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
@@ -5334,10 +6323,36 @@ get_lbp (unsigned int ch)
            || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
            || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
+          || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
+          || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
+          || ch == 0xA6F3 /* BAMUM FULL STOP */
+          || ch == 0xA6F4 /* BAMUM COLON */
+          || ch == 0xA6F5 /* BAMUM COMMA */
+          || ch == 0xA6F6 /* BAMUM SEMICOLON */
+          || ch == 0xA6F7 /* BAMUM QUESTION MARK */
+          || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
+          || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
+          || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+          || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
+          || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
+          || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
+          || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
+          || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
+          || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
+          || ch == 0x11047 /* BRAHMI DANDA */
+          || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
+          || ch == 0x110BE /* KAITHI SECTION MARK */
+          || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
+          || ch == 0x110C0 /* KAITHI DANDA */
+          || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
            || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
            || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
            || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
-        attr |= 1 << LBP_BA;
+        attr |= (int64_t) 1 << LBP_BA;
  
        /* break opportunity before */
        if (ch == 0x00B4 /* ACUTE ACCENT */
@@ -5359,19 +6374,25 @@ get_lbp (unsigned int ch)
            || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
            || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
            || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
-        attr |= 1 << LBP_BB;
+        attr |= (int64_t) 1 << LBP_BB;
  
        /* hyphen */
        if (ch == 0x002D /* HYPHEN-MINUS */)
-        attr |= 1 << LBP_HY;
+        attr |= (int64_t) 1 << LBP_HY;
  
        /* contingent break opportunity */
        if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
-        attr |= 1 << LBP_CB;
+        attr |= (int64_t) 1 << LBP_CB;
+
+      /* closing parenthesis */
+      if (ch == 0x0029 /* RIGHT PARENTHESIS */
+          || ch == 0x005D /* RIGHT SQUARE BRACKET */)
+        attr |= (int64_t) 1 << LBP_CP;
  
        /* closing punctuation */
        if ((unicode_attributes[ch].category[0] == 'P'
-           && unicode_attributes[ch].category[1] == 'e')
+           && unicode_attributes[ch].category[1] == 'e'
+           && !(attr & ((int64_t) 1 << LBP_CP)))
            || ch == 0x3001 /* IDEOGRAPHIC COMMA */
            || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
            || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
@@ -5381,8 +6402,17 @@ get_lbp (unsigned int ch)
            || ch == 0xFF0C /* FULLWIDTH COMMA */
            || ch == 0xFF0E /* FULLWIDTH FULL STOP */
            || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
-          || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
-        attr |= 1 << LBP_CL;
+          || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
+          || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
+          || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
+          || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
+          || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
+          || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
+          || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
+          || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
+        attr |= (int64_t) 1 << LBP_CL;
  
        /* exclamation/interrogation */
        if (ch == 0x0021 /* EXCLAMATION MARK */
@@ -5410,9 +6440,6 @@ get_lbp (unsigned int ch)
            || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
            || ch == 0x2CFE /* COPTIC FULL STOP */
            || ch == 0x2E2E /* REVERSED QUESTION MARK */
-#if REVISION_22
-          || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
-#endif
            || ch == 0xA60E /* VAI FULL STOP */
            || ch == 0xA876 /* PHAGS-PA MARK SHAD */
            || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
@@ -5422,14 +6449,14 @@ get_lbp (unsigned int ch)
            || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
            || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
            || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
-        attr |= 1 << LBP_EX;
+        attr |= (int64_t) 1 << LBP_EX;
  
        /* inseparable */
        if (ch == 0x2024 /* ONE DOT LEADER */
            || ch == 0x2025 /* TWO DOT LEADER */
            || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
            || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
-        attr |= 1 << LBP_IN;
+        attr |= (int64_t) 1 << LBP_IN;
  
        /* non starter */
        if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
@@ -5462,15 +6489,22 @@ get_lbp (unsigned int ch)
            || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
            || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
            || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
-        attr |= 1 << LBP_NS;
+        attr |= (int64_t) 1 << LBP_NS;
  
        /* opening punctuation */
        if ((unicode_attributes[ch].category[0] == 'P'
             && unicode_attributes[ch].category[1] == 's')
            || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
            || ch == 0x00BF /* INVERTED QUESTION MARK */
-          || ch == 0x2E18 /* INVERTED INTERROBANG */)
-        attr |= 1 << LBP_OP;
+          || ch == 0x2E18 /* INVERTED INTERROBANG */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
+          || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
+          || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
+          || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
+          || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
+          || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
+        attr |= (int64_t) 1 << LBP_OP;
  
        /* ambiguous quotation */
        if ((unicode_attributes[ch].category[0] == 'P'
@@ -5488,7 +6522,7 @@ get_lbp (unsigned int ch)
            || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
            || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
            || ch == 0x2E0B /* RAISED SQUARE */)
-        attr |= 1 << LBP_QU;
+        attr |= (int64_t) 1 << LBP_QU;
  
        /* infix separator (numeric) */
        if (ch == 0x002C /* COMMA */
@@ -5504,7 +6538,7 @@ get_lbp (unsigned int ch)
            || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
            || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
            || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
-        attr |= 1 << LBP_IS;
+        attr |= (int64_t) 1 << LBP_IS;
  
        /* numeric */
        if ((unicode_attributes[ch].category[0] == 'N'
@@ -5512,7 +6546,7 @@ get_lbp (unsigned int ch)
             && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
            || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
            || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
-        attr |= 1 << LBP_NU;
+        attr |= (int64_t) 1 << LBP_NU;
  
        /* postfix (numeric) */
        if (ch == 0x0025 /* PERCENT SIGN */
@@ -5538,8 +6572,13 @@ get_lbp (unsigned int ch)
            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
            || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
            || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
-          || ch == 0x0D79 /* MALAYALAM DATE MARK */)
-        attr |= 1 << LBP_PO;
+          || ch == 0x09F2 /* BENGALI RUPEE MARK */
+          || ch == 0x09F3 /* BENGALI RUPEE SIGN */
+          || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
+          || ch == 0x0D79 /* MALAYALAM DATE MARK */
+          || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
+          || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
+        attr |= (int64_t) 1 << LBP_PO;
  
        /* prefix (numeric) */
        if ((unicode_attributes[ch].category[0] == 'S'
@@ -5550,27 +6589,27 @@ get_lbp (unsigned int ch)
            || ch == 0x2116 /* NUMERO SIGN */
            || ch == 0x2212 /* MINUS SIGN */
            || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
-        if (!(attr & (1 << LBP_PO)))
-          attr |= 1 << LBP_PR;
+        if (!(attr & ((int64_t) 1 << LBP_PO)))
+          attr |= (int64_t) 1 << LBP_PR;
  
        /* symbols allowing breaks */
        if (ch == 0x002F /* SOLIDUS */)
-        attr |= 1 << LBP_SY;
+        attr |= (int64_t) 1 << LBP_SY;
  
        if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
-        attr |= 1 << LBP_H2;
+        attr |= (int64_t) 1 << LBP_H2;
  
        if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
-        attr |= 1 << LBP_H3;
+        attr |= (int64_t) 1 << LBP_H3;
  
-      if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
-        attr |= 1 << LBP_JL;
+      if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
+        attr |= (int64_t) 1 << LBP_JL;
  
-      if (ch >= 0x1160 && ch <= 0x11A2)
-        attr |= 1 << LBP_JV;
+      if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
+        attr |= (int64_t) 1 << LBP_JV;
  
-      if (ch >= 0x11A8 && ch <= 0x11F9)
-        attr |= 1 << LBP_JT;
+      if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
+        attr |= (int64_t) 1 << LBP_JT;
  
        /* complex context (South East Asian) */
        if (((unicode_attributes[ch].category[0] == 'C'
@@ -5580,17 +6619,24 @@ get_lbp (unsigned int ch)
                     || unicode_attributes[ch].category[1] == 'o'))
             || (unicode_attributes[ch].category[0] == 'M'
                 && (unicode_attributes[ch].category[1] == 'c'
-                   || unicode_attributes[ch].category[1] == 'n'))
+                   || unicode_attributes[ch].category[1] == 'n')
+               && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
             /* Extra characters for compatibility with Unicode LineBreak.txt.  */
             || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
             || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
+           || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
             || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
-           || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
-          && ((ch >= 0x0E00 && ch <= 0x0EFF)
-              || (ch >= 0x1000 && ch <= 0x109F)
-              || (ch >= 0x1780 && ch <= 0x17FF)
-              || (ch >= 0x1950 && ch <= 0x19DF)))
-        attr |= 1 << LBP_SA;
+           || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
+           || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
+           || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
+           || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
+          && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
+              || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
+              || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
+              || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
+              || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
+              || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
+        attr |= (int64_t) 1 << LBP_SA;
  
        /* attached characters and combining marks */
        if ((unicode_attributes[ch].category[0] == 'M'
@@ -5599,17 +6645,18 @@ get_lbp (unsigned int ch)
                 || unicode_attributes[ch].category[1] == 'n'))
            || (unicode_attributes[ch].category[0] == 'C'
                && (unicode_attributes[ch].category[1] == 'c'
-                  || unicode_attributes[ch].category[1] == 'f')))
-        if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
-          attr |= 1 << LBP_CM;
+                  || unicode_attributes[ch].category[1] == 'f')
+              && ch != 0x110BD /* KAITHI NUMBER SIGN */))
+        if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+          attr |= (int64_t) 1 << LBP_CM;
  
        /* ideographic */
        if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
            || ch == 0x3000 /* IDEOGRAPHIC SPACE */
            || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
            || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
-          || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
-          || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
+          || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
+          || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
            || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
            || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
            || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
@@ -5623,7 +6670,7 @@ get_lbp (unsigned int ch)
            || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
            || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
            || (ch >= 0x3000 && ch <= 0x33FF
-              && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
+              && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
            || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
            || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
@@ -5666,8 +6713,14 @@ get_lbp (unsigned int ch)
            || ch == 0xFF5E /* FULLWIDTH TILDE */
            || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
            || ch == 0xFFE3 /* FULLWIDTH MACRON */
-          || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
-        if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
+          || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
+          || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
+          || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
+          || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
+          || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
+        if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
            {
              /* ambiguous (ideograph) ? */
              if ((unicode_width[ch] != NULL
@@ -5675,9 +6728,9 @@ get_lbp (unsigned int ch)
                   && ch >= 0x2000)
                  || ch == 0x24EA /* CIRCLED DIGIT ZERO */
                  || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
-              attr |= 1 << LBP_AI;
+              attr |= (int64_t) 1 << LBP_AI;
              else
-              attr |= 1 << LBP_ID;
+              attr |= (int64_t) 1 << LBP_ID;
            }
  
        /* ordinary alphabetic and symbol characters */
@@ -5707,8 +6760,10 @@ get_lbp (unsigned int ch)
            || ch == 0x2061 /* FUNCTION APPLICATION */
            || ch == 0x2062 /* INVISIBLE TIMES */
            || ch == 0x2063 /* INVISIBLE SEPARATOR */
-          || ch == 0x2064 /* INVISIBLE PLUS */)
-        if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
+          || ch == 0x2064 /* INVISIBLE PLUS */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x110BD /* KAITHI NUMBER SIGN */)
+        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
            {
              /* ambiguous (alphabetic) ? */
              if ((unicode_width[ch] != NULL
@@ -5726,8 +6781,6 @@ get_lbp (unsigned int ch)
                   && ch != 0x21E7 /* UPWARDS WHITE ARROW */
                   && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
                   && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
-#if !REVISION_22
-                || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
                  || ch == 0x00A7 /* SECTION SIGN */
                  || ch == 0x00A8 /* DIAERESIS */
                  || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
@@ -5741,7 +6794,6 @@ get_lbp (unsigned int ch)
                  || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
                  || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
                  || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
-                || ch == 0x00BF /* INVERTED QUESTION MARK */
                  || ch == 0x00D7 /* MULTIPLICATION SIGN */
                  || ch == 0x00F7 /* DIVISION SIGN */
                  || ch == 0x02C7 /* CARON */
@@ -5755,7 +6807,6 @@ get_lbp (unsigned int ch)
                  || ch == 0x02DA /* RING ABOVE */
                  || ch == 0x02DB /* OGONEK */
                  || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
-#endif
                  || ch == 0x24EA /* CIRCLED DIGIT ZERO */
                  || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
                  /* Extra characters for compatibility with Unicode LineBreak.txt.  */
@@ -5763,16 +6814,30 @@ get_lbp (unsigned int ch)
                  || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
                  || ch == 0x2616 /* WHITE SHOGI PIECE */
                  || ch == 0x2617 /* BLACK SHOGI PIECE */)
-              attr |= 1 << LBP_AI;
+              attr |= (int64_t) 1 << LBP_AI;
              else
-              attr |= 1 << LBP_AL;
-            attr &= ~(1 << LBP_CM);
+              attr |= (int64_t) 1 << LBP_AL;
+            attr &= ~((int64_t) 1 << LBP_CM);
            }
      }
+  else
+    {
+      /* Unassigned character.  */
+      if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+          || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
+          || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+          || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
+          || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
+                                                 Supplementary Ideographic Plane (Plane 2) outside of blocks */
+          || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
+                                                 Supplementary Ideographic Plane (Plane 2) outside of blocks */
+          || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
+        attr |= (int64_t) 1 << LBP_ID;
+    }
  
    if (attr == 0)
      /* unknown */
-    attr |= 1 << LBP_XX;
+    attr |= (int64_t) 1 << LBP_XX;
  
    return attr;
  }
@@ -5785,12 +6850,12 @@ debug_output_lbp (FILE *stream)
  
    for (i = 0; i < 0x110000; i++)
      {
-      int attr = get_lbp (i);
-      if (attr != 1 << LBP_XX)
+      int64_t attr = get_lbp (i);
+      if (attr != (int64_t) 1 << LBP_XX)
          {
            fprintf (stream, "0x%04X", i);
  #define PRINT_BIT(attr,bit) \
-  if (attr & (1 << bit)) fprintf (stream, " " #bit);
+  if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
            PRINT_BIT(attr,LBP_BK);
            PRINT_BIT(attr,LBP_CM);
            PRINT_BIT(attr,LBP_WJ);
@@ -5803,6 +6868,7 @@ debug_output_lbp (FILE *stream)
            PRINT_BIT(attr,LBP_HY);
            PRINT_BIT(attr,LBP_CB);
            PRINT_BIT(attr,LBP_CL);
+          PRINT_BIT(attr,LBP_CP);
            PRINT_BIT(attr,LBP_EX);
            PRINT_BIT(attr,LBP_IN);
            PRINT_BIT(attr,LBP_NS);
@@ -5916,6 +6982,7 @@ fill_org_lbp (const char *linebreak_filename)
        TRY(LBP_HY)
        TRY(LBP_CB)
        TRY(LBP_CL)
+      TRY(LBP_CP)
        TRY(LBP_EX)
        TRY(LBP_IN)
        TRY(LBP_NS)
@@ -5961,6 +7028,7 @@ fill_org_lbp (const char *linebreak_filename)
            unicode_org_lbp[i] = value;
          }
      }
+
    if (ferror (stream) || fclose (stream))
      {
        fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
@@ -5994,6 +7062,7 @@ debug_output_org_lbp (FILE *stream)
            PRINT_BIT(attr,LBP_HY);
            PRINT_BIT(attr,LBP_CB);
            PRINT_BIT(attr,LBP_CL);
+          PRINT_BIT(attr,LBP_CP);
            PRINT_BIT(attr,LBP_EX);
            PRINT_BIT(attr,LBP_IN);
            PRINT_BIT(attr,LBP_NS);
@@ -6062,13 +7131,13 @@ output_lbp (FILE *stream1, FILE *stream2)
  
    for (i = 0; i < 0x110000; i++)
      {
-      int attr = get_lbp (i);
+      int64_t attr = get_lbp (i);
  
        /* Now attr should contain exactly one bit.  */
        if (attr == 0 || ((attr & (attr - 1)) != 0))
          abort ();
  
-      if (attr != 1 << LBP_XX)
+      if (attr != (int64_t) 1 << LBP_XX)
          {
            unsigned int log2_attr;
            for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
@@ -6166,6 +7235,7 @@ output_lbp (FILE *stream1, FILE *stream2)
            CASE(LBP_HY);
            CASE(LBP_CB);
            CASE(LBP_CL);
+          CASE(LBP_CP);
            CASE(LBP_EX);
            CASE(LBP_IN);
            CASE(LBP_NS);
@@ -6227,7 +7297,7 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve
  
        fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
        fprintf (stream, "/* Line breaking properties of Unicode characters.  */\n");
-      fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s.  */\n",
+      fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
                 version);
        fprintf (stream, "\n");
  
@@ -6265,7 +7335,8 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve
  
  /* ========================================================================= */
  
-/* Word break property.  */
+/* Word break property.
+   Updated for Unicode TR #29 revision 17.  */
  
  /* Possible values of the Word_Break property.  */
  enum
@@ -6311,7 +7382,7 @@ get_wbp (unsigned int ch)
  
        if (unicode_attributes[ch].category != NULL
            && strcmp (unicode_attributes[ch].category, "Cf") == 0
-          && ch != 0x200C && ch != 0x200D)
+          && ch != 0x200B && ch != 0x200C && ch != 0x200D)
          attr |= 1 << WBP_FORMAT;
  
        if ((unicode_scripts[ch] < numscripts
@@ -6331,12 +7402,10 @@ get_wbp (unsigned int ch)
            && (attr & (1 << WBP_EXTEND)) == 0)
          attr |= 1 << WBP_ALETTER;
  
-      if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
-          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
+      if (is_WBP_MIDNUMLET (ch))
          attr |= 1 << WBP_MIDNUMLET;
  
-      if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
-          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
+      if (is_WBP_MIDLETTER (ch))
          attr |= 1 << WBP_MIDLETTER;
  
        if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
@@ -6709,7 +7778,7 @@ output_wbrk_tables (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Line breaking properties of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
    fprintf (stream, "\n");
  
@@ -6743,6 +7812,316 @@ output_wbrk_tables (const char *filename, const char *version)
  
  /* ========================================================================= */
  
+/* Grapheme break property.
+   Updated for Unicode TR #29 revision 17.  */
+
+/* Possible values of the Grapheme_Cluster_Break property.  */
+enum
+{
+  GBP_OTHER        = 0, /* value of code points the data file does not list */
+  GBP_CR           = 1,
+  GBP_LF           = 2,
+  GBP_CONTROL      = 3,
+  GBP_EXTEND       = 4,
+  GBP_PREPEND      = 5,
+  GBP_SPACINGMARK  = 6,
+  GBP_L            = 7,
+  GBP_V            = 8,
+  GBP_T            = 9,
+  GBP_LV           = 10,
+  GBP_LVT          = 11 /* largest value: all fit in 4 bits, which output_gbp_table relies on to pack two per byte */
+};
+
+/* Construction of sparse 3-level tables.  */
+#define TABLE gbp_table /* output_gbp_table uses struct gbp_table and gbp_table_init, gbp_table_add, gbp_table_finalize */
+#define ELEMENT unsigned char
+#define DEFAULT GBP_OTHER
+#define xmalloc malloc /* NOTE(review): presumably 3level.h allocates through xmalloc/xrealloc - confirm; here they are plain libc calls */
+#define xrealloc realloc
+#include "3level.h"
+
+/* The grapheme break property from the GraphemeBreakProperty.txt file.  */
+int unicode_org_gbp[0x110000]; /* indexed by code point; filled by fill_org_gbp */
+
+/* Output to FILENAME the unit test data for the grapheme break property, one entry per run of equal values.  */
+static void
+output_gbp_test (const char *filename)
+{
+  FILE *stream;
+  bool need_comma; /* true once at least one entry has been written */
+  unsigned int ch;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
+  fprintf (stream, "   Copyright (C) 2010 Free Software Foundation, Inc.\n");
+  fprintf (stream, "\n");
+  fprintf (stream, "   This program is free software: you can redistribute it and/or modify\n");
+  fprintf (stream, "   it under the terms of the GNU General Public License as published by\n");
+  fprintf (stream, "   the Free Software Foundation; either version 3 of the License, or\n");
+  fprintf (stream, "   (at your option) any later version.\n");
+  fprintf (stream, "\n");
+  fprintf (stream, "   This program is distributed in the hope that it will be useful,\n");
+  fprintf (stream, "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+  fprintf (stream, "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
+  fprintf (stream, "   GNU General Public License for more details.\n");
+  fprintf (stream, "\n");
+  fprintf (stream, "   You should have received a copy of the GNU General Public License\n");
+  fprintf (stream, "   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */\n");
+  fprintf (stream, "\n");
+
+  need_comma = false;
+  for (ch = 0; ch < 0x110000; ch++) /* each iteration handles one run of equal values */
+    {
+      int gbp = unicode_org_gbp[ch];
+      const char *gbp_string;
+
+      while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp) /* advance ch to the last code point of the run */
+        ch++;
+
+      switch (gbp) /* gbp_string becomes the spelling of the matching enumerator */
+        {
+#define CASE(x) case x: gbp_string = #x; break;
+      CASE (GBP_OTHER)
+      CASE (GBP_CR)
+      CASE (GBP_LF)
+      CASE (GBP_CONTROL)
+      CASE (GBP_EXTEND)
+      CASE (GBP_PREPEND)
+      CASE (GBP_SPACINGMARK)
+      CASE (GBP_L)
+      CASE (GBP_V)
+      CASE (GBP_T)
+      CASE (GBP_LV)
+      CASE (GBP_LVT)
+#undef CASE
+        default:
+          abort (); /* value is none of the GBP_* enumerators */
+        }
+
+      if (need_comma)
+        fprintf (stream, ",\n");
+      fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string); /* ch + 1 is the first code point after the run */
+
+      need_comma = true;
+    }
+  fprintf (stream, "\n");
+
+  if (ferror (stream) || fclose (stream)) /* a failing fclose is treated as a write error too */
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Output to FILENAME the per-character grapheme break property table; VERSION is quoted in its header comment.  */
+static void
+output_gbp_table (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int ch, i;
+  struct gbp_table t;
+  unsigned int level1_offset, level2_offset, level3_offset;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Grapheme break property of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+           version);
+
+  t.p = 7; /* shift applied to level3_size below; presumably log2 of a level-3 block length - confirm in 3level.h */
+  t.q = 9; /* shift applied to level2_size below; presumably log2 of a level-2 block length - confirm in 3level.h */
+  gbp_table_init (&t);
+
+  for (ch = 0; ch < 0x110000; ch++)
+    gbp_table_add (&t, ch, unicode_org_gbp[ch]);
+
+  gbp_table_finalize (&t);
+
+  /* Offsets in t.result, in memory of this process: 5 header words, then level 1, level 2, level 3.  */
+  level1_offset =
+    5 * sizeof (uint32_t);
+  level2_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t);
+  level3_offset =
+    5 * sizeof (uint32_t)
+    + t.level1_size * sizeof (uint32_t)
+    + (t.level2_size << t.q) * sizeof (uint32_t);
+
+  for (i = 0; i < 5; i++) /* the 5 header words become macros gbrkprop_header_0 .. gbrkprop_header_4 */
+    fprintf (stream, "#define gbrkprop_header_%d %d\n", i, /* NOTE(review): %d is used with unsigned arguments here */
+             ((uint32_t *) t.result)[i]);
+  fprintf (stream, "static const\n");
+  fprintf (stream, "struct\n");
+  fprintf (stream, "  {\n");
+  fprintf (stream, "    int level1[%zu];\n", t.level1_size);
+  fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
+  fprintf (stream, "    unsigned char level3[(%zu << %d) / 2];\n", /* half size: two elements are packed per byte */
+           t.level3_size, t.p);
+  fprintf (stream, "  }\n");
+  fprintf (stream, "unigbrkprop =\n");
+  fprintf (stream, "{\n");
+  fprintf (stream, "  {");
+  if (t.level1_size > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < t.level1_size; i++) /* level 1, eight values per output line */
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      offset = ((uint32_t *) (t.result + level1_offset))[i];
+      if (offset == 0) /* a zero offset is written as -1 */
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level2_offset) / sizeof (uint32_t)); /* offset in t.result -> index into level2[] */
+      if (i+1 < t.level1_size)
+        fprintf (stream, ",");
+    }
+  if (t.level1_size > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+  fprintf (stream, "  {");
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < t.level2_size << t.q; i++) /* level 2, eight values per output line */
+    {
+      uint32_t offset;
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      offset = ((uint32_t *) (t.result + level2_offset))[i];
+      if (offset == 0) /* a zero offset is written as -1 */
+        fprintf (stream, " %5d", -1);
+      else
+        fprintf (stream, " %5zu",
+                 (offset - level3_offset) / sizeof (uint8_t) / 2); /* halved: index into the packed level3[] */
+      if (i+1 < t.level2_size << t.q)
+        fprintf (stream, ",");
+    }
+  if (t.level2_size << t.q > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " },\n");
+  fprintf (stream, "  {");
+  if (t.level3_size << t.p > 8)
+    fprintf (stream, "\n   ");
+  for (i = 0; i < (t.level3_size << t.p) / 2; i++) /* level 3: one output byte per pair of elements */
+    {
+      unsigned char *p = (unsigned char *) (t.result + level3_offset);
+      unsigned char value0 = p[i * 2]; /* even-indexed element, goes to the low 4 bits */
+      unsigned char value1 = p[i * 2 + 1]; /* odd-indexed element, goes to the high 4 bits */
+      if (i > 0 && (i % 8) == 0)
+        fprintf (stream, "\n   ");
+      fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
+               (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
+    }
+  if (t.level3_size << t.p > 8)
+    fprintf (stream, "\n ");
+  fprintf (stream, " }\n");
+  fprintf (stream, "};\n");
+
+  if (ferror (stream) || fclose (stream)) /* a failing fclose is treated as a write error too */
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* Stores in unicode_org_gbp[] the grapheme breaking property from the
+   GraphemeBreakProperty.txt file; code points it does not list stay GBP_OTHER.  */
+static void
+fill_org_gbp (const char *graphemebreakproperty_filename)
+{
+  unsigned int i;
+  FILE *stream;
+  int lineno = 0; /* only used in the "unknown property value" diagnostic */
+
+  for (i = 0; i < 0x110000; i++)
+    unicode_org_gbp[i] = GBP_OTHER;
+
+  stream = fopen (graphemebreakproperty_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n",
+               graphemebreakproperty_filename);
+      exit (1);
+    }
+
+  for (;;)
+    {
+      char buf[200+1]; /* one input line, at most 200 characters per read */
+      unsigned int i1, i2; /* first and last code point of the range on this line */
+      char padding[200+1];
+      char propname[200+1];
+      int propvalue;
+
+      lineno++;
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1) /* NOTE(review): the "\n" directive skips any whitespace run, blank lines included, so lineno can undercount */
+        break; /* end of input; NOTE(review): also taken if the file starts with an empty line, since %[ needs at least one character */
+
+      if (buf[0] == '\0' || buf[0] == '#') /* skip lines starting with '#' */
+        continue;
+
+      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) /* range form: XXXX..YYYY ; Name */
+        {
+          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) /* single form: XXXX ; Name */
+            {
+              fprintf (stderr, "parse error in '%s'\n",
+                       graphemebreakproperty_filename);
+              exit (1);
+            }
+          i2 = i1;
+        }
+#define PROP(name,value) \
+      if (strcmp (propname, name) == 0) propvalue = value; else
+      PROP ("CR", GBP_CR)
+      PROP ("LF", GBP_LF)
+      PROP ("Control", GBP_CONTROL)
+      PROP ("Extend", GBP_EXTEND)
+      PROP ("Prepend", GBP_PREPEND)
+      PROP ("SpacingMark", GBP_SPACINGMARK)
+      PROP ("L", GBP_L)
+      PROP ("V", GBP_V)
+      PROP ("T", GBP_T)
+      PROP ("LV", GBP_LV)
+      PROP ("LVT", GBP_LVT)
+#undef PROP
+        { /* final "else" of the PROP chain: no known name matched */
+          fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
+                   graphemebreakproperty_filename, lineno);
+          exit (1);
+        }
+      if (!(i1 <= i2 && i2 < 0x110000)) /* range must be ordered and inside unicode_org_gbp[] */
+        abort ();
+
+      for (i = i1; i <= i2; i++)
+        unicode_org_gbp[i] = propvalue;
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
+/* Composition and decomposition.
+   Updated for Unicode TR #15 revision 33.  */
+
  /* Maximum number of characters into which a single Unicode character can be
     decomposed.  */
  #define MAX_DECOMP_LENGTH 18
@@ -7149,7 +8528,7 @@ output_composition_tables (const char *filename, const char *version)
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Canonical composition of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
    fprintf (stream, "\n");
  
@@ -7191,7 +8570,7 @@ output_composition_tables (const char *filename, const char *version)
       1527, which is quite good (60% filled).  It requires an auxiliary table
       lookup in a table of size 0.5 KB.  The total tables size is 11 KB.  */
  
-  fprintf (stream, "struct composition_rule { char codes[4]; };\n");
+  fprintf (stream, "struct composition_rule { char codes[6]; };\n");
    fprintf (stream, "%%struct-type\n");
    fprintf (stream, "%%language=ANSI-C\n");
    fprintf (stream, "%%define slot-name codes\n");
@@ -7229,16 +8608,9 @@ output_composition_tables (const char *filename, const char *version)
                if (strcmp (unicode_attributes[combined].combining, "0") != 0)
                  abort ();
  
-              if (!(code1 < 0x10000))
-                abort ();
-              if (!(code2 < 0x10000))
-                abort ();
-              if (!(combined < 0x10000))
-                abort ();
-
-              fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
-                       (code1 >> 8) & 0xff, code1 & 0xff,
-                       (code2 >> 8) & 0xff, code2 & 0xff,
+              fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
+                       (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
+                       (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
                         combined);
              }
          }
@@ -7289,7 +8661,7 @@ output_simple_mapping_test (const char *filename,
    fprintf (stream, "   You should have received a copy of the GNU General Public License\n");
    fprintf (stream, "   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */\n");
    fprintf (stream, "\n");
-  fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
    fprintf (stream, "\n");
    fprintf (stream, "#include \"test-mapping-part1.h\"\n");
@@ -7351,7 +8723,7 @@ output_simple_mapping (const char *filename,
  
    fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
    fprintf (stream, "/* Simple character mapping of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
             version);
  
    t.p = 7;
@@ -8273,20 +9645,22 @@ main (int argc, char * argv[])
    const char *unicodedata_filename;
    const char *proplist_filename;
    const char *derivedproplist_filename;
+  const char *arabicshaping_filename;
    const char *scripts_filename;
    const char *blocks_filename;
    const char *proplist30_filename;
    const char *eastasianwidth_filename;
    const char *linebreak_filename;
    const char *wordbreakproperty_filename;
+  const char *graphemebreakproperty_filename;
    const char *compositionexclusions_filename;
    const char *specialcasing_filename;
    const char *casefolding_filename;
    const char *version;
  
-  if (argc != 14)
+  if (argc != 16)
      {
-      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
+      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
                 argv[0]);
        exit (1);
      }
@@ -8294,27 +9668,31 @@ main (int argc, char * argv[])
    unicodedata_filename = argv[1];
    proplist_filename = argv[2];
    derivedproplist_filename = argv[3];
-  scripts_filename = argv[4];
-  blocks_filename = argv[5];
-  proplist30_filename = argv[6];
-  eastasianwidth_filename = argv[7];
-  linebreak_filename = argv[8];
-  wordbreakproperty_filename = argv[9];
-  compositionexclusions_filename = argv[10];
-  specialcasing_filename = argv[11];
-  casefolding_filename = argv[12];
-  version = argv[13];
+  arabicshaping_filename = argv[4];
+  scripts_filename = argv[5];
+  blocks_filename = argv[6];
+  proplist30_filename = argv[7];
+  eastasianwidth_filename = argv[8];
+  linebreak_filename = argv[9];
+  wordbreakproperty_filename = argv[10];
+  graphemebreakproperty_filename = argv[11];
+  compositionexclusions_filename = argv[12];
+  specialcasing_filename = argv[13];
+  casefolding_filename = argv[14];
+  version = argv[15];
  
    fill_attributes (unicodedata_filename);
    clear_properties ();
    fill_properties (proplist_filename);
    fill_properties (derivedproplist_filename);
    fill_properties30 (proplist30_filename);
+  fill_arabicshaping (arabicshaping_filename);
    fill_scripts (scripts_filename);
    fill_blocks (blocks_filename);
    fill_width (eastasianwidth_filename);
    fill_org_lbp (linebreak_filename);
    fill_org_wbp (wordbreakproperty_filename);
+  fill_org_gbp (graphemebreakproperty_filename);
    fill_composition_exclusions (compositionexclusions_filename);
    fill_casing_rules (specialcasing_filename);
    fill_casefolding_rules (casefolding_filename);
@@ -8323,7 +9701,7 @@ main (int argc, char * argv[])
  
    output_categories (version);
    output_category ("unictype/categ_of.h", version);
-  output_combclass ("unictype/combining.h", version);
+  output_combclass ("unictype/combiningclass.h", version);
    output_bidi_category ("unictype/bidi_of.h", version);
    output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
    output_decimal_digit ("unictype/decdigit.h", version);
@@ -8333,10 +9711,17 @@ main (int argc, char * argv[])
    output_numeric ("unictype/numeric.h", version);
    output_mirror ("unictype/mirror.h", version);
    output_properties (version);
+  output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
+  output_joining_type ("unictype/joiningtype_of.h", version);
+  output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
+  output_joining_group ("unictype/joininggroup_of.h", version);
+
    output_scripts (version);
    output_scripts_byname (version);
    output_blocks (version);
    output_ident_properties (version);
+  output_nonspacing_property ("uniwidth/width.c.part");
+  output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
    output_old_ctype (version);
  
    debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
@@ -8347,6 +9732,9 @@ main (int argc, char * argv[])
    debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
    output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
  
+  output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
+  output_gbp_table ("unigbrk/gbrkprop.h", version);
+
    output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
    debug_output_composition_tables ("uninorm/composition.txt");
    output_composition_tables ("uninorm/composition-table.gperf", version);
@@ -8370,19 +9758,23 @@ main (int argc, char * argv[])
   * compile-command: "
     gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
     ./gen-uni-tables \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
          /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
-        5.1.0
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
+        6.0.0 \
+   && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
+   && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
     "
   * End:
   */