lib/unilbrk/gen-lbrk.c

   1 /* Generate a Unicode conforming Line Break Properties tables from a
   2    UnicodeData file.
   3    Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 /* Usage example:
  20      $ gen-lbrk /usr/local/share/Unidata/UnicodeData.txt \
  21                 /usr/local/share/Unidata/EastAsianWidth.txt \
  22                 /usr/local/share/Unidata/LineBreak.txt \
  23                 3.1.0
  24  */
  25
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdbool.h>
  29 #include <stdint.h>
  30 #include <string.h>
  31 #include <time.h>
  32
  33 /* This structure represents one line in the UnicodeData.txt file.  */
  34 struct unicode_attribute
  35 {
  36   const char *name;           /* Character name */
  37   const char *category;       /* General category */
  38   const char *combining;      /* Canonical combining classes */
  39   const char *bidi;           /* Bidirectional category */
  40   const char *decomposition;  /* Character decomposition mapping */
  41   const char *decdigit;       /* Decimal digit value */
  42   const char *digit;          /* Digit value */
  43   const char *numeric;        /* Numeric value */
  44   int mirrored;               /* mirrored */
  45   const char *oldname;        /* Old Unicode 1.0 name */
  46   const char *comment;        /* Comment */
  47   unsigned int upper;         /* Uppercase mapping */
  48   unsigned int lower;         /* Lowercase mapping */
  49   unsigned int title;         /* Titlecase mapping */
  50 };
  51
  52 /* Missing fields are represented with "" for strings, and NONE for
  53    characters.  */
  54 #define NONE (~(unsigned int)0)
  55
  56 /* The entire contents of the UnicodeData.txt file.  */
  57 struct unicode_attribute unicode_attributes [0x110000];
  58
  59 /* Stores in unicode_attributes[i] the values from the given fields.  */
  60 static void
  61 fill_attribute (unsigned int i,
  62                 const char *field1, const char *field2,
  63                 const char *field3, const char *field4,
  64                 const char *field5, const char *field6,
  65                 const char *field7, const char *field8,
  66                 const char *field9, const char *field10,
  67                 const char *field11, const char *field12,
  68                 const char *field13, const char *field14)
  69 {
  70   struct unicode_attribute * uni;
  71
  72   if (i >= 0x110000)
  73     {
  74       fprintf (stderr, "index too large\n");
  75       exit (1);
  76     }
  77   uni = &unicode_attributes[i];
  78   /* Copy the strings.  */
  79   uni->name          = strdup (field1);
  80   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
  81   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
  82   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
  83   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  84   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
  85   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
  86   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
  87   uni->mirrored      = (field9[0] == 'Y');
  88   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
  89   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
  90   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  91   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  92   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
  93 }
  94
  95 /* Maximum length of a field in the UnicodeData.txt file.  */
  96 #define FIELDLEN 120
  97
  98 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  99    Reads up to (but excluding) DELIM.
 100    Returns 1 when a field was successfully read, otherwise 0.  */
 101 static int
 102 getfield (FILE *stream, char *buffer, int delim)
 103 {
 104   int count = 0;
 105   int c;
 106
 107   for (; (c = getc (stream)), (c != EOF && c != delim); )
 108     {
 109       /* The original unicode.org UnicodeData.txt file happens to have
 110          CR/LF line terminators.  Silently convert to LF.  */
 111       if (c == '\r')
 112         continue;
 113
 114       /* Put c into the buffer.  */
 115       if (++count >= FIELDLEN - 1)
 116         {
 117           fprintf (stderr, "field too long\n");
 118           exit (1);
 119         }
 120       *buffer++ = c;
 121     }
 122
 123   if (c == EOF)
 124     return 0;
 125
 126   *buffer = '\0';
 127   return 1;
 128 }
 129
 130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 131    file.  */
 132 static void
 133 fill_attributes (const char *unicodedata_filename)
 134 {
 135   unsigned int i, j;
 136   FILE *stream;
 137   char field0[FIELDLEN];
 138   char field1[FIELDLEN];
 139   char field2[FIELDLEN];
 140   char field3[FIELDLEN];
 141   char field4[FIELDLEN];
 142   char field5[FIELDLEN];
 143   char field6[FIELDLEN];
 144   char field7[FIELDLEN];
 145   char field8[FIELDLEN];
 146   char field9[FIELDLEN];
 147   char field10[FIELDLEN];
 148   char field11[FIELDLEN];
 149   char field12[FIELDLEN];
 150   char field13[FIELDLEN];
 151   char field14[FIELDLEN];
 152   int lineno = 0;
 153
 154   for (i = 0; i < 0x110000; i++)
 155     unicode_attributes[i].name = NULL;
 156
 157   stream = fopen (unicodedata_filename, "r");
 158   if (stream == NULL)
 159     {
 160       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 161       exit (1);
 162     }
 163
 164   for (;;)
 165     {
 166       int n;
 167
 168       lineno++;
 169       n = getfield (stream, field0, ';');
 170       n += getfield (stream, field1, ';');
 171       n += getfield (stream, field2, ';');
 172       n += getfield (stream, field3, ';');
 173       n += getfield (stream, field4, ';');
 174       n += getfield (stream, field5, ';');
 175       n += getfield (stream, field6, ';');
 176       n += getfield (stream, field7, ';');
 177       n += getfield (stream, field8, ';');
 178       n += getfield (stream, field9, ';');
 179       n += getfield (stream, field10, ';');
 180       n += getfield (stream, field11, ';');
 181       n += getfield (stream, field12, ';');
 182       n += getfield (stream, field13, ';');
 183       n += getfield (stream, field14, '\n');
 184       if (n == 0)
 185         break;
 186       if (n != 15)
 187         {
 188           fprintf (stderr, "short line in'%s':%d\n",
 189                    unicodedata_filename, lineno);
 190           exit (1);
 191         }
 192       i = strtoul (field0, NULL, 16);
 193       if (field1[0] == '<'
 194           && strlen (field1) >= 9
 195           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
 196         {
 197           /* Deal with a range. */
 198           lineno++;
 199           n = getfield (stream, field0, ';');
 200           n += getfield (stream, field1, ';');
 201           n += getfield (stream, field2, ';');
 202           n += getfield (stream, field3, ';');
 203           n += getfield (stream, field4, ';');
 204           n += getfield (stream, field5, ';');
 205           n += getfield (stream, field6, ';');
 206           n += getfield (stream, field7, ';');
 207           n += getfield (stream, field8, ';');
 208           n += getfield (stream, field9, ';');
 209           n += getfield (stream, field10, ';');
 210           n += getfield (stream, field11, ';');
 211           n += getfield (stream, field12, ';');
 212           n += getfield (stream, field13, ';');
 213           n += getfield (stream, field14, '\n');
 214           if (n != 15)
 215             {
 216               fprintf (stderr, "missing end range in '%s':%d\n",
 217                        unicodedata_filename, lineno);
 218               exit (1);
 219             }
 220           if (!(field1[0] == '<'
 221                 && strlen (field1) >= 8
 222                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
 223             {
 224               fprintf (stderr, "missing end range in '%s':%d\n",
 225                        unicodedata_filename, lineno);
 226               exit (1);
 227             }
 228           field1[strlen (field1) - 7] = '\0';
 229           j = strtoul (field0, NULL, 16);
 230           for (; i <= j; i++)
 231             fill_attribute (i, field1+1, field2, field3, field4, field5,
 232                                field6, field7, field8, field9, field10,
 233                                field11, field12, field13, field14);
 234         }
 235       else
 236         {
 237           /* Single character line */
 238           fill_attribute (i, field1, field2, field3, field4, field5,
 239                              field6, field7, field8, field9, field10,
 240                              field11, field12, field13, field14);
 241         }
 242     }
 243   if (ferror (stream) || fclose (stream))
 244     {
 245       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 246       exit (1);
 247     }
 248 }
 249
 250 /* The width property from the EastAsianWidth.txt file.
 251    Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
 252 const char * unicode_width[0x110000];
 253
 254 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
 255    file.  */
 256 static void
 257 fill_width (const char *width_filename)
 258 {
 259   unsigned int i, j;
 260   FILE *stream;
 261   char field0[FIELDLEN];
 262   char field1[FIELDLEN];
 263   char field2[FIELDLEN];
 264   int lineno = 0;
 265
 266   for (i = 0; i < 0x110000; i++)
 267     unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
 268
 269   stream = fopen (width_filename, "r");
 270   if (stream == NULL)
 271     {
 272       fprintf (stderr, "error during fopen of '%s'\n", width_filename);
 273       exit (1);
 274     }
 275
 276   for (;;)
 277     {
 278       int n;
 279       int c;
 280
 281       lineno++;
 282       c = getc (stream);
 283       if (c == EOF)
 284         break;
 285       if (c == '#')
 286         {
 287           do c = getc (stream); while (c != EOF && c != '\n');
 288           continue;
 289         }
 290       ungetc (c, stream);
 291       n = getfield (stream, field0, ';');
 292       n += getfield (stream, field1, ' ');
 293       n += getfield (stream, field2, '\n');
 294       if (n == 0)
 295         break;
 296       if (n != 3)
 297         {
 298           fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
 299           exit (1);
 300         }
 301       i = strtoul (field0, NULL, 16);
 302       if (strstr (field0, "..") != NULL)
 303         {
 304           /* Deal with a range.  */
 305           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
 306           for (; i <= j; i++)
 307             unicode_width[i] = strdup (field1);
 308         }
 309       else
 310         {
 311           /* Single character line.  */
 312           unicode_width[i] = strdup (field1);
 313         }
 314     }
 315   if (ferror (stream) || fclose (stream))
 316     {
 317       fprintf (stderr, "error reading from '%s'\n", width_filename);
 318       exit (1);
 319     }
 320 }
 321
 322 /* Line breaking classification.  */
 323
 324 enum
 325 {
 326   /* Values >= 20 are resolved at run time. */
 327   LBP_BK =  0, /* mandatory break */
 328 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 329 /*LBP_LF,         line feed - not used here because it's a DOSism */
 330   LBP_CM = 20, /* attached characters and combining marks */
 331 /*LBP_SG,         surrogates - not used here because they are not characters */
 332   LBP_ZW =  1, /* zero width space */
 333   LBP_IN =  2, /* inseparable */
 334   LBP_GL =  3, /* non-breaking (glue) */
 335   LBP_CB = 22, /* contingent break opportunity */
 336   LBP_SP = 21, /* space */
 337   LBP_BA =  4, /* break opportunity after */
 338   LBP_BB =  5, /* break opportunity before */
 339   LBP_B2 =  6, /* break opportunity before and after */
 340   LBP_HY =  7, /* hyphen */
 341   LBP_NS =  8, /* non starter */
 342   LBP_OP =  9, /* opening punctuation */
 343   LBP_CL = 10, /* closing punctuation */
 344   LBP_QU = 11, /* ambiguous quotation */
 345   LBP_EX = 12, /* exclamation/interrogation */
 346   LBP_ID = 13, /* ideographic */
 347   LBP_NU = 14, /* numeric */
 348   LBP_IS = 15, /* infix separator (numeric) */
 349   LBP_SY = 16, /* symbols allowing breaks */
 350   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
 351   LBP_PR = 18, /* prefix (numeric) */
 352   LBP_PO = 19, /* postfix (numeric) */
 353   LBP_SA = 23, /* complex context (South East Asian) */
 354   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
 355   LBP_XX = 25  /* unknown */
 356 };
 357
 358 /* Returns the line breaking classification for ch, as a bit mask.  */
 359 static int
 360 get_lbp (unsigned int ch)
 361 {
 362   int attr = 0;
 363
 364   if (unicode_attributes[ch].name != NULL)
 365     {
 366       /* mandatory break */
 367       if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
 368           || ch == 0x000C /* form feed */
 369           || ch == 0x2028 /* LINE SEPARATOR */
 370           || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
 371         attr |= 1 << LBP_BK;
 372
 373       /* zero width space */
 374       if (ch == 0x200B /* ZERO WIDTH SPACE */)
 375         attr |= 1 << LBP_ZW;
 376
 377       /* inseparable */
 378       if (ch == 0x2024 /* ONE DOT LEADER */
 379           || ch == 0x2025 /* TWO DOT LEADER */
 380           || ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
 381         attr |= 1 << LBP_IN;
 382
 383       /* non-breaking (glue) */
 384       if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
 385           || ch == 0x00A0 /* NO-BREAK SPACE */
 386           || ch == 0x202F /* NARROW NO-BREAK SPACE */
 387           || ch == 0x2007 /* FIGURE SPACE */
 388           || ch == 0x2011 /* NON-BREAKING HYPHEN */
 389           || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
 390         attr |= 1 << LBP_GL;
 391
 392       /* contingent break opportunity */
 393       if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
 394         attr |= 1 << LBP_CB;
 395
 396       /* space */
 397       if (ch == 0x0020 /* SPACE */)
 398         attr |= 1 << LBP_SP;
 399
 400       /* break opportunity after */
 401       if (ch == 0x2000 /* EN QUAD */
 402           || ch == 0x2001 /* EM QUAD */
 403           || ch == 0x2002 /* EN SPACE */
 404           || ch == 0x2003 /* EM SPACE */
 405           || ch == 0x2004 /* THREE-PER-EM SPACE */
 406           || ch == 0x2005 /* FOUR-PER-EM SPACE */
 407           || ch == 0x2006 /* SIX-PER-EM SPACE */
 408           || ch == 0x2008 /* PUNCTUATION SPACE */
 409           || ch == 0x2009 /* THIN SPACE */
 410           || ch == 0x200A /* HAIR SPACE */
 411           || ch == 0x0009 /* tab */
 412           || ch == 0x058A /* ARMENIAN HYPHEN */
 413           || ch == 0x2010 /* HYPHEN */
 414           || ch == 0x2012 /* FIGURE DASH */
 415           || ch == 0x2013 /* EN DASH */
 416           || ch == 0x00AD /* SOFT HYPHEN */
 417           || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
 418           || ch == 0x1361 /* ETHIOPIC WORDSPACE */
 419           || ch == 0x1680 /* OGHAM SPACE MARK */
 420           || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
 421           || ch == 0x2027 /* HYPHENATION POINT */
 422           || ch == 0x007C /* VERTICAL LINE */)
 423         attr |= 1 << LBP_BA;
 424
 425       /* break opportunity before */
 426       if (ch == 0x00B4 /* ACUTE ACCENT */
 427           || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
 428           || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
 429           || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
 430         attr |= 1 << LBP_BB;
 431
 432       /* break opportunity before and after */
 433       if (ch == 0x2014 /* EM DASH */)
 434         attr |= 1 << LBP_B2;
 435
 436       /* hyphen */
 437       if (ch == 0x002D /* HYPHEN-MINUS */)
 438         attr |= 1 << LBP_HY;
 439
 440       /* exclamation/interrogation */
 441       if (ch == 0x0021 /* EXCLAMATION MARK */
 442           || ch == 0x003F /* QUESTION MARK */
 443           || ch == 0xFE56 /* SMALL QUESTION MARK */
 444           || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
 445           || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
 446           || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
 447         attr |= 1 << LBP_EX;
 448
 449       /* opening punctuation */
 450       if (unicode_attributes[ch].category[0] == 'P'
 451           && unicode_attributes[ch].category[1] == 's')
 452         attr |= 1 << LBP_OP;
 453
 454       /* closing punctuation */
 455       if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
 456           || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
 457           || ch == 0xFE50 /* SMALL COMMA */
 458           || ch == 0xFE52 /* SMALL FULL STOP */
 459           || ch == 0xFF0C /* FULLWIDTH COMMA */
 460           || ch == 0xFF0E /* FULLWIDTH FULL STOP */
 461           || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
 462           || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
 463           || (unicode_attributes[ch].category[0] == 'P'
 464               && unicode_attributes[ch].category[1] == 'e'))
 465         attr |= 1 << LBP_CL;
 466
 467       /* ambiguous quotation */
 468       if (ch == 0x0022 /* QUOTATION MARK */
 469           || ch == 0x0027 /* APOSTROPHE */
 470           || (unicode_attributes[ch].category[0] == 'P'
 471               && (unicode_attributes[ch].category[1] == 'f'
 472                   || unicode_attributes[ch].category[1] == 'i')))
 473         attr |= 1 << LBP_QU;
 474
 475       /* attached characters and combining marks */
 476       if ((unicode_attributes[ch].category[0] == 'M'
 477            && (unicode_attributes[ch].category[1] == 'n'
 478                || unicode_attributes[ch].category[1] == 'c'
 479                || unicode_attributes[ch].category[1] == 'e'))
 480           || (ch >= 0x1160 && ch <= 0x11F9)
 481           || (unicode_attributes[ch].category[0] == 'C'
 482               && (unicode_attributes[ch].category[1] == 'c'
 483                   || unicode_attributes[ch].category[1] == 'f')))
 484         if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
 485           attr |= 1 << LBP_CM;
 486
 487       /* non starter */
 488       if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
 489           || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
 490           || ch == 0x17D4 /* KHMER SIGN KHAN */
 491           || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
 492           || ch == 0x17D7 /* KHMER SIGN LEK TOO */
 493           || ch == 0x17D8 /* KHMER SIGN BEYYAL */
 494           || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
 495           || ch == 0x17DA /* KHMER SIGN KOOMUUT */
 496           || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
 497           || ch == 0x2044 /* FRACTION SLASH */
 498           || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
 499           || ch == 0x301C /* WAVE DASH */
 500           || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
 501           || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
 502           || ch == 0x309D /* HIRAGANA ITERATION MARK */
 503           || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
 504           || ch == 0x30FB /* KATAKANA MIDDLE DOT */
 505           || ch == 0x30FD /* KATAKANA ITERATION MARK */
 506           || ch == 0xFE54 /* SMALL SEMICOLON */
 507           || ch == 0xFE55 /* SMALL COLON */
 508           || ch == 0xFF1A /* FULLWIDTH COLON */
 509           || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
 510           || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
 511           || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
 512           || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
 513           || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
 514           || (unicode_attributes[ch].category[0] == 'L'
 515               && unicode_attributes[ch].category[1] == 'm'
 516               && (unicode_width[ch][0] == 'W'
 517                   || unicode_width[ch][0] == 'H'))
 518           || (unicode_attributes[ch].category[0] == 'S'
 519               && unicode_attributes[ch].category[1] == 'k'
 520               && unicode_width[ch][0] == 'W')
 521           || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
 522           || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
 523         attr |= 1 << LBP_NS;
 524
 525       /* numeric */
 526       if (unicode_attributes[ch].category[0] == 'N'
 527           && unicode_attributes[ch].category[1] == 'd'
 528           && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
 529         attr |= 1 << LBP_NU;
 530
 531       /* infix separator (numeric) */
 532       if (ch == 0x002C /* COMMA */
 533           || ch == 0x002E /* FULL STOP */
 534           || ch == 0x003A /* COLON */
 535           || ch == 0x003B /* SEMICOLON */
 536           || ch == 0x0589 /* ARMENIAN FULL STOP */)
 537         attr |= 1 << LBP_IS;
 538
 539       /* symbols allowing breaks */
 540       if (ch == 0x002F /* SOLIDUS */)
 541         attr |= 1 << LBP_SY;
 542
 543       /* postfix (numeric) */
 544       if (ch == 0x0025 /* PERCENT SIGN */
 545           || ch == 0x00A2 /* CENT SIGN */
 546           || ch == 0x00B0 /* DEGREE SIGN */
 547           || ch == 0x2030 /* PER MILLE SIGN */
 548           || ch == 0x2031 /* PER TEN THOUSAND SIGN */
 549           || ch == 0x2032 /* PRIME */
 550           || ch == 0x2033 /* DOUBLE PRIME */
 551           || ch == 0x2034 /* TRIPLE PRIME */
 552           || ch == 0x2035 /* REVERSED PRIME */
 553           || ch == 0x2036 /* REVERSED DOUBLE PRIME */
 554           || ch == 0x2037 /* REVERSED TRIPLE PRIME */
 555           || ch == 0x20A7 /* PESETA SIGN */
 556           || ch == 0x2103 /* DEGREE CELSIUS */
 557           || ch == 0x2109 /* DEGREE FAHRENHEIT */
 558           || ch == 0x2126 /* OHM SIGN */
 559           || ch == 0xFE6A /* SMALL PERCENT SIGN */
 560           || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
 561           || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
 562         attr |= 1 << LBP_PO;
 563
 564       /* prefix (numeric) */
 565       if (ch == 0x002B /* PLUS SIGN */
 566           || ch == 0x005C /* REVERSE SOLIDUS */
 567           || ch == 0x00B1 /* PLUS-MINUS SIGN */
 568           || ch == 0x2116 /* NUMERO SIGN */
 569           || ch == 0x2212 /* MINUS SIGN */
 570           || ch == 0x2213 /* MINUS-OR-PLUS SIGN */
 571           || (unicode_attributes[ch].category[0] == 'S'
 572               && unicode_attributes[ch].category[1] == 'c'))
 573         if (!(attr & (1 << LBP_PO)))
 574           attr |= 1 << LBP_PR;
 575
 576       /* complex context (South East Asian) */
 577       if (((ch >= 0x0E00 && ch <= 0x0EFF)
 578            || (ch >= 0x1000 && ch <= 0x109F)
 579            || (ch >= 0x1780 && ch <= 0x17FF))
 580           && unicode_attributes[ch].category[0] == 'L'
 581           && (unicode_attributes[ch].category[1] == 'm'
 582               || unicode_attributes[ch].category[1] == 'o'))
 583         if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
 584           attr |= 1 << LBP_SA;
 585
 586       /* ideographic */
 587       if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
 588           || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
 589           || ch == 0x3000 /* IDEOGRAPHIC SPACE */
 590           || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
 591           || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
 592           || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
 593           || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
 594           || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
 595           || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
 596           || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */
 597           || ch == 0xFE62 /* SMALL PLUS SIGN */
 598           || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
 599           || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
 600           || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
 601           || ch == 0xFE66 /* SMALL EQUALS SIGN */
 602           || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
 603           || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
 604           || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
 605           || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
 606           || (ch >= 0x3000 && ch <= 0x33FF
 607               && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
 608           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 609           || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
 610           || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
 611           || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
 612           || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
 613           || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
 614           || ch == 0xFE49 /* DASHED OVERLINE */
 615           || ch == 0xFE4A /* CENTRELINE OVERLINE */
 616           || ch == 0xFE4B /* WAVY OVERLINE */
 617           || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
 618           || ch == 0xFE4D /* DASHED LOW LINE */
 619           || ch == 0xFE4E /* CENTRELINE LOW LINE */
 620           || ch == 0xFE4F /* WAVY LOW LINE */
 621           || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
 622           || ch == 0xFE58 /* SMALL EM DASH */
 623           || ch == 0xFE5F /* SMALL NUMBER SIGN */
 624           || ch == 0xFE60 /* SMALL AMPERSAND */
 625           || ch == 0xFE61 /* SMALL ASTERISK */
 626           || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
 627           || ch == 0xFE6B /* SMALL COMMERCIAL AT */
 628           || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
 629           || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
 630           || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
 631           || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
 632           || ch == 0xFF0A /* FULLWIDTH ASTERISK */
 633           || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
 634           || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
 635           || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
 636           || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
 637           || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
 638           || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
 639           || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
 640           || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
 641           || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
 642           || ch == 0xFF3F /* FULLWIDTH LOW LINE */
 643           || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
 644           || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
 645           || ch == 0xFF5E /* FULLWIDTH TILDE */
 646           || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
 647           || ch == 0xFFE3 /* FULLWIDTH MACRON */
 648           || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */
 649         {
 650           /* ambiguous (ideograph) ? */
 651           if (unicode_width[ch] != NULL
 652               && unicode_width[ch][0] == 'A')
 653             attr |= 1 << LBP_AI;
 654           else
 655             attr |= 1 << LBP_ID;
 656         }
 657
 658       /* ordinary alphabetic and symbol characters */
 659       if ((unicode_attributes[ch].category[0] == 'L'
 660            && (unicode_attributes[ch].category[1] == 'u'
 661                || unicode_attributes[ch].category[1] == 'l'
 662                || unicode_attributes[ch].category[1] == 't'
 663                || unicode_attributes[ch].category[1] == 'm'
 664                || unicode_attributes[ch].category[1] == 'o'))
 665           || (unicode_attributes[ch].category[0] == 'S'
 666               && (unicode_attributes[ch].category[1] == 'm'
 667                   || unicode_attributes[ch].category[1] == 'c'
 668                   || unicode_attributes[ch].category[1] == 'k'
 669                   || unicode_attributes[ch].category[1] == 'o'))
 670           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 671           || ch == 0x0023 /* NUMBER SIGN */
 672           || ch == 0x0026 /* AMPERSAND */
 673           || ch == 0x002A /* ASTERISK */
 674           || ch == 0x0040 /* COMMERCIAL AT */
 675           || ch == 0x005F /* LOW LINE */
 676           || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
 677           || ch == 0x00B2 /* SUPERSCRIPT TWO */
 678           || ch == 0x00B3 /* SUPERSCRIPT THREE */
 679           || ch == 0x00B7 /* MIDDLE DOT */
 680           || ch == 0x00B9 /* SUPERSCRIPT ONE */
 681           || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
 682           || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
 683           || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
 684           || ch == 0x00BF /* INVERTED QUESTION MARK */
 685           || ch == 0x037E /* GREEK QUESTION MARK */
 686           || ch == 0x0387 /* GREEK ANO TELEIA */
 687           || ch == 0x055A /* ARMENIAN APOSTROPHE */
 688           || ch == 0x055B /* ARMENIAN EMPHASIS MARK */
 689           || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */
 690           || ch == 0x055D /* ARMENIAN COMMA */
 691           || ch == 0x055E /* ARMENIAN QUESTION MARK */
 692           || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */
 693           || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
 694           || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */
 695           || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */
 696           || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */
 697           || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */
 698           || ch == 0x060C /* ARABIC COMMA */
 699           || ch == 0x061B /* ARABIC SEMICOLON */
 700           || ch == 0x061F /* ARABIC QUESTION MARK */
 701           || ch == 0x066A /* ARABIC PERCENT SIGN */
 702           || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
 703           || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
 704           || ch == 0x066D /* ARABIC FIVE POINTED STAR */
 705           || ch == 0x06D4 /* ARABIC FULL STOP */
 706           || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */
 707           || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */
 708           || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */
 709           || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */
 710           || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */
 711           || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */
 712           || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */
 713           || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */
 714           || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */
 715           || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */
 716           || ch == 0x070A /* SYRIAC CONTRACTION */
 717           || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */
 718           || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */
 719           || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */
 720           || ch == 0x0964 /* DEVANAGARI DANDA */
 721           || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
 722           || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */
 723           || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */
 724           || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */
 725           || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */
 726           || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */
 727           || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
 728           || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
 729           || ch == 0x0BF0 /* TAMIL NUMBER TEN */
 730           || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */
 731           || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */
 732           || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */
 733           || ch == 0x0E4F /* THAI CHARACTER FONGMAN */
 734           || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
 735           || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */
 736           || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
 737           || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
 738           || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
 739           || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
 740           || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
 741           || ch == 0x0F0D /* TIBETAN MARK SHAD */
 742           || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
 743           || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
 744           || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
 745           || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
 746           || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
 747           || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */
 748           || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */
 749           || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */
 750           || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */
 751           || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */
 752           || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */
 753           || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */
 754           || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */
 755           || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */
 756           || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */
 757           || ch == 0x0F85 /* TIBETAN MARK PALUTA */
 758           || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
 759           || ch == 0x104B /* MYANMAR SIGN SECTION */
 760           || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */
 761           || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */
 762           || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */
 763           || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */
 764           || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */
 765           || ch == 0x1362 /* ETHIOPIC FULL STOP */
 766           || ch == 0x1363 /* ETHIOPIC COMMA */
 767           || ch == 0x1364 /* ETHIOPIC SEMICOLON */
 768           || ch == 0x1365 /* ETHIOPIC COLON */
 769           || ch == 0x1366 /* ETHIOPIC PREFACE COLON */
 770           || ch == 0x1367 /* ETHIOPIC QUESTION MARK */
 771           || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */
 772           || ch == 0x1372 /* ETHIOPIC NUMBER TEN */
 773           || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */
 774           || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */
 775           || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */
 776           || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */
 777           || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */
 778           || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */
 779           || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */
 780           || ch == 0x137A /* ETHIOPIC NUMBER NINETY */
 781           || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */
 782           || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */
 783           || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */
 784           || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */
 785           || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
 786           || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
 787           || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
 788           || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */
 789           || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */
 790           || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */
 791           || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */
 792           || ch == 0x1800 /* MONGOLIAN BIRGA */
 793           || ch == 0x1801 /* MONGOLIAN ELLIPSIS */
 794           || ch == 0x1802 /* MONGOLIAN COMMA */
 795           || ch == 0x1803 /* MONGOLIAN FULL STOP */
 796           || ch == 0x1804 /* MONGOLIAN COLON */
 797           || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
 798           || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */
 799           || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
 800           || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
 801           || ch == 0x180A /* MONGOLIAN NIRUGU */
 802           || ch == 0x2015 /* HORIZONTAL BAR */
 803           || ch == 0x2016 /* DOUBLE VERTICAL LINE */
 804           || ch == 0x2017 /* DOUBLE LOW LINE */
 805           || ch == 0x2020 /* DAGGER */
 806           || ch == 0x2021 /* DOUBLE DAGGER */
 807           || ch == 0x2022 /* BULLET */
 808           || ch == 0x2023 /* TRIANGULAR BULLET */
 809           || ch == 0x2038 /* CARET */
 810           || ch == 0x203B /* REFERENCE MARK */
 811           || ch == 0x203D /* INTERROBANG */
 812           || ch == 0x203E /* OVERLINE */
 813           || ch == 0x203F /* UNDERTIE */
 814           || ch == 0x2040 /* CHARACTER TIE */
 815           || ch == 0x2041 /* CARET INSERTION POINT */
 816           || ch == 0x2042 /* ASTERISM */
 817           || ch == 0x2043 /* HYPHEN BULLET */
 818           || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
 819           || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
 820           || ch == 0x204A /* TIRONIAN SIGN ET */
 821           || ch == 0x204B /* REVERSED PILCROW SIGN */
 822           || ch == 0x204C /* BLACK LEFTWARDS BULLET */
 823           || ch == 0x204D /* BLACK RIGHTWARDS BULLET */
 824           || ch == 0x2070 /* SUPERSCRIPT ZERO */
 825           || ch == 0x2074 /* SUPERSCRIPT FOUR */
 826           || ch == 0x2075 /* SUPERSCRIPT FIVE */
 827           || ch == 0x2076 /* SUPERSCRIPT SIX */
 828           || ch == 0x2077 /* SUPERSCRIPT SEVEN */
 829           || ch == 0x2078 /* SUPERSCRIPT EIGHT */
 830           || ch == 0x2079 /* SUPERSCRIPT NINE */
 831           || ch == 0x2080 /* SUBSCRIPT ZERO */
 832           || ch == 0x2081 /* SUBSCRIPT ONE */
 833           || ch == 0x2082 /* SUBSCRIPT TWO */
 834           || ch == 0x2083 /* SUBSCRIPT THREE */
 835           || ch == 0x2084 /* SUBSCRIPT FOUR */
 836           || ch == 0x2085 /* SUBSCRIPT FIVE */
 837           || ch == 0x2086 /* SUBSCRIPT SIX */
 838           || ch == 0x2087 /* SUBSCRIPT SEVEN */
 839           || ch == 0x2088 /* SUBSCRIPT EIGHT */
 840           || ch == 0x2089 /* SUBSCRIPT NINE */
 841           || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */
 842           || ch == 0x215F /* FRACTION NUMERATOR ONE */
 843           || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */
 844           || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */
 845           || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */
 846           || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */
 847           || ch == 0x24EA /* CIRCLED DIGIT ZERO */
 848           || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */
 849           || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */
 850           || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */
 851           || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */
 852           || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */
 853           || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
 854         if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
 855           {
 856             /* ambiguous (alphabetic) ? */
 857             if (unicode_width[ch] != NULL
 858                 && unicode_width[ch][0] == 'A')
 859               attr |= 1 << LBP_AI;
 860             else
 861               attr |= 1 << LBP_AL;
 862           }
 863     }
 864
 865   if (attr == 0)
 866     /* unknown */
 867     attr |= 1 << LBP_XX;
 868
 869   return attr;
 870 }
 871
 872 /* Output the line breaking properties in a human readable format.  */
 873 static void
 874 debug_output_lbp (FILE *stream)
 875 {
 876   unsigned int i;
 877
 878   for (i = 0; i < 0x110000; i++)
 879     {
 880       int attr = get_lbp (i);
 881       if (attr != 1 << LBP_XX)
 882         {
 883           fprintf (stream, "0x%04X", i);
 884 #define PRINT_BIT(attr,bit) \
 885   if (attr & (1 << bit)) fprintf (stream, " " #bit);
 886           PRINT_BIT(attr,LBP_BK);
 887           PRINT_BIT(attr,LBP_CM);
 888           PRINT_BIT(attr,LBP_ZW);
 889           PRINT_BIT(attr,LBP_IN);
 890           PRINT_BIT(attr,LBP_GL);
 891           PRINT_BIT(attr,LBP_CB);
 892           PRINT_BIT(attr,LBP_SP);
 893           PRINT_BIT(attr,LBP_BA);
 894           PRINT_BIT(attr,LBP_BB);
 895           PRINT_BIT(attr,LBP_B2);
 896           PRINT_BIT(attr,LBP_HY);
 897           PRINT_BIT(attr,LBP_NS);
 898           PRINT_BIT(attr,LBP_OP);
 899           PRINT_BIT(attr,LBP_CL);
 900           PRINT_BIT(attr,LBP_QU);
 901           PRINT_BIT(attr,LBP_EX);
 902           PRINT_BIT(attr,LBP_ID);
 903           PRINT_BIT(attr,LBP_NU);
 904           PRINT_BIT(attr,LBP_IS);
 905           PRINT_BIT(attr,LBP_SY);
 906           PRINT_BIT(attr,LBP_AL);
 907           PRINT_BIT(attr,LBP_PR);
 908           PRINT_BIT(attr,LBP_PO);
 909           PRINT_BIT(attr,LBP_SA);
 910           PRINT_BIT(attr,LBP_XX);
 911           PRINT_BIT(attr,LBP_AI);
 912 #undef PRINT_BIT
 913           fprintf (stream, "\n");
 914         }
 915     }
 916 }
 917
 918 static void
 919 debug_output_tables (const char *filename)
 920 {
 921   FILE *stream;
 922
 923   stream = fopen (filename, "w");
 924   if (stream == NULL)
 925     {
 926       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 927       exit (1);
 928     }
 929
 930   debug_output_lbp (stream);
 931
 932   if (ferror (stream) || fclose (stream))
 933     {
 934       fprintf (stderr, "error writing to '%s'\n", filename);
 935       exit (1);
 936     }
 937 }
 938
 939 /* The line breaking property from the LineBreak.txt file.  */
 940 int unicode_org_lbp[0x110000];
 941
 942 /* Stores in unicode_org_lbp[] the line breaking property from the
 943    LineBreak.txt file.  */
 944 static void
 945 fill_org_lbp (const char *linebreak_filename)
 946 {
 947   unsigned int i, j;
 948   FILE *stream;
 949   char field0[FIELDLEN];
 950   char field1[FIELDLEN];
 951   char field2[FIELDLEN];
 952   int lineno = 0;
 953
 954   for (i = 0; i < 0x110000; i++)
 955     unicode_org_lbp[i] = LBP_XX;
 956
 957   stream = fopen (linebreak_filename, "r");
 958   if (stream == NULL)
 959     {
 960       fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
 961       exit (1);
 962     }
 963
 964   for (;;)
 965     {
 966       int n;
 967       int c;
 968       int value;
 969
 970       lineno++;
 971       c = getc (stream);
 972       if (c == EOF)
 973         break;
 974       if (c == '#')
 975         {
 976           do c = getc (stream); while (c != EOF && c != '\n');
 977           continue;
 978         }
 979       ungetc (c, stream);
 980       n = getfield (stream, field0, ';');
 981       n += getfield (stream, field1, ' ');
 982       n += getfield (stream, field2, '\n');
 983       if (n == 0)
 984         break;
 985       if (n != 3)
 986         {
 987           fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
 988                    lineno);
 989           exit (1);
 990         }
 991 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
 992       if (false) {}
 993       TRY(LBP_BK)
 994       TRY(LBP_CM)
 995       TRY(LBP_ZW)
 996       TRY(LBP_IN)
 997       TRY(LBP_GL)
 998       TRY(LBP_CB)
 999       TRY(LBP_SP)
1000       TRY(LBP_BA)
1001       TRY(LBP_BB)
1002       TRY(LBP_B2)
1003       TRY(LBP_HY)
1004       TRY(LBP_NS)
1005       TRY(LBP_OP)
1006       TRY(LBP_CL)
1007       TRY(LBP_QU)
1008       TRY(LBP_EX)
1009       TRY(LBP_ID)
1010       TRY(LBP_NU)
1011       TRY(LBP_IS)
1012       TRY(LBP_SY)
1013       TRY(LBP_AL)
1014       TRY(LBP_PR)
1015       TRY(LBP_PO)
1016       TRY(LBP_SA)
1017       TRY(LBP_XX)
1018       TRY(LBP_AI)
1019 #undef TRY
1020       else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1021       else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1022       else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1023       else
1024         {
1025           fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1026                    field1, linebreak_filename, lineno);
1027           exit (1);
1028         }
1029       i = strtoul (field0, NULL, 16);
1030       if (strstr (field0, "..") != NULL)
1031         {
1032           /* Deal with a range.  */
1033           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1034           for (; i <= j; i++)
1035             unicode_org_lbp[i] = value;
1036         }
1037       else
1038         {
1039           /* Single character line.  */
1040           unicode_org_lbp[i] = value;
1041         }
1042     }
1043   if (ferror (stream) || fclose (stream))
1044     {
1045       fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1046       exit (1);
1047     }
1048 }
1049
1050 /* Output the line breaking properties in a human readable format.  */
1051 static void
1052 debug_output_org_lbp (FILE *stream)
1053 {
1054   unsigned int i;
1055
1056   for (i = 0; i < 0x110000; i++)
1057     {
1058       int attr = unicode_org_lbp[i];
1059       if (attr != LBP_XX)
1060         {
1061           fprintf (stream, "0x%04X", i);
1062 #define PRINT_BIT(attr,bit) \
1063   if (attr == bit) fprintf (stream, " " #bit);
1064           PRINT_BIT(attr,LBP_BK);
1065           PRINT_BIT(attr,LBP_CM);
1066           PRINT_BIT(attr,LBP_ZW);
1067           PRINT_BIT(attr,LBP_IN);
1068           PRINT_BIT(attr,LBP_GL);
1069           PRINT_BIT(attr,LBP_CB);
1070           PRINT_BIT(attr,LBP_SP);
1071           PRINT_BIT(attr,LBP_BA);
1072           PRINT_BIT(attr,LBP_BB);
1073           PRINT_BIT(attr,LBP_B2);
1074           PRINT_BIT(attr,LBP_HY);
1075           PRINT_BIT(attr,LBP_NS);
1076           PRINT_BIT(attr,LBP_OP);
1077           PRINT_BIT(attr,LBP_CL);
1078           PRINT_BIT(attr,LBP_QU);
1079           PRINT_BIT(attr,LBP_EX);
1080           PRINT_BIT(attr,LBP_ID);
1081           PRINT_BIT(attr,LBP_NU);
1082           PRINT_BIT(attr,LBP_IS);
1083           PRINT_BIT(attr,LBP_SY);
1084           PRINT_BIT(attr,LBP_AL);
1085           PRINT_BIT(attr,LBP_PR);
1086           PRINT_BIT(attr,LBP_PO);
1087           PRINT_BIT(attr,LBP_SA);
1088           PRINT_BIT(attr,LBP_XX);
1089           PRINT_BIT(attr,LBP_AI);
1090 #undef PRINT_BIT
1091           fprintf (stream, "\n");
1092         }
1093     }
1094 }
1095
/* Creates (or truncates) FILENAME and writes the human readable dump
   produced by debug_output_org_lbp into it.  Prints a diagnostic to
   stderr and exits with status 1 if the file cannot be opened, or if a
   write error is pending on the stream or closing it fails.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *dump;
  int status;

  dump = fopen (filename, "w");
  if (!dump)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (dump);

  /* As before, fclose is skipped when ferror already reports a failure.  */
  status = ferror (dump);
  if (status == 0)
    status = fclose (dump);

  if (status != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1116
1117 /* Construction of sparse 3-level tables.  */
1118 #define TABLE lbp_table
1119 #define ELEMENT unsigned char
1120 #define DEFAULT LBP_XX
1121 #define xmalloc malloc
1122 #define xrealloc realloc
1123 #include "3level.h"
1124
1125 static void
1126 output_lbp (FILE *stream1, FILE *stream2)
1127 {
1128   unsigned int i;
1129   struct lbp_table t;
1130   unsigned int level1_offset, level2_offset, level3_offset;
1131
1132   t.p = 7;
1133   t.q = 9;
1134   lbp_table_init (&t);
1135
1136   for (i = 0; i < 0x110000; i++)
1137     {
1138       int attr = get_lbp (i);
1139
1140       /* Now attr should contain exactly one bit.  */
1141       if (attr == 0 || ((attr & (attr - 1)) != 0))
1142         abort ();
1143
1144       if (attr != 1 << LBP_XX)
1145         {
1146           unsigned int log2_attr;
1147           for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
1148
1149           lbp_table_add (&t, i, log2_attr);
1150         }
1151     }
1152
1153   lbp_table_finalize (&t);
1154
1155   level1_offset =
1156     5 * sizeof (uint32_t);
1157   level2_offset =
1158     5 * sizeof (uint32_t)
1159     + t.level1_size * sizeof (uint32_t);
1160   level3_offset =
1161     5 * sizeof (uint32_t)
1162     + t.level1_size * sizeof (uint32_t)
1163     + (t.level2_size << t.q) * sizeof (uint32_t);
1164
1165   for (i = 0; i < 5; i++)
1166     fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
1167              ((uint32_t *) t.result)[i]);
1168   fprintf (stream1, "\n");
1169   fprintf (stream1, "typedef struct\n");
1170   fprintf (stream1, "  {\n");
1171   fprintf (stream1, "    int level1[%d];\n", t.level1_size);
1172   fprintf (stream1, "    int level2[%d << %d];\n", t.level2_size, t.q);
1173   fprintf (stream1, "    unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1174   fprintf (stream1, "  }\n");
1175   fprintf (stream1, "lbrkprop_t;\n");
1176   fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
1177
1178   fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
1179   fprintf (stream2, "{\n");
1180   fprintf (stream2, "  {");
1181   for (i = 0; i < t.level1_size; i++)
1182     {
1183       uint32_t offset;
1184       if (i > 0 && (i % 8) == 0)
1185         fprintf (stream2, "\n   ");
1186       offset = ((uint32_t *) (t.result + level1_offset))[i];
1187       fprintf (stream2, " %5d%s",
1188                offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1189                (i+1 < t.level1_size ? "," : ""));
1190     }
1191   fprintf (stream2, " },\n");
1192   fprintf (stream2, "  {");
1193   if (t.level2_size << t.q > 8)
1194     fprintf (stream2, "\n   ");
1195   for (i = 0; i < t.level2_size << t.q; i++)
1196     {
1197       uint32_t offset;
1198       if (i > 0 && (i % 8) == 0)
1199         fprintf (stream2, "\n   ");
1200       offset = ((uint32_t *) (t.result + level2_offset))[i];
1201       fprintf (stream2, " %5d%s",
1202                offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1203                (i+1 < t.level2_size << t.q ? "," : ""));
1204     }
1205   if (t.level2_size << t.q > 8)
1206     fprintf (stream2, "\n ");
1207   fprintf (stream2, " },\n");
1208   fprintf (stream2, "  {");
1209   if (t.level3_size << t.p > 8)
1210     fprintf (stream2, "\n   ");
1211   for (i = 0; i < t.level3_size << t.p; i++)
1212     {
1213       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1214       const char *value_string;
1215       switch (value)
1216         {
1217 #define CASE(x) case x: value_string = #x; break;
1218           CASE(LBP_BK);
1219           CASE(LBP_CM);
1220           CASE(LBP_ZW);
1221           CASE(LBP_IN);
1222           CASE(LBP_GL);
1223           CASE(LBP_CB);
1224           CASE(LBP_SP);
1225           CASE(LBP_BA);
1226           CASE(LBP_BB);
1227           CASE(LBP_B2);
1228           CASE(LBP_HY);
1229           CASE(LBP_NS);
1230           CASE(LBP_OP);
1231           CASE(LBP_CL);
1232           CASE(LBP_QU);
1233           CASE(LBP_EX);
1234           CASE(LBP_ID);
1235           CASE(LBP_NU);
1236           CASE(LBP_IS);
1237           CASE(LBP_SY);
1238           CASE(LBP_AL);
1239           CASE(LBP_PR);
1240           CASE(LBP_PO);
1241           CASE(LBP_SA);
1242           CASE(LBP_XX);
1243           CASE(LBP_AI);
1244 #undef CASE
1245           default:
1246             abort ();
1247         }
1248       if (i > 0 && (i % 8) == 0)
1249         fprintf (stream2, "\n   ");
1250       fprintf (stream2, " %s%s", value_string,
1251                (i+1 < t.level3_size << t.p ? "," : ""));
1252     }
1253   if (t.level3_size << t.p > 8)
1254     fprintf (stream2, "\n ");
1255   fprintf (stream2, " }\n");
1256   fprintf (stream2, "};\n");
1257 }
1258
/* Generates the two table files.  FILENAME1 and FILENAME2 are both
   created (or truncated) first; then each receives the same "generated
   automatically" banner mentioning VERSION and the same license comment;
   then output_lbp writes the actual tables, with FILENAME1 as its first
   stream and FILENAME2 as its second.  Prints a diagnostic to stderr and
   exits with status 1 if a file cannot be opened, or if a write error is
   pending on a stream or closing it fails.  */
static void
output_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      "   This program is free software: you can redistribute it and/or modify\n",
      "   it under the terms of the GNU General Public License as published by\n",
      "   the Free Software Foundation; either version 3 of the License, or\n",
      "   (at your option) any later version.\n",
      "\n",
      "   This program is distributed in the hope that it will be useful,\n",
      "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n",
      "   GNU General Public License for more details.\n",
      "\n",
      "   You should have received a copy of the GNU General Public License\n",
      "   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */\n",
      "\n"
    };
  const char *paths[2];
  FILE *outputs[2];
  size_t k;

  paths[0] = filename1;
  paths[1] = filename2;

  /* Both files are opened before anything is written to either.  */
  for (k = 0; k < 2; k++)
    {
      outputs[k] = fopen (paths[k], "w");
      if (outputs[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = outputs[k];
      size_t line;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters.  */\n", out);
      fprintf (out, "/* Generated automatically by gen-lbrk for Unicode %s.  */\n",
               version);
      fputs ("\n", out);

      for (line = 0; line < sizeof license / sizeof license[0]; line++)
        fputs (license[line], out);
    }

  output_lbp (outputs[0], outputs[1]);

  for (k = 0; k < 2; k++)
    {
      /* fclose is skipped when ferror already reports a failure.  */
      bool failed = ferror (outputs[k]) != 0;

      if (!failed)
        failed = fclose (outputs[k]) != 0;
      if (failed)
        {
          fprintf (stderr, "error writing to '%s'\n", paths[k]);
          exit (1);
        }
    }
}
1320
/* Entry point.  Expects exactly four arguments: the UnicodeData.txt,
   EastAsianWidth.txt and LineBreak.txt file names and the Unicode version
   string.  With any other argument count a usage message is printed to
   stderr and the program exits with status 1.  Otherwise the three input
   files are loaded, the debug dumps "lbrkprop.txt" and "lbrkprop_org.txt"
   are written, and the tables are generated into "lbrkprop1.h" and
   "lbrkprop2.h".  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *eastasianwidth_filename;
  const char *linebreak_filename;
  const char *version;

  if (argc != 5)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  eastasianwidth_filename = argv[2];
  linebreak_filename = argv[3];
  version = argv[4];

  /* Load the input data.  */
  fill_attributes (unicodedata_filename);
  fill_width (eastasianwidth_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable dumps of the computed and of the original values.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated C tables.  */
  output_tables ("lbrkprop1.h", "lbrkprop2.h", version);

  return 0;
}