1 /* Generate Unicode conforming Line Break Properties tables from a UnicodeData file.
3 Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 $ gen-lbrk /usr/local/share/Unidata/UnicodeData.txt \
21 /usr/local/share/Unidata/EastAsianWidth.txt \
22 /usr/local/share/Unidata/LineBreak.txt \
33 /* This structure represents one line in the UnicodeData.txt file.
   fill_attribute stores "" in a string member whose field is empty and
   NONE in a case mapping whose field is empty. */
34 struct unicode_attribute
36 const char *name; /* Character name */
37 const char *category; /* General category */
38 const char *combining; /* Canonical combining class */
39 const char *bidi; /* Bidirectional category */
40 const char *decomposition; /* Character decomposition mapping */
41 const char *decdigit; /* Decimal digit value */
42 const char *digit; /* Digit value */
43 const char *numeric; /* Numeric value */
44 int mirrored; /* 1 if the mirrored field starts with 'Y', else 0 */
45 const char *oldname; /* Old Unicode 1.0 name */
46 const char *comment; /* Comment */
47 unsigned int upper; /* Uppercase mapping, or NONE */
48 unsigned int lower; /* Lowercase mapping, or NONE */
49 unsigned int title; /* Titlecase mapping, or NONE */
52 /* Missing fields are represented with "" for strings, and NONE for characters. */
54 #define NONE (~(unsigned int)0)
56 /* The entire contents of the UnicodeData.txt file, indexed by code point.
   An element whose name is NULL has no entry in the file: fill_attributes
   resets every name to NULL before reading, and get_lbp tests for it. */
57 struct unicode_attribute unicode_attributes [0x110000];
59 /* Stores in unicode_attributes[i] the values from the given fields.
   FIELD1 .. FIELD14 are the columns that follow the code point on one
   UnicodeData.txt line (fill_attributes passes them in that order).
   The results of strdup are stored without being checked for NULL. */
61 fill_attribute (unsigned int i,
62 const char *field1, const char *field2,
63 const char *field3, const char *field4,
64 const char *field5, const char *field6,
65 const char *field7, const char *field8,
66 const char *field9, const char *field10,
67 const char *field11, const char *field12,
68 const char *field13, const char *field14)
70 struct unicode_attribute * uni;
/* Diagnostic printed when I cannot be used as an index. */
74 fprintf (stderr, "index too large\n");
77 uni = &unicode_attributes[i];
78 /* Copy the strings. */
/* The name is always duplicated; every other empty string shares the
   literal "" instead of being allocated. */
79 uni->name = strdup (field1);
80 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
81 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
82 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
83 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
85 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
86 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* Only the first character of the mirrored field is examined. */
87 uni->mirrored = (field9[0] == 'Y');
88 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
89 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* The case mappings are parsed as hexadecimal; an empty field gives NONE. */
90 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
95 /* Maximum length of a field in the UnicodeData.txt file. */
98 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
99 Reads up to (but excluding) DELIM.
100 Returns 1 when a field was successfully read, otherwise 0. */
102 getfield (FILE *stream, char *buffer, int delim)
/* Consume characters until DELIM or end of file.  DELIM itself is read
   from STREAM by the getc call that terminates the loop. */
107 for (; (c = getc (stream)), (c != EOF && c != delim); )
109 /* The original unicode.org UnicodeData.txt file happens to have
110 CR/LF line terminators. Silently convert to LF. */
114 /* Put c into the buffer. */
/* The count is incremented before the comparison, so a field is rejected
   once it reaches FIELDLEN - 1 characters (presumably to keep one byte
   for the terminating NUL -- the store itself is not visible here). */
115 if (++count >= FIELDLEN - 1)
117 fprintf (stderr, "field too long\n");
130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
133 fill_attributes (const char *unicodedata_filename)
/* One buffer per semicolon-separated column; field0 receives the code
   point, field1 .. field14 the values handed to fill_attribute. */
137 char field0[FIELDLEN];
138 char field1[FIELDLEN];
139 char field2[FIELDLEN];
140 char field3[FIELDLEN];
141 char field4[FIELDLEN];
142 char field5[FIELDLEN];
143 char field6[FIELDLEN];
144 char field7[FIELDLEN];
145 char field8[FIELDLEN];
146 char field9[FIELDLEN];
147 char field10[FIELDLEN];
148 char field11[FIELDLEN];
149 char field12[FIELDLEN];
150 char field13[FIELDLEN];
151 char field14[FIELDLEN];
/* Mark every code point as having no entry before the file is read. */
154 for (i = 0; i < 0x110000; i++)
155 unicode_attributes[i].name = NULL;
157 stream = fopen (unicodedata_filename, "r");
160 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* Read the 15 columns of one line; n accumulates the getfield results.
   The last column ends at the newline instead of a semicolon. */
169 n = getfield (stream, field0, ';');
170 n += getfield (stream, field1, ';');
171 n += getfield (stream, field2, ';');
172 n += getfield (stream, field3, ';');
173 n += getfield (stream, field4, ';');
174 n += getfield (stream, field5, ';');
175 n += getfield (stream, field6, ';');
176 n += getfield (stream, field7, ';');
177 n += getfield (stream, field8, ';');
178 n += getfield (stream, field9, ';');
179 n += getfield (stream, field10, ';');
180 n += getfield (stream, field11, ';');
181 n += getfield (stream, field12, ';');
182 n += getfield (stream, field13, ';');
183 n += getfield (stream, field14, '\n');
/* NOTE(review): this message has no space before the quote, unlike the
   "short line in '%s'" message printed by fill_width. */
188 fprintf (stderr, "short line in'%s':%d\n",
189 unicodedata_filename, lineno);
/* The code point column is hexadecimal. */
192 i = strtoul (field0, NULL, 16);
/* A name ending in ", First>" opens a range; the length test keeps the
   suffix comparison inside the string. */
194 && strlen (field1) >= 9
195 && !strcmp (field1 + strlen(field1) - 8, ", First>"))
197 /* Deal with a range. */
/* Read the following line, which closes the range, into the same
   buffers. */
199 n = getfield (stream, field0, ';');
200 n += getfield (stream, field1, ';');
201 n += getfield (stream, field2, ';');
202 n += getfield (stream, field3, ';');
203 n += getfield (stream, field4, ';');
204 n += getfield (stream, field5, ';');
205 n += getfield (stream, field6, ';');
206 n += getfield (stream, field7, ';');
207 n += getfield (stream, field8, ';');
208 n += getfield (stream, field9, ';');
209 n += getfield (stream, field10, ';');
210 n += getfield (stream, field11, ';');
211 n += getfield (stream, field12, ';');
212 n += getfield (stream, field13, ';');
213 n += getfield (stream, field14, '\n');
216 fprintf (stderr, "missing end range in '%s':%d\n",
217 unicodedata_filename, lineno);
/* The closing line must carry a name of the form "<..., Last>". */
220 if (!(field1[0] == '<'
221 && strlen (field1) >= 8
222 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224 fprintf (stderr, "missing end range in '%s':%d\n",
225 unicodedata_filename, lineno);
/* Cut off the ", Last>" suffix; field1+1 below skips the leading '<'. */
228 field1[strlen (field1) - 7] = '\0';
/* field0 now holds the code point of the closing line. */
229 j = strtoul (field0, NULL, 16);
231 fill_attribute (i, field1+1, field2, field3, field4, field5,
232 field6, field7, field8, field9, field10,
233 field11, field12, field13, field14);
237 /* Single character line */
238 fill_attribute (i, field1, field2, field3, field4, field5,
239 field6, field7, field8, field9, field10,
240 field11, field12, field13, field14);
/* fclose is only attempted when no read error is pending on the stream. */
243 if (ferror (stream) || fclose (stream))
245 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
250 /* The width property from the EastAsianWidth.txt file.
251 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by code point.  fill_width presets "N" for every code point
   that has an entry in unicode_attributes[]. */
252 const char * unicode_width[0x110000];
254 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
257 fill_width (const char *width_filename)
261 char field0[FIELDLEN];
262 char field1[FIELDLEN];
263 char field2[FIELDLEN];
/* Default: a code point with an entry in unicode_attributes[] is "N",
   every other code point has no width. */
266 for (i = 0; i < 0x110000; i++)
267 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
269 stream = fopen (width_filename, "r");
272 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Discard characters up to and including the next newline (or until end
   of file). */
287 do c = getc (stream); while (c != EOF && c != '\n');
/* Columns: code point or range up to ';', the width up to the first
   space, then the remainder of the line. */
291 n = getfield (stream, field0, ';');
292 n += getfield (stream, field1, ' ');
293 n += getfield (stream, field2, '\n');
298 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
/* strtoul stops at the first non-hexadecimal character, so for a range
   this yields the code point written before "..". */
301 i = strtoul (field0, NULL, 16);
302 if (strstr (field0, "..") != NULL)
304 /* Deal with a range. */
/* The text after ".." is parsed as the other end of the range. */
305 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
307 unicode_width[i] = strdup (field1);
311 /* Single character line. */
312 unicode_width[i] = strdup (field1);
/* fclose is only attempted when no read error is pending on the stream. */
315 if (ferror (stream) || fclose (stream))
317 fprintf (stderr, "error reading from '%s'\n", width_filename);
322 /* Line breaking classification.
   get_lbp and debug_output_lbp use each value as a bit position
   (1 << LBP_x) in an int, so the values must stay distinct and below the
   width of int.  The enumerators are listed in a different order than
   their numeric values. */
326 /* Values >= 20 are resolved at run time. */
327 LBP_BK = 0, /* mandatory break */
328 /*LBP_CR, carriage return - not used here because it's a DOSism */
329 /*LBP_LF, line feed - not used here because it's a DOSism */
330 LBP_CM = 20, /* attached characters and combining marks */
331 /*LBP_SG, surrogates - not used here because they are not characters */
332 LBP_ZW = 1, /* zero width space */
333 LBP_IN = 2, /* inseparable */
334 LBP_GL = 3, /* non-breaking (glue) */
335 LBP_CB = 22, /* contingent break opportunity */
336 LBP_SP = 21, /* space */
337 LBP_BA = 4, /* break opportunity after */
338 LBP_BB = 5, /* break opportunity before */
339 LBP_B2 = 6, /* break opportunity before and after */
340 LBP_HY = 7, /* hyphen */
341 LBP_NS = 8, /* non starter */
342 LBP_OP = 9, /* opening punctuation */
343 LBP_CL = 10, /* closing punctuation */
344 LBP_QU = 11, /* ambiguous quotation */
345 LBP_EX = 12, /* exclamation/interrogation */
346 LBP_ID = 13, /* ideographic */
347 LBP_NU = 14, /* numeric */
348 LBP_IS = 15, /* infix separator (numeric) */
349 LBP_SY = 16, /* symbols allowing breaks */
350 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
351 LBP_PR = 18, /* prefix (numeric) */
352 LBP_PO = 19, /* postfix (numeric) */
353 LBP_SA = 23, /* complex context (South East Asian) */
354 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
355 LBP_XX = 25 /* unknown */
358 /* Returns the line breaking classification for ch, as a bit mask.
   Bit (1 << LBP_x) of the mask stands for class LBP_x.  The tests read
   unicode_attributes[] and unicode_width[], so both tables must have
   been filled before this function is called. */
360 get_lbp (unsigned int ch)
/* The classification tests apply to code points that have an entry in
   unicode_attributes[]. */
364 if (unicode_attributes[ch].name != NULL)
366 /* mandatory break */
367 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
368 || ch == 0x000C /* form feed */
369 || ch == 0x2028 /* LINE SEPARATOR */
370 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
373 /* zero width space */
374 if (ch == 0x200B /* ZERO WIDTH SPACE */)
378 if (ch == 0x2024 /* ONE DOT LEADER */
379 || ch == 0x2025 /* TWO DOT LEADER */
380 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
383 /* non-breaking (glue) */
384 if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
385 || ch == 0x00A0 /* NO-BREAK SPACE */
386 || ch == 0x202F /* NARROW NO-BREAK SPACE */
387 || ch == 0x2007 /* FIGURE SPACE */
388 || ch == 0x2011 /* NON-BREAKING HYPHEN */
389 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
392 /* contingent break opportunity */
393 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
397 if (ch == 0x0020 /* SPACE */)
400 /* break opportunity after */
401 if (ch == 0x2000 /* EN QUAD */
402 || ch == 0x2001 /* EM QUAD */
403 || ch == 0x2002 /* EN SPACE */
404 || ch == 0x2003 /* EM SPACE */
405 || ch == 0x2004 /* THREE-PER-EM SPACE */
406 || ch == 0x2005 /* FOUR-PER-EM SPACE */
407 || ch == 0x2006 /* SIX-PER-EM SPACE */
408 || ch == 0x2008 /* PUNCTUATION SPACE */
409 || ch == 0x2009 /* THIN SPACE */
410 || ch == 0x200A /* HAIR SPACE */
411 || ch == 0x0009 /* tab */
412 || ch == 0x058A /* ARMENIAN HYPHEN */
413 || ch == 0x2010 /* HYPHEN */
414 || ch == 0x2012 /* FIGURE DASH */
415 || ch == 0x2013 /* EN DASH */
416 || ch == 0x00AD /* SOFT HYPHEN */
417 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
418 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
419 || ch == 0x1680 /* OGHAM SPACE MARK */
420 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
421 || ch == 0x2027 /* HYPHENATION POINT */
422 || ch == 0x007C /* VERTICAL LINE */)
425 /* break opportunity before */
426 if (ch == 0x00B4 /* ACUTE ACCENT */
427 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
428 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
429 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
432 /* break opportunity before and after */
433 if (ch == 0x2014 /* EM DASH */)
437 if (ch == 0x002D /* HYPHEN-MINUS */)
440 /* exclamation/interrogation */
441 if (ch == 0x0021 /* EXCLAMATION MARK */
442 || ch == 0x003F /* QUESTION MARK */
443 || ch == 0xFE56 /* SMALL QUESTION MARK */
444 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
445 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
446 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
449 /* opening punctuation */
450 if (unicode_attributes[ch].category[0] == 'P'
451 && unicode_attributes[ch].category[1] == 's')
454 /* closing punctuation */
455 if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
456 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
457 || ch == 0xFE50 /* SMALL COMMA */
458 || ch == 0xFE52 /* SMALL FULL STOP */
459 || ch == 0xFF0C /* FULLWIDTH COMMA */
460 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
461 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
462 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
463 || (unicode_attributes[ch].category[0] == 'P'
464 && unicode_attributes[ch].category[1] == 'e'))
467 /* ambiguous quotation */
468 if (ch == 0x0022 /* QUOTATION MARK */
469 || ch == 0x0027 /* APOSTROPHE */
470 || (unicode_attributes[ch].category[0] == 'P'
471 && (unicode_attributes[ch].category[1] == 'f'
472 || unicode_attributes[ch].category[1] == 'i')))
475 /* attached characters and combining marks */
476 if ((unicode_attributes[ch].category[0] == 'M'
477 && (unicode_attributes[ch].category[1] == 'n'
478 || unicode_attributes[ch].category[1] == 'c'
479 || unicode_attributes[ch].category[1] == 'e'))
480 || (ch >= 0x1160 && ch <= 0x11F9)
481 || (unicode_attributes[ch].category[0] == 'C'
482 && (unicode_attributes[ch].category[1] == 'c'
483 || unicode_attributes[ch].category[1] == 'f')))
/* Applies only when none of BK, BA, GL has been recorded in attr. */
484 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
/* NOTE(review): the width tests in the next condition index
   unicode_width[ch] without the NULL check used by the 'A' tests further
   down.  fill_width stores a non-NULL width for every code point that
   has an entry in unicode_attributes[], so this relies on fill_width
   having run before get_lbp. */
488 if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
489 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
490 || ch == 0x17D4 /* KHMER SIGN KHAN */
491 || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
492 || ch == 0x17D7 /* KHMER SIGN LEK TOO */
493 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
494 || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
495 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
496 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
497 || ch == 0x2044 /* FRACTION SLASH */
498 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
499 || ch == 0x301C /* WAVE DASH */
500 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
501 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
502 || ch == 0x309D /* HIRAGANA ITERATION MARK */
503 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
504 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
505 || ch == 0x30FD /* KATAKANA ITERATION MARK */
506 || ch == 0xFE54 /* SMALL SEMICOLON */
507 || ch == 0xFE55 /* SMALL COLON */
508 || ch == 0xFF1A /* FULLWIDTH COLON */
509 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
510 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
511 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
512 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
513 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
514 || (unicode_attributes[ch].category[0] == 'L'
515 && unicode_attributes[ch].category[1] == 'm'
516 && (unicode_width[ch][0] == 'W'
517 || unicode_width[ch][0] == 'H'))
518 || (unicode_attributes[ch].category[0] == 'S'
519 && unicode_attributes[ch].category[1] == 'k'
520 && unicode_width[ch][0] == 'W')
521 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
522 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
526 if (unicode_attributes[ch].category[0] == 'N'
527 && unicode_attributes[ch].category[1] == 'd'
528 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
531 /* infix separator (numeric) */
532 if (ch == 0x002C /* COMMA */
533 || ch == 0x002E /* FULL STOP */
534 || ch == 0x003A /* COLON */
535 || ch == 0x003B /* SEMICOLON */
536 || ch == 0x0589 /* ARMENIAN FULL STOP */)
539 /* symbols allowing breaks */
540 if (ch == 0x002F /* SOLIDUS */)
543 /* postfix (numeric) */
544 if (ch == 0x0025 /* PERCENT SIGN */
545 || ch == 0x00A2 /* CENT SIGN */
546 || ch == 0x00B0 /* DEGREE SIGN */
547 || ch == 0x2030 /* PER MILLE SIGN */
548 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
549 || ch == 0x2032 /* PRIME */
550 || ch == 0x2033 /* DOUBLE PRIME */
551 || ch == 0x2034 /* TRIPLE PRIME */
552 || ch == 0x2035 /* REVERSED PRIME */
553 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
554 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
555 || ch == 0x20A7 /* PESETA SIGN */
556 || ch == 0x2103 /* DEGREE CELSIUS */
557 || ch == 0x2109 /* DEGREE FAHRENHEIT */
558 || ch == 0x2126 /* OHM SIGN */
559 || ch == 0xFE6A /* SMALL PERCENT SIGN */
560 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
561 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */)
564 /* prefix (numeric) */
565 if (ch == 0x002B /* PLUS SIGN */
566 || ch == 0x005C /* REVERSE SOLIDUS */
567 || ch == 0x00B1 /* PLUS-MINUS SIGN */
568 || ch == 0x2116 /* NUMERO SIGN */
569 || ch == 0x2212 /* MINUS SIGN */
570 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */
571 || (unicode_attributes[ch].category[0] == 'S'
572 && unicode_attributes[ch].category[1] == 'c'))
/* Applies only when PO has not been recorded in attr. */
573 if (!(attr & (1 << LBP_PO)))
576 /* complex context (South East Asian) */
577 if (((ch >= 0x0E00 && ch <= 0x0EFF)
578 || (ch >= 0x1000 && ch <= 0x109F)
579 || (ch >= 0x1780 && ch <= 0x17FF))
580 && unicode_attributes[ch].category[0] == 'L'
581 && (unicode_attributes[ch].category[1] == 'm'
582 || unicode_attributes[ch].category[1] == 'o'))
583 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
587 if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
588 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
589 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
590 || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
591 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
592 || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
593 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
594 || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
595 || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
596 || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */
597 || ch == 0xFE62 /* SMALL PLUS SIGN */
598 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
599 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
600 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
601 || ch == 0xFE66 /* SMALL EQUALS SIGN */
602 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
603 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
604 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
605 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
606 || (ch >= 0x3000 && ch <= 0x33FF
607 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
608 /* Extra characters for compatibility with Unicode LineBreak.txt. */
609 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
610 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
611 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
612 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
613 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
614 || ch == 0xFE49 /* DASHED OVERLINE */
615 || ch == 0xFE4A /* CENTRELINE OVERLINE */
616 || ch == 0xFE4B /* WAVY OVERLINE */
617 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
618 || ch == 0xFE4D /* DASHED LOW LINE */
619 || ch == 0xFE4E /* CENTRELINE LOW LINE */
620 || ch == 0xFE4F /* WAVY LOW LINE */
621 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
622 || ch == 0xFE58 /* SMALL EM DASH */
623 || ch == 0xFE5F /* SMALL NUMBER SIGN */
624 || ch == 0xFE60 /* SMALL AMPERSAND */
625 || ch == 0xFE61 /* SMALL ASTERISK */
626 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
627 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
628 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
629 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
630 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
631 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
632 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
633 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
634 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
635 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
636 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
637 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
638 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
639 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
640 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
641 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
642 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
643 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
644 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
645 || ch == 0xFF5E /* FULLWIDTH TILDE */
646 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
647 || ch == 0xFFE3 /* FULLWIDTH MACRON */
648 || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */
650 /* ambiguous (ideograph) ? */
651 if (unicode_width[ch] != NULL
652 && unicode_width[ch][0] == 'A')
658 /* ordinary alphabetic and symbol characters */
659 if ((unicode_attributes[ch].category[0] == 'L'
660 && (unicode_attributes[ch].category[1] == 'u'
661 || unicode_attributes[ch].category[1] == 'l'
662 || unicode_attributes[ch].category[1] == 't'
663 || unicode_attributes[ch].category[1] == 'm'
664 || unicode_attributes[ch].category[1] == 'o'))
665 || (unicode_attributes[ch].category[0] == 'S'
666 && (unicode_attributes[ch].category[1] == 'm'
667 || unicode_attributes[ch].category[1] == 'c'
668 || unicode_attributes[ch].category[1] == 'k'
669 || unicode_attributes[ch].category[1] == 'o'))
670 /* Extra characters for compatibility with Unicode LineBreak.txt. */
671 || ch == 0x0023 /* NUMBER SIGN */
672 || ch == 0x0026 /* AMPERSAND */
673 || ch == 0x002A /* ASTERISK */
674 || ch == 0x0040 /* COMMERCIAL AT */
675 || ch == 0x005F /* LOW LINE */
676 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
677 || ch == 0x00B2 /* SUPERSCRIPT TWO */
678 || ch == 0x00B3 /* SUPERSCRIPT THREE */
679 || ch == 0x00B7 /* MIDDLE DOT */
680 || ch == 0x00B9 /* SUPERSCRIPT ONE */
681 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
682 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
683 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
684 || ch == 0x00BF /* INVERTED QUESTION MARK */
685 || ch == 0x037E /* GREEK QUESTION MARK */
686 || ch == 0x0387 /* GREEK ANO TELEIA */
687 || ch == 0x055A /* ARMENIAN APOSTROPHE */
688 || ch == 0x055B /* ARMENIAN EMPHASIS MARK */
689 || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */
690 || ch == 0x055D /* ARMENIAN COMMA */
691 || ch == 0x055E /* ARMENIAN QUESTION MARK */
692 || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */
693 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
694 || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */
695 || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */
696 || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */
697 || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */
698 || ch == 0x060C /* ARABIC COMMA */
699 || ch == 0x061B /* ARABIC SEMICOLON */
700 || ch == 0x061F /* ARABIC QUESTION MARK */
701 || ch == 0x066A /* ARABIC PERCENT SIGN */
702 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
703 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
704 || ch == 0x066D /* ARABIC FIVE POINTED STAR */
705 || ch == 0x06D4 /* ARABIC FULL STOP */
706 || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */
707 || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */
708 || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */
709 || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */
710 || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */
711 || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */
712 || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */
713 || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */
714 || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */
715 || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */
716 || ch == 0x070A /* SYRIAC CONTRACTION */
717 || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */
718 || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */
719 || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */
720 || ch == 0x0964 /* DEVANAGARI DANDA */
721 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
722 || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */
723 || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */
724 || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */
725 || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */
726 || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */
727 || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
728 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
729 || ch == 0x0BF0 /* TAMIL NUMBER TEN */
730 || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */
731 || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */
732 || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */
733 || ch == 0x0E4F /* THAI CHARACTER FONGMAN */
734 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
735 || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */
736 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
737 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
738 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
739 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
740 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
741 || ch == 0x0F0D /* TIBETAN MARK SHAD */
742 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
743 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
744 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
745 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
746 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
747 || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */
748 || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */
749 || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */
750 || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */
751 || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */
752 || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */
753 || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */
754 || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */
755 || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */
756 || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */
757 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
758 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
759 || ch == 0x104B /* MYANMAR SIGN SECTION */
760 || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */
761 || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */
762 || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */
763 || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */
764 || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */
765 || ch == 0x1362 /* ETHIOPIC FULL STOP */
766 || ch == 0x1363 /* ETHIOPIC COMMA */
767 || ch == 0x1364 /* ETHIOPIC SEMICOLON */
768 || ch == 0x1365 /* ETHIOPIC COLON */
769 || ch == 0x1366 /* ETHIOPIC PREFACE COLON */
770 || ch == 0x1367 /* ETHIOPIC QUESTION MARK */
771 || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */
772 || ch == 0x1372 /* ETHIOPIC NUMBER TEN */
773 || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */
774 || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */
775 || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */
776 || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */
777 || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */
778 || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */
779 || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */
780 || ch == 0x137A /* ETHIOPIC NUMBER NINETY */
781 || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */
782 || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */
783 || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */
784 || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */
785 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
786 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
787 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
788 || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */
789 || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */
790 || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */
791 || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */
792 || ch == 0x1800 /* MONGOLIAN BIRGA */
793 || ch == 0x1801 /* MONGOLIAN ELLIPSIS */
794 || ch == 0x1802 /* MONGOLIAN COMMA */
795 || ch == 0x1803 /* MONGOLIAN FULL STOP */
796 || ch == 0x1804 /* MONGOLIAN COLON */
797 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
798 || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */
799 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
800 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
801 || ch == 0x180A /* MONGOLIAN NIRUGU */
802 || ch == 0x2015 /* HORIZONTAL BAR */
803 || ch == 0x2016 /* DOUBLE VERTICAL LINE */
804 || ch == 0x2017 /* DOUBLE LOW LINE */
805 || ch == 0x2020 /* DAGGER */
806 || ch == 0x2021 /* DOUBLE DAGGER */
807 || ch == 0x2022 /* BULLET */
808 || ch == 0x2023 /* TRIANGULAR BULLET */
809 || ch == 0x2038 /* CARET */
810 || ch == 0x203B /* REFERENCE MARK */
811 || ch == 0x203D /* INTERROBANG */
812 || ch == 0x203E /* OVERLINE */
813 || ch == 0x203F /* UNDERTIE */
814 || ch == 0x2040 /* CHARACTER TIE */
815 || ch == 0x2041 /* CARET INSERTION POINT */
816 || ch == 0x2042 /* ASTERISM */
817 || ch == 0x2043 /* HYPHEN BULLET */
818 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
819 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
820 || ch == 0x204A /* TIRONIAN SIGN ET */
821 || ch == 0x204B /* REVERSED PILCROW SIGN */
822 || ch == 0x204C /* BLACK LEFTWARDS BULLET */
823 || ch == 0x204D /* BLACK RIGHTWARDS BULLET */
824 || ch == 0x2070 /* SUPERSCRIPT ZERO */
825 || ch == 0x2074 /* SUPERSCRIPT FOUR */
826 || ch == 0x2075 /* SUPERSCRIPT FIVE */
827 || ch == 0x2076 /* SUPERSCRIPT SIX */
828 || ch == 0x2077 /* SUPERSCRIPT SEVEN */
829 || ch == 0x2078 /* SUPERSCRIPT EIGHT */
830 || ch == 0x2079 /* SUPERSCRIPT NINE */
831 || ch == 0x2080 /* SUBSCRIPT ZERO */
832 || ch == 0x2081 /* SUBSCRIPT ONE */
833 || ch == 0x2082 /* SUBSCRIPT TWO */
834 || ch == 0x2083 /* SUBSCRIPT THREE */
835 || ch == 0x2084 /* SUBSCRIPT FOUR */
836 || ch == 0x2085 /* SUBSCRIPT FIVE */
837 || ch == 0x2086 /* SUBSCRIPT SIX */
838 || ch == 0x2087 /* SUBSCRIPT SEVEN */
839 || ch == 0x2088 /* SUBSCRIPT EIGHT */
840 || ch == 0x2089 /* SUBSCRIPT NINE */
841 || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */
842 || ch == 0x215F /* FRACTION NUMERATOR ONE */
843 || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */
844 || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */
845 || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */
846 || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */
847 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
848 || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */
849 || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */
850 || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */
851 || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */
852 || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */
853 || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
/* Applies only when none of the listed classes has been recorded in attr. */
854 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
856 /* ambiguous (alphabetic) ? */
857 if (unicode_width[ch] != NULL
858 && unicode_width[ch][0] == 'A')
872 /* Output the line breaking properties in a human readable format.
   Writes to STREAM, for each listed code point, "0x" followed by the code
   point in hexadecimal (at least four digits), then the names of the
   classes set in its get_lbp mask, then a newline. */
874 debug_output_lbp (FILE *stream)
878 for (i = 0; i < 0x110000; i++)
880 int attr = get_lbp (i);
/* Code points whose mask is exactly the LBP_XX bit are not listed. */
881 if (attr != 1 << LBP_XX)
883 fprintf (stream, "0x%04X", i);
/* Prints a space and the enumerator name when its bit is set in attr. */
884 #define PRINT_BIT(attr,bit) \
885 if (attr & (1 << bit)) fprintf (stream, " " #bit);
886 PRINT_BIT(attr,LBP_BK);
887 PRINT_BIT(attr,LBP_CM);
888 PRINT_BIT(attr,LBP_ZW);
889 PRINT_BIT(attr,LBP_IN);
890 PRINT_BIT(attr,LBP_GL);
891 PRINT_BIT(attr,LBP_CB);
892 PRINT_BIT(attr,LBP_SP);
893 PRINT_BIT(attr,LBP_BA);
894 PRINT_BIT(attr,LBP_BB);
895 PRINT_BIT(attr,LBP_B2);
896 PRINT_BIT(attr,LBP_HY);
897 PRINT_BIT(attr,LBP_NS);
898 PRINT_BIT(attr,LBP_OP);
899 PRINT_BIT(attr,LBP_CL);
900 PRINT_BIT(attr,LBP_QU);
901 PRINT_BIT(attr,LBP_EX);
902 PRINT_BIT(attr,LBP_ID);
903 PRINT_BIT(attr,LBP_NU);
904 PRINT_BIT(attr,LBP_IS);
905 PRINT_BIT(attr,LBP_SY);
906 PRINT_BIT(attr,LBP_AL);
907 PRINT_BIT(attr,LBP_PR);
908 PRINT_BIT(attr,LBP_PO);
909 PRINT_BIT(attr,LBP_SA);
/* LBP_XX is printed before LBP_AI, unlike the enumerator order. */
910 PRINT_BIT(attr,LBP_XX);
911 PRINT_BIT(attr,LBP_AI);
913 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_lbp to the file FILENAME,
   which is opened for writing in text mode. */
919 debug_output_tables (const char *filename)
923 stream = fopen (filename, "w");
926 fprintf (stderr, "cannot open '%s' for writing\n", filename);
930 debug_output_lbp (stream);
/* fclose is only attempted when no write error is pending on the stream. */
932 if (ferror (stream) || fclose (stream))
934 fprintf (stderr, "error writing to '%s'\n", filename);
/* The line breaking property from the LineBreak.txt file.
   Indexed by code point (0 .. 0x10FFFF).  Each element holds one LBP_*
   value, not a bit mask: fill_org_lbp initializes every element to LBP_XX
   and then stores the value parsed for each listed code point.  */
int unicode_org_lbp[0x110000];
942 /* Stores in unicode_org_lbp[] the line breaking property from the
943 LineBreak.txt file. */
945 fill_org_lbp (const char *linebreak_filename)
949 char field0[FIELDLEN];
950 char field1[FIELDLEN];
951 char field2[FIELDLEN];
954 for (i = 0; i < 0x110000; i++)
955 unicode_org_lbp[i] = LBP_XX;
957 stream = fopen (linebreak_filename, "r");
960 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
976 do c = getc (stream); while (c != EOF && c != '\n');
980 n = getfield (stream, field0, ';');
981 n += getfield (stream, field1, ' ');
982 n += getfield (stream, field2, '\n');
987 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
991 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
1020 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1021 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1022 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1025 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1026 field1, linebreak_filename, lineno);
1029 i = strtoul (field0, NULL, 16);
1030 if (strstr (field0, "..") != NULL)
1032 /* Deal with a range. */
1033 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1035 unicode_org_lbp[i] = value;
1039 /* Single character line. */
1040 unicode_org_lbp[i] = value;
1043 if (ferror (stream) || fclose (stream))
1045 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1050 /* Output the line breaking properties in a human readable format. */
1052 debug_output_org_lbp (FILE *stream)
1056 for (i = 0; i < 0x110000; i++)
1058 int attr = unicode_org_lbp[i];
1061 fprintf (stream, "0x%04X", i);
1062 #define PRINT_BIT(attr,bit) \
1063 if (attr == bit) fprintf (stream, " " #bit);
1064 PRINT_BIT(attr,LBP_BK);
1065 PRINT_BIT(attr,LBP_CM);
1066 PRINT_BIT(attr,LBP_ZW);
1067 PRINT_BIT(attr,LBP_IN);
1068 PRINT_BIT(attr,LBP_GL);
1069 PRINT_BIT(attr,LBP_CB);
1070 PRINT_BIT(attr,LBP_SP);
1071 PRINT_BIT(attr,LBP_BA);
1072 PRINT_BIT(attr,LBP_BB);
1073 PRINT_BIT(attr,LBP_B2);
1074 PRINT_BIT(attr,LBP_HY);
1075 PRINT_BIT(attr,LBP_NS);
1076 PRINT_BIT(attr,LBP_OP);
1077 PRINT_BIT(attr,LBP_CL);
1078 PRINT_BIT(attr,LBP_QU);
1079 PRINT_BIT(attr,LBP_EX);
1080 PRINT_BIT(attr,LBP_ID);
1081 PRINT_BIT(attr,LBP_NU);
1082 PRINT_BIT(attr,LBP_IS);
1083 PRINT_BIT(attr,LBP_SY);
1084 PRINT_BIT(attr,LBP_AL);
1085 PRINT_BIT(attr,LBP_PR);
1086 PRINT_BIT(attr,LBP_PO);
1087 PRINT_BIT(attr,LBP_SA);
1088 PRINT_BIT(attr,LBP_XX);
1089 PRINT_BIT(attr,LBP_AI);
1091 fprintf (stream, "\n");
/* Create (or truncate) the file FILENAME and write the debugging dump
   produced by debug_output_org_lbp into it.
   If the file cannot be opened, or if writing or closing it fails, a
   diagnostic is printed to stderr and the program exits with status 1.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* fclose is attempted only when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   The macros below configure the table code that output_lbp relies on.
   NOTE(review): presumably they are consumed by a table template that is
   included right after these definitions -- confirm.  */
/* Name prefix of the table; output_lbp calls lbp_table_init,
   lbp_table_add and lbp_table_finalize.  */
#define TABLE lbp_table
/* Type of one stored element; output_lbp reads the third level of the
   finished table as unsigned char.  */
#define ELEMENT unsigned char
/* NOTE(review): presumably the value that stands for code points that were
   never added to the table -- confirm against the table template.  */
#define DEFAULT LBP_XX
/* Map the allocator names onto the plain C library functions.  */
#define xmalloc malloc
#define xrealloc realloc
1126 output_lbp (FILE *stream1, FILE *stream2)
1130 unsigned int level1_offset, level2_offset, level3_offset;
1134 lbp_table_init (&t);
1136 for (i = 0; i < 0x110000; i++)
1138 int attr = get_lbp (i);
1140 /* Now attr should contain exactly one bit. */
1141 if (attr == 0 || ((attr & (attr - 1)) != 0))
1144 if (attr != 1 << LBP_XX)
1146 unsigned int log2_attr;
1147 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
1149 lbp_table_add (&t, i, log2_attr);
1153 lbp_table_finalize (&t);
1156 5 * sizeof (uint32_t);
1158 5 * sizeof (uint32_t)
1159 + t.level1_size * sizeof (uint32_t);
1161 5 * sizeof (uint32_t)
1162 + t.level1_size * sizeof (uint32_t)
1163 + (t.level2_size << t.q) * sizeof (uint32_t);
1165 for (i = 0; i < 5; i++)
1166 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
1167 ((uint32_t *) t.result)[i]);
1168 fprintf (stream1, "\n");
1169 fprintf (stream1, "typedef struct\n");
1170 fprintf (stream1, " {\n");
1171 fprintf (stream1, " int level1[%d];\n", t.level1_size);
1172 fprintf (stream1, " int level2[%d << %d];\n", t.level2_size, t.q);
1173 fprintf (stream1, " unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1174 fprintf (stream1, " }\n");
1175 fprintf (stream1, "lbrkprop_t;\n");
1176 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
1178 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
1179 fprintf (stream2, "{\n");
1180 fprintf (stream2, " {");
1181 for (i = 0; i < t.level1_size; i++)
1184 if (i > 0 && (i % 8) == 0)
1185 fprintf (stream2, "\n ");
1186 offset = ((uint32_t *) (t.result + level1_offset))[i];
1187 fprintf (stream2, " %5d%s",
1188 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1189 (i+1 < t.level1_size ? "," : ""));
1191 fprintf (stream2, " },\n");
1192 fprintf (stream2, " {");
1193 if (t.level2_size << t.q > 8)
1194 fprintf (stream2, "\n ");
1195 for (i = 0; i < t.level2_size << t.q; i++)
1198 if (i > 0 && (i % 8) == 0)
1199 fprintf (stream2, "\n ");
1200 offset = ((uint32_t *) (t.result + level2_offset))[i];
1201 fprintf (stream2, " %5d%s",
1202 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1203 (i+1 < t.level2_size << t.q ? "," : ""));
1205 if (t.level2_size << t.q > 8)
1206 fprintf (stream2, "\n ");
1207 fprintf (stream2, " },\n");
1208 fprintf (stream2, " {");
1209 if (t.level3_size << t.p > 8)
1210 fprintf (stream2, "\n ");
1211 for (i = 0; i < t.level3_size << t.p; i++)
1213 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1214 const char *value_string;
1217 #define CASE(x) case x: value_string = #x; break;
1248 if (i > 0 && (i % 8) == 0)
1249 fprintf (stream2, "\n ");
1250 fprintf (stream2, " %s%s", value_string,
1251 (i+1 < t.level3_size << t.p ? "," : ""));
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream2, "\n ");
1255 fprintf (stream2, " }\n");
1256 fprintf (stream2, "};\n");
/* Generate the two table files.
   FILENAME1 and FILENAME2 are created (or truncated).  Each one starts
   with a "generated automatically" banner that mentions VERSION (the
   Unicode version string) and with a license header; the tables
   themselves are then written by output_lbp, the declarations going to
   FILENAME1 and the data to FILENAME2.
   If a file cannot be opened, or if writing or closing it fails, a
   diagnostic is printed to stderr and the program exits with status 1.  */
static void
output_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  const char *filenames[2];
  FILE *streams[2];
  size_t i;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open both files before anything is written.  */
  for (i = 0; i < 2; i++)
    {
      streams[i] = fopen (filenames[i], "w");
      if (streams[i] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
          exit (1);
        }
    }

  /* Both files get the same banner and license header.  */
  for (i = 0; i < 2; i++)
    {
      FILE *out = streams[i];
      size_t k;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      for (k = 0; k < sizeof license_lines / sizeof license_lines[0]; k++)
        fputs (license_lines[k], out);
    }

  output_lbp (streams[0], streams[1]);

  for (i = 0; i < 2; i++)
    {
      /* fclose is attempted only when no write error has been recorded.  */
      if (ferror (streams[i]) != 0 || fclose (streams[i]) != 0)
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[i]);
          exit (1);
        }
    }
}
/* Program entry point.
   Expects exactly four arguments: the paths of UnicodeData.txt,
   EastAsianWidth.txt and LineBreak.txt, and the Unicode version string.
   Reads the three data files, writes the debugging dumps lbrkprop.txt and
   lbrkprop_org.txt, and generates lbrkprop1.h and lbrkprop2.h in the
   current directory.  Prints a usage message to stderr and exits with
   status 1 when the argument count is wrong.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *eastasianwidth_filename;
  const char *linebreak_filename;
  const char *version;

  if (argc != 5)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  eastasianwidth_filename = argv[2];
  linebreak_filename = argv[3];
  version = argv[4];

  /* Load the input data.  */
  fill_attributes (unicodedata_filename);
  fill_width (eastasianwidth_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable dumps of the computed and of the original properties.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated C tables.  */
  output_tables ("lbrkprop1.h", "lbrkprop2.h", version);

  return 0;
}