1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "linebreak.h"
30 #include "uniwidth/cjk.h"
/* Compare ENCODING against the name "UTF-8" by means of the STREQ macro
   (defined outside this view).  STREQ is handed the string literal as well
   as its individual characters, padded with zeros to nine arguments.
   The statements executed for either outcome are not visible here.  */
35 is_utf8_encoding (const char *encoding)
37 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
43 /* Determine the line break points in S, and store the result at p[0..n-1]. */
44 /* We don't support line breaking of complex-context dependent characters
45 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
47 /* Line breaking classification.
   The constants are not declared in numeric order.  LBP_BK is 0.  The values
   1..19 (LBP_ZW up to LBP_PO) are the ones that, after subtracting 1, are
   used as row and column index of lbrk_table.  The values 20..25 (LBP_CM,
   LBP_SP, LBP_CB, LBP_SA, LBP_AI, LBP_XX) lie outside that table.  */
51 /* Values >= 20 are resolved at run time. */
52 LBP_BK = 0, /* mandatory break */
53 /*LBP_CR, carriage return - not used here because it's a DOSism */
54 /*LBP_LF, line feed - not used here because it's a DOSism */
55 LBP_CM = 20, /* attached characters and combining marks */
56 /*LBP_SG, surrogates - not used here because they are not characters */
57 LBP_ZW = 1, /* zero width space */
58 LBP_IN = 2, /* inseparable */
59 LBP_GL = 3, /* non-breaking (glue) */
60 LBP_CB = 22, /* contingent break opportunity */
61 LBP_SP = 21, /* space */
62 LBP_BA = 4, /* break opportunity after */
63 LBP_BB = 5, /* break opportunity before */
64 LBP_B2 = 6, /* break opportunity before and after */
65 LBP_HY = 7, /* hyphen */
66 LBP_NS = 8, /* non starter */
67 LBP_OP = 9, /* opening punctuation */
68 LBP_CL = 10, /* closing punctuation */
69 LBP_QU = 11, /* ambiguous quotation */
70 LBP_EX = 12, /* exclamation/interrogation */
71 LBP_ID = 13, /* ideographic */
72 LBP_NU = 14, /* numeric */
73 LBP_IS = 15, /* infix separator (numeric) */
74 LBP_SY = 16, /* symbols allowing breaks */
75 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
76 LBP_PR = 18, /* prefix (numeric) */
77 LBP_PO = 19, /* postfix (numeric) */
78 LBP_SA = 23, /* complex context (South East Asian) */
79 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
80 LBP_XX = 25 /* unknown */
/* Return the line breaking property of the character UC by walking the
   three levels of the lbrkprop table.  The table and the lbrkprop_header_*
   constants are defined outside this view, and so is whatever happens when
   UC is out of range or a level has no entry for it.  */
85 static inline unsigned char
86 lbrkprop_lookup (unsigned int uc)
/* Level 1 is indexed by UC shifted right by lbrkprop_header_0; the index is
   only used when it is below lbrkprop_header_1.  */
88 unsigned int index1 = uc >> lbrkprop_header_0;
89 if (index1 < lbrkprop_header_1)
91 int lookup1 = lbrkprop.level1[index1];
/* Level 2: the level-1 entry is used as a base offset, to which a bit field
   of UC (shift by lbrkprop_header_2, mask with lbrkprop_header_3) is added.  */
94 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
95 int lookup2 = lbrkprop.level2[lookup1 + index2];
/* Level 3: the level-2 entry is again a base offset; UC masked with
   lbrkprop_header_4 selects the property value that is returned.  */
98 unsigned int index3 = uc & lbrkprop_header_4;
99 return lbrkprop.level3[lookup2 + index3];
106 /* Table indexed by two line breaking classifications.
   The first index is the class of the last non-space character, the second
   index the class of the current character, each minus 1: the lookups are
   written lbrk_table [last_prop-1] [prop-1].  Hence the 19 rows and columns
   correspond to the classes with values 1..19, LBP_ZW through LBP_PO, in
   numeric order.  */
107 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
108 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
109 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
110 static const unsigned char lbrk_table[19][19] = {
/* Column headings (second index); the label at the start of each row is the
   first index.  */
112 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
113 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
114 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
115 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
116 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
117 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
118 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
119 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
120 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
121 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
122 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
123 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
124 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
125 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
126 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
127 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
128 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
129 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
130 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
131 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* The two notes below refer to entries as (row, column).  */
135 /* Note: The (B2,B2) entry should probably be D instead of P. */
136 /* Note: The (PR,ID) entry should probably be D instead of I. */
/* Classify the positions of the UTF-8 string S (N bytes), storing
   UC_BREAK_* values in P.  P[0..N-1] is first filled with
   UC_BREAK_PROHIBITED.  ENCODING is passed to is_cjk_encoding to decide how
   the ambiguous class LBP_AI is resolved.  The loop structure, the case
   labels and the definition of q are on lines not visible here.  */
139 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
/* LBP_AI becomes LBP_ID for CJK encodings and LBP_AL otherwise.  */
141 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
142 const unsigned char *s_end = s + n;
143 int last_prop = LBP_BK; /* line break property of last non-space character */
144 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
145 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
147 /* Don't break inside multibyte characters. */
148 memset (p, UC_BREAK_PROHIBITED, n);
/* Decode one character from the remaining input and look up its class.  */
153 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
154 int prop = lbrkprop_lookup (uc);
158 /* Mandatory break. */
159 *p = UC_BREAK_MANDATORY;
168 /* Resolve property values whose behaviour is not fixed. */
172 /* Resolve ambiguous. */
173 prop = LBP_AI_REPLACEMENT;
176 /* This is arbitrary. */
180 /* We don't handle complex scripts yet.
181 Treat LBP_SA like LBP_XX. */
183 /* This is arbitrary. */
188 /* Deal with combining characters. */
192 /* Don't break just before a combining character. */
193 *p = UC_BREAK_PROHIBITED;
194 /* A combining character turns a preceding space into LBP_AL. */
195 if (seen_space != NULL)
198 seen_space = seen_space2;
200 goto lookup_via_table;
203 else if (prop == LBP_SP)
205 /* Don't break just before a space. */
206 *p = UC_BREAK_PROHIBITED;
207 seen_space2 = seen_space;
213 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The upper bound evaluates to 19, the number of rows of lbrk_table.  prop
   is an int and the bound a size_t; because prop >= 1 is tested first, a
   negative prop never reaches the unsigned comparison.  */
214 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
217 if (last_prop == LBP_BK)
219 /* Don't break at the beginning of a line. */
220 *q = UC_BREAK_PROHIBITED;
/* The three assignments below belong to the cases of this switch; the
   labels are not visible here (presumably D, I and P, in this order).  */
224 switch (lbrk_table [last_prop-1] [prop-1])
227 *q = UC_BREAK_POSSIBLE;
/* Here a break is allowed only if a space was seen.  */
230 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
233 *q = UC_BREAK_PROHIBITED;
/* Classify the positions of the UTF-16 string S (N units), storing
   UC_BREAK_* values in P.  P[0..N-1] is first filled with
   UC_BREAK_PROHIBITED.  ENCODING is passed to is_cjk_encoding to decide how
   the ambiguous class LBP_AI is resolved.  The loop structure, the case
   labels and the definition of q are on lines not visible here.  */
251 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
/* LBP_AI becomes LBP_ID for CJK encodings and LBP_AL otherwise.  */
253 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
254 const unsigned short *s_end = s + n;
255 int last_prop = LBP_BK; /* line break property of last non-space character */
256 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
257 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
259 /* Don't break inside multibyte characters. */
260 memset (p, UC_BREAK_PROHIBITED, n);
/* Decode one character from the remaining input and look up its class.  */
265 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
266 int prop = lbrkprop_lookup (uc);
270 /* Mandatory break. */
271 *p = UC_BREAK_MANDATORY;
280 /* Resolve property values whose behaviour is not fixed. */
284 /* Resolve ambiguous. */
285 prop = LBP_AI_REPLACEMENT;
288 /* This is arbitrary. */
292 /* We don't handle complex scripts yet.
293 Treat LBP_SA like LBP_XX. */
295 /* This is arbitrary. */
300 /* Deal with combining characters. */
304 /* Don't break just before a combining character. */
305 *p = UC_BREAK_PROHIBITED;
306 /* A combining character turns a preceding space into LBP_AL. */
307 if (seen_space != NULL)
310 seen_space = seen_space2;
312 goto lookup_via_table;
315 else if (prop == LBP_SP)
317 /* Don't break just before a space. */
318 *p = UC_BREAK_PROHIBITED;
319 seen_space2 = seen_space;
325 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The upper bound evaluates to 19, the number of rows of lbrk_table.  prop
   is an int and the bound a size_t; because prop >= 1 is tested first, a
   negative prop never reaches the unsigned comparison.  */
326 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
329 if (last_prop == LBP_BK)
331 /* Don't break at the beginning of a line. */
332 *q = UC_BREAK_PROHIBITED;
/* The three assignments below belong to the cases of this switch; the
   labels are not visible here (presumably D, I and P, in this order).  */
336 switch (lbrk_table [last_prop-1] [prop-1])
339 *q = UC_BREAK_POSSIBLE;
/* Here a break is allowed only if a space was seen.  */
342 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
345 *q = UC_BREAK_PROHIBITED;
/* Classify the positions of the UTF-32 string S (N units), storing
   UC_BREAK_* values in P.  Each unit of S is taken as one character.
   ENCODING is passed to is_cjk_encoding to decide how the ambiguous class
   LBP_AI is resolved.  The loop structure, the case labels and the
   definition of q are on lines not visible here.  */
363 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
/* LBP_AI becomes LBP_ID for CJK encodings and LBP_AL otherwise.  */
365 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
366 const unsigned int *s_end = s + n;
367 int last_prop = LBP_BK; /* line break property of last non-space character */
368 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
369 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
/* No decoding step is needed: the current unit is the character.  */
373 unsigned int uc = *s;
374 int prop = lbrkprop_lookup (uc);
378 /* Mandatory break. */
379 *p = UC_BREAK_MANDATORY;
388 /* Resolve property values whose behaviour is not fixed. */
392 /* Resolve ambiguous. */
393 prop = LBP_AI_REPLACEMENT;
396 /* This is arbitrary. */
400 /* We don't handle complex scripts yet.
401 Treat LBP_SA like LBP_XX. */
403 /* This is arbitrary. */
408 /* Deal with combining characters. */
412 /* Don't break just before a combining character. */
413 *p = UC_BREAK_PROHIBITED;
414 /* A combining character turns a preceding space into LBP_AL. */
415 if (seen_space != NULL)
418 seen_space = seen_space2;
420 goto lookup_via_table;
423 else if (prop == LBP_SP)
425 /* Don't break just before a space. */
426 *p = UC_BREAK_PROHIBITED;
427 seen_space2 = seen_space;
433 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The upper bound evaluates to 19, the number of rows of lbrk_table.  prop
   is an int and the bound a size_t; because prop >= 1 is tested first, a
   negative prop never reaches the unsigned comparison.  */
434 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
437 if (last_prop == LBP_BK)
439 /* Don't break at the beginning of a line. */
440 *q = UC_BREAK_PROHIBITED;
/* The three assignments below belong to the cases of this switch; the
   labels are not visible here (presumably D, I and P, in this order).  */
444 switch (lbrk_table [last_prop-1] [prop-1])
447 *q = UC_BREAK_POSSIBLE;
/* Here a break is allowed only if a space was seen.  */
450 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
453 *q = UC_BREAK_PROHIBITED;
471 /* Choose the best line breaks, assuming the uc_width function.
472 Return the column after the end of the string.
   S, N: the UTF-8 string and its length in bytes.
   WIDTH: the limit that last_column + piece_width is compared against.
   START_COLUMN: initial value of last_column.
   AT_END_COLUMNS: extra columns added only in the check for the last piece.
   O: optional overrides (may be NULL); entries equal to UC_BREAK_UNDEFINED
      are not treated as overrides.
   ENCODING: passed on to u8_possible_linebreaks and uc_width.
   The result array p is the last parameter; its declaration is on a line
   not visible here.  */
475 u8_width_linebreaks (const unsigned char *s, size_t n,
476 int width, int start_column, int at_end_columns,
477 const char *o, const char *encoding,
480 const unsigned char *s_end;
/* Start from the break opportunities, then decide which ones to use.  */
485 u8_possible_linebreaks (s, n, encoding, p);
489 last_column = start_column;
494 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
496 /* Respect the override. */
497 if (o != NULL && *o != UC_BREAK_UNDEFINED)
500 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
502 /* An atomic piece of text ends here. */
503 if (last_p != NULL && last_column + piece_width > width)
505 /* Insert a line break. */
506 *last_p = UC_BREAK_POSSIBLE;
511 if (*p == UC_BREAK_MANDATORY)
513 /* uc is a line break character. */
514 /* Start a new piece at column 0. */
521 /* uc is not a line break character. */
524 if (*p == UC_BREAK_POSSIBLE)
526 /* Start a new piece. */
528 last_column += piece_width;
530 /* No line break for the moment, may be turned into
531 UC_BREAK_POSSIBLE later, via last_p. */
534 *p = UC_BREAK_PROHIBITED;
536 w = uc_width (uc, encoding);
537 if (w >= 0) /* ignore control characters in the string */
547 /* The last atomic piece of text ends here. */
548 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
550 /* Insert a line break. */
551 *last_p = UC_BREAK_POSSIBLE;
/* at_end_columns is not part of the returned column.  */
555 return last_column + piece_width;
/* UTF-16 counterpart of the width-based line breaking: S has N units.
   WIDTH is the limit that last_column + piece_width is compared against,
   START_COLUMN the initial value of last_column, AT_END_COLUMNS an extra
   amount added only in the check for the last piece.  O holds optional
   overrides (may be NULL); entries equal to UC_BREAK_UNDEFINED are not
   treated as overrides.  The result array p is the last parameter; its
   declaration is on a line not visible here.  Returns
   last_column + piece_width.  */
559 u16_width_linebreaks (const unsigned short *s, size_t n,
560 int width, int start_column, int at_end_columns,
561 const char *o, const char *encoding,
564 const unsigned short *s_end;
/* Start from the break opportunities, then decide which ones to use.  */
569 u16_possible_linebreaks (s, n, encoding, p);
573 last_column = start_column;
578 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
580 /* Respect the override. */
581 if (o != NULL && *o != UC_BREAK_UNDEFINED)
584 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
586 /* An atomic piece of text ends here. */
587 if (last_p != NULL && last_column + piece_width > width)
589 /* Insert a line break. */
590 *last_p = UC_BREAK_POSSIBLE;
595 if (*p == UC_BREAK_MANDATORY)
597 /* uc is a line break character. */
598 /* Start a new piece at column 0. */
605 /* uc is not a line break character. */
608 if (*p == UC_BREAK_POSSIBLE)
610 /* Start a new piece. */
612 last_column += piece_width;
614 /* No line break for the moment, may be turned into
615 UC_BREAK_POSSIBLE later, via last_p. */
618 *p = UC_BREAK_PROHIBITED;
620 w = uc_width (uc, encoding);
621 if (w >= 0) /* ignore control characters in the string */
631 /* The last atomic piece of text ends here. */
632 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
634 /* Insert a line break. */
635 *last_p = UC_BREAK_POSSIBLE;
/* at_end_columns is not part of the returned column.  */
639 return last_column + piece_width;
/* UTF-32 counterpart of the width-based line breaking: S has N units, each
   taken as one character.  WIDTH is the limit that
   last_column + piece_width is compared against, START_COLUMN the initial
   value of last_column, AT_END_COLUMNS an extra amount added only in the
   check for the last piece.  O holds optional overrides (may be NULL);
   entries equal to UC_BREAK_UNDEFINED are not treated as overrides.  The
   result array p is the last parameter; its declaration is on a line not
   visible here.  Returns last_column + piece_width.  */
643 u32_width_linebreaks (const unsigned int *s, size_t n,
644 int width, int start_column, int at_end_columns,
645 const char *o, const char *encoding,
648 const unsigned int *s_end;
/* Start from the break opportunities, then decide which ones to use.  */
653 u32_possible_linebreaks (s, n, encoding, p);
657 last_column = start_column;
661 unsigned int uc = *s;
663 /* Respect the override. */
664 if (o != NULL && *o != UC_BREAK_UNDEFINED)
667 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
669 /* An atomic piece of text ends here. */
670 if (last_p != NULL && last_column + piece_width > width)
672 /* Insert a line break. */
673 *last_p = UC_BREAK_POSSIBLE;
678 if (*p == UC_BREAK_MANDATORY)
680 /* uc is a line break character. */
681 /* Start a new piece at column 0. */
688 /* uc is not a line break character. */
691 if (*p == UC_BREAK_POSSIBLE)
693 /* Start a new piece. */
695 last_column += piece_width;
697 /* No line break for the moment, may be turned into
698 UC_BREAK_POSSIBLE later, via last_p. */
701 *p = UC_BREAK_PROHIBITED;
703 w = uc_width (uc, encoding);
704 if (w >= 0) /* ignore control characters in the string */
714 /* The last atomic piece of text ends here. */
715 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
717 /* Insert a line break. */
718 *last_p = UC_BREAK_POSSIBLE;
/* at_end_columns is not part of the returned column.  */
722 return last_column + piece_width;
730 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* Reads STREAM in chunks of at most BUFSIZE bytes into a growing heap
   buffer.  The declarations, the error branches and the return statement
   are on lines not visible here.  */
733 read_file (FILE *stream)
741 while (! feof (stream))
/* Make room for one more chunk: grow by half, but at least to
   size + BUFSIZE.  */
743 if (size + BUFSIZE > alloc)
745 alloc = alloc + alloc / 2;
746 if (alloc < size + BUFSIZE)
747 alloc = size + BUFSIZE;
/* NOTE(review): the result overwrites buf directly; what follows the error
   message on failure is not visible here - confirm that it terminates.  */
748 buf = realloc (buf, alloc);
751 fprintf (stderr, "out of memory\n");
755 count = fread (buf + size, 1, BUFSIZE, stream);
/* Final resize to size + 1 bytes, i.e. one byte more than the data read.  */
767 buf = realloc (buf, size + 1);
770 fprintf (stderr, "out of memory\n");
/* Test driver working on UTF-8 input read from stdin.  Two modes are
   visible; the condition selecting between them is not.  */
779 main (int argc, char * argv[])
783 /* Display all the break opportunities in the input string. */
784 char *input = read_file (stdin);
/* NOTE(review): strlen returns a size_t that is stored in an int, and no
   check of the malloc result is visible here.  */
785 int length = strlen (input);
786 char *breaks = malloc (length);
789 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
/* Emit a marker according to breaks[i], then the input byte itself.  */
791 for (i = 0; i < length; i++)
795 case UC_BREAK_POSSIBLE:
796 /* U+2027 in UTF-8 encoding */
797 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
799 case UC_BREAK_MANDATORY:
800 /* U+21B2 in UTF-8 encoding (U+21B5 would be E2 86 B5 instead) */
801 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
803 case UC_BREAK_PROHIBITED:
808 putc (input[i], stdout);
817 /* Insert line breaks for a given width. */
/* NOTE(review): atoi cannot report a malformed number, and no guard on the
   presence of argv[1] is visible here.  */
818 int width = atoi (argv[1]);
819 char *input = read_file (stdin);
820 int length = strlen (input);
821 char *breaks = malloc (length);
/* Start column 0, no at-end columns, no overrides.  */
824 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
826 for (i = 0; i < length; i++)
830 case UC_BREAK_POSSIBLE:
833 case UC_BREAK_MANDATORY:
835 case UC_BREAK_PROHIBITED:
840 putc (input[i], stdout);
854 /* Now the same thing with an arbitrary encoding.
856 We convert the input string to Unicode.
858 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
859 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
860 \U0000FFFF. UTF-16 and variants support only characters up to
861 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
862 UCS-4 specification leaves doubts about endianness and byte order mark.
863 glibc currently interprets it as big endian without byte order mark,
864 but this is not backed by an RFC. So we use UTF-8. It supports
865 characters up to \U7FFFFFFF and is unambiguously defined. */
872 /* Luckily, the encoding's name is platform independent. */
873 #define UTF8_NAME "UTF-8"
875 /* Return the length of a string after conversion through an iconv_t.
   The string S (N bytes) is converted through CD into a scratch buffer of
   TMPBUFSIZE bytes and only the number of output bytes is accumulated in
   count.  The loop structure and the return statements are on lines not
   visible here; callers in this file compare the result with
   (size_t)(-1).  */
877 iconv_string_length (iconv_t cd, const char *s, size_t n)
879 #define TMPBUFSIZE 4096
881 char tmpbuf[TMPBUFSIZE];
882 const char *inptr = s;
/* Each conversion step writes from the start of the scratch buffer.  */
886 char *outptr = tmpbuf;
887 size_t outsize = TMPBUFSIZE;
888 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* A failure with errno == E2BIG is not handled as an error here.  */
889 if (res == (size_t)(-1) && errno != E2BIG)
891 count += outptr - tmpbuf;
893 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
894 #if defined _LIBICONV_VERSION \
895 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* A call without input; whatever it writes is counted as well.  */
897 char *outptr = tmpbuf;
898 size_t outsize = TMPBUFSIZE;
899 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
900 if (res == (size_t)(-1))
902 count += outptr - tmpbuf;
904 /* Return to the initial state. */
905 iconv (cd, NULL, NULL, NULL, NULL);
/* Convert the string S (N bytes) through CD into T, which has room for M
   bytes, and record in OFFTABLE where each converted piece of input starts
   in the output: OFFTABLE[i] is the output offset for an input offset i at
   which a conversion step began, and (size_t)(-1) for all other i.
   The declarations and the reactions to the error checks are on lines not
   visible here.  */
912 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
913 size_t *offtable, char *t, size_t m)
920 /* Avoid glibc-2.1 bug. */
921 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
922 const size_t extra = 1;
924 const size_t extra = 0;
/* Mark every input offset as "no conversion step starts here".  */
927 for (i = 0; i < n; i++)
928 offtable[i] = (size_t)(-1);
934 while (inptr < s_end)
936 const char *saved_inptr;
940 offtable[inptr - s] = outptr - t;
/* Offer iconv 1, 2, ... input bytes until the result is no longer a
   failure with errno == EINVAL.  */
944 for (insize = 1; inptr + insize <= s_end; insize++)
946 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
947 if (!(res == (size_t)(-1) && errno == EINVAL))
949 /* We expect that no input bytes have been consumed so far. */
950 if (inptr != saved_inptr)
953 /* After we verified the convertibility and computed the translation's
954 size m, there shouldn't be any conversion error here. */
955 if (res == (size_t)(-1))
958 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug (same guard as in
   iconv_string_length). */
959 #if defined _LIBICONV_VERSION \
960 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
961 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
964 /* We should have produced exactly m output bytes. */
/* The remaining output space is expected to equal extra (0 or 1, see the
   top of the function).  */
965 if (outsize != extra)
969 #endif /* HAVE_ICONV */
973 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
974 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.
   Each of the N bytes of S is tested with c_isprint and c_isspace; a byte
   for which neither holds makes the test fail.  The return statements are
   on lines not visible here.  */
976 is_all_ascii (const char *s, size_t n)
978 for (; n > 0; s++, n--)
/* Work on the byte as an unsigned value.  */
980 unsigned char c = (unsigned char) *s;
982 if (!(c_isprint (c) || c_isspace (c)))
988 #endif /* C_CTYPE_ASCII */
991 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
996 if (is_utf8_encoding (encoding))
997 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1002 /* Avoid glibc-2.1 bug with EUC-KR. */
1003 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1004 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1005 to_utf8 = (iconv_t)(-1);
1008 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1010 # if defined __sun && !defined _LIBICONV_VERSION
1011 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1012 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1013 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1014 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1015 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1016 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1017 to_utf8 = (iconv_t)(-1);
1020 to_utf8 = iconv_open (UTF8_NAME, encoding);
1021 if (to_utf8 != (iconv_t)(-1))
1023 /* Determine the length of the resulting UTF-8 string. */
1024 size_t m = iconv_string_length (to_utf8, s, n);
1025 if (m != (size_t)(-1))
1027 /* Convert the string to UTF-8 and build a translation table
1028 from offsets into s to offsets into the translated string. */
1029 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1031 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1034 size_t *offtable = (size_t *) memory;
1035 char *t = (char *) (offtable + n);
1036 char *q = (char *) (t + m);
1039 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1041 /* Determine the possible line breaks of the UTF-8 string. */
1042 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1044 /* Translate the result back to the original string. */
1045 memset (p, UC_BREAK_PROHIBITED, n);
1046 for (i = 0; i < n; i++)
1047 if (offtable[i] != (size_t)(-1))
1048 p[i] = q[offtable[i]];
1051 iconv_close (to_utf8);
1055 iconv_close (to_utf8);
1058 /* Impossible to convert. */
1060 if (is_all_ascii (s, n))
1062 /* ASCII is a subset of UTF-8. */
1063 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1067 /* We have a non-ASCII string and cannot convert it.
1068 Don't produce line breaks except those already present in the
1069 input string. All we assume here is that the encoding is
1070 minimally ASCII compatible. */
1072 const char *s_end = s + n;
1075 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1084 mbs_width_linebreaks (const char *s, size_t n,
1085 int width, int start_column, int at_end_columns,
1086 const char *o, const char *encoding,
1090 return start_column;
1091 if (is_utf8_encoding (encoding))
1092 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1097 /* Avoid glibc-2.1 bug with EUC-KR. */
1098 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1099 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1100 to_utf8 = (iconv_t)(-1);
1103 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1105 # if defined __sun && !defined _LIBICONV_VERSION
1106 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1107 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1108 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1109 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1110 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1111 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1112 to_utf8 = (iconv_t)(-1);
1115 to_utf8 = iconv_open (UTF8_NAME, encoding);
1116 if (to_utf8 != (iconv_t)(-1))
1118 /* Determine the length of the resulting UTF-8 string. */
1119 size_t m = iconv_string_length (to_utf8, s, n);
1120 if (m != (size_t)(-1))
1122 /* Convert the string to UTF-8 and build a translation table
1123 from offsets into s to offsets into the translated string. */
1124 size_t memory_size =
1125 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1126 (o != NULL ? m : 0));
1129 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1132 size_t *offtable = (size_t *) memory;
1133 char *t = (char *) (offtable + n);
1134 char *q = (char *) (t + m);
1135 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1139 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1141 /* Translate the overrides to the UTF-8 string. */
1144 memset (o8, UC_BREAK_UNDEFINED, m);
1145 for (i = 0; i < n; i++)
1146 if (offtable[i] != (size_t)(-1))
1147 o8[offtable[i]] = o[i];
1150 /* Determine the line breaks of the UTF-8 string. */
1152 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1154 /* Translate the result back to the original string. */
1155 memset (p, UC_BREAK_PROHIBITED, n);
1156 for (i = 0; i < n; i++)
1157 if (offtable[i] != (size_t)(-1))
1158 p[i] = q[offtable[i]];
1161 iconv_close (to_utf8);
1165 iconv_close (to_utf8);
1168 /* Impossible to convert. */
1170 if (is_all_ascii (s, n))
1172 /* ASCII is a subset of UTF-8. */
1173 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1176 /* We have a non-ASCII string and cannot convert it.
1177 Don't produce line breaks except those already present in the
1178 input string. All we assume here is that the encoding is
1179 minimally ASCII compatible. */
1181 const char *s_end = s + n;
1184 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1185 ? UC_BREAK_MANDATORY
1186 : UC_BREAK_PROHIBITED);
1192 /* We cannot compute widths in this case. */
1193 return start_column;
1204 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* Reads STREAM in chunks of at most BUFSIZE bytes into a growing heap
   buffer.  The declarations, the error branches and the return statement
   are on lines not visible here.  */
1207 read_file (FILE *stream)
1209 #define BUFSIZE 4096
1215 while (! feof (stream))
/* Make room for one more chunk: grow by half, but at least to
   size + BUFSIZE.  */
1217 if (size + BUFSIZE > alloc)
1219 alloc = alloc + alloc / 2;
1220 if (alloc < size + BUFSIZE)
1221 alloc = size + BUFSIZE;
/* NOTE(review): the result overwrites buf directly; what follows the error
   message on failure is not visible here - confirm that it terminates.  */
1222 buf = realloc (buf, alloc);
1225 fprintf (stderr, "out of memory\n");
1229 count = fread (buf + size, 1, BUFSIZE, stream);
/* Read errors are detected through ferror after the fread call.  */
1232 if (ferror (stream))
/* Final resize to size + 1 bytes, i.e. one byte more than the data read.  */
1241 buf = realloc (buf, size + 1);
1244 fprintf (stderr, "out of memory\n");
/* Test driver working on stdin input in the locale's character set, as
   reported by locale_charset ().  Two modes are visible; the condition
   selecting between them is not.  */
1253 main (int argc, char * argv[])
/* Adopt the character classification of the user's locale.  */
1255 setlocale (LC_CTYPE, "");
1258 /* Display all the break opportunities in the input string. */
1259 char *input = read_file (stdin);
/* NOTE(review): strlen returns a size_t that is stored in an int, and no
   check of the malloc result is visible here.  */
1260 int length = strlen (input);
1261 char *breaks = malloc (length);
1264 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
/* Handle breaks[i] by case, then emit the input byte itself.  */
1266 for (i = 0; i < length; i++)
1270 case UC_BREAK_POSSIBLE:
1273 case UC_BREAK_MANDATORY:
1275 case UC_BREAK_PROHIBITED:
1280 putc (input[i], stdout);
1289 /* Insert line breaks for a given width. */
/* NOTE(review): atoi cannot report a malformed number, and no guard on the
   presence of argv[1] is visible here.  */
1290 int width = atoi (argv[1]);
1291 char *input = read_file (stdin);
1292 int length = strlen (input);
1293 char *breaks = malloc (length);
/* Start column 0, no at-end columns, no overrides.  */
1296 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1298 for (i = 0; i < length; i++)
1302 case UC_BREAK_POSSIBLE:
/* A chosen break is rendered as a newline in the output.  */
1303 putc ('\n', stdout);
1305 case UC_BREAK_MANDATORY:
1307 case UC_BREAK_PROHIBITED:
1312 putc (input[i], stdout);