1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "linebreak.h"
29 #include "utf8-ucs4.h"
31 #include "utf16-ucs4.h"
35 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
43 /* Help GCC to generate good code for string comparisons with
/* Optimized STREQ, used only when both __GNUC__ and __OPTIMIZE__ are
   defined.  STREQ expands to streq0, the head of a chain
   streq0 -> streq1 -> ... -> streq9; every link passes s1, s2 and the
   character arguments it has not consumed on to the next link.
   NOTE(review): the per-character tests inside streq0..streq8 and the
   conditional's else/endif lines are not visible in this listing, so only
   the call chain is described here.  */
45 #if defined (__GNUC__) && defined (__OPTIMIZE__)
/* Last link: compares whatever follows offset 9 in both strings with
   strcmp and yields nonzero when the tails are equal.  */
48 streq9 (const char *s1, const char *s2)
50 return strcmp (s1 + 9, s2 + 9) == 0;
/* streq8 .. streq0: parameter s2M presumably is the character of s2 at
   index M (s28 -> index 8, ..., s20 -> index 0) -- TODO confirm against the
   comparison lines that are not visible here.  */
54 streq8 (const char *s1, const char *s2, char s28)
61 return streq9 (s1, s2);
68 streq7 (const char *s1, const char *s2, char s27, char s28)
75 return streq8 (s1, s2, s28);
82 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
89 return streq7 (s1, s2, s27, s28);
96 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
103 return streq6 (s1, s2, s26, s27, s28);
110 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
117 return streq5 (s1, s2, s25, s26, s27, s28);
124 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
131 return streq4 (s1, s2, s24, s25, s26, s27, s28);
138 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
145 return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
152 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
159 return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
166 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
173 return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
/* First definition: STREQ forwards all eleven arguments to streq0.  */
179 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
180 streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
/* Second definition (presumably the fallback branch of the conditional):
   a plain strcmp equality test; the nine character arguments are ignored.  */
184 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
185 (strcmp (s1, s2) == 0)
/* Tests ENCODING against the legacy CJK encoding names listed below.  Each
   candidate is compared with STREQ, which receives the literal together
   with its first nine characters spelled out one by one (padded with 0).
   With the strcmp-based STREQ the match is exact and case-sensitive.
   NOTE(review): the opening of the condition and the return statements are
   on lines not visible in this listing; presumably the result is nonzero
   when any name matches.  */
191 is_cjk_encoding (const char *encoding)
194 /* Legacy Japanese encodings */
195 || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
196 /* Legacy Chinese encodings */
197 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
198 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
199 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
200 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
201 /* Legacy Korean encodings */
202 || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
203 || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
204 || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
/* Compares ENCODING with the literal "UTF-8" through STREQ.
   NOTE(review): the statements executed on match and on mismatch are not
   visible in this listing; presumably nonzero is returned on a match.  */
210 is_utf8_encoding (const char *encoding)
212 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
218 /* Determine number of column positions required for UC. */
/* Declaration placed ahead of the definition that appears later in this
   file.  UC is a single code point; ENCODING is an encoding name which the
   definition passes to is_cjk_encoding.  */
219 int uc_width (unsigned int uc, const char *encoding);
222 * Non-spacing attribute table.
224 * - Non-spacing characters; generated from PropList.txt or
225 * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
226 * - Format control characters; generated from
227 * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
228 * - Zero width characters; generated from
229 * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
/* Bitmap of non-spacing code points, stored as 16 pages of 64 bytes.
   According to the range comments every row of 8 bytes covers 64 code
   points, so a page covers 512 code points and every bit stands for one
   code point.  uc_width reads it as
     nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)
   with ind taken from nonspacing_table_ind.  The pages are not contiguous
   in code point space; the jumps are marked below.  */
231 static const unsigned char nonspacing_table_data[16*64] = {
233 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
234 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
235 0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
236 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
237 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
238 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
239 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
240 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
242 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
243 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
244 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
245 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
246 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
247 0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
248 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
249 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
251 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
252 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
253 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
254 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
255 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
256 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
257 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
258 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
260 0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
261 0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
262 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
263 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
264 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
265 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
266 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
267 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
269 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
270 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
271 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
272 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
273 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
274 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
275 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
276 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
278 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
279 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
280 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
281 0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
282 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
283 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
284 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
285 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
287 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
288 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
289 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
290 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
291 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
292 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
293 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
294 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
296 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
297 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
298 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
299 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
300 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
301 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
302 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
303 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
305 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
306 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
307 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
308 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
309 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
310 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
311 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
312 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
/* Jump: the next page starts at 0x1600 (0x1200-0x15ff is not stored).  */
314 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
315 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
316 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
317 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
318 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
319 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
320 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
321 0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
323 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
324 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
325 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
326 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
327 0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
328 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
329 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
330 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
/* Jump: the next page starts at 0x2000.  */
332 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
333 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
334 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
335 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
336 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
337 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
338 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
339 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
/* Jump: the next page starts at 0x3000.  */
341 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
342 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
343 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
344 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
345 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
346 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
347 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
348 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
/* Jump: the next page starts at 0xfa00.  */
350 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
351 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
352 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
353 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
354 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
355 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
356 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
357 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
/* Jump: the next page starts at 0xfe00.  */
359 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
360 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
361 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
362 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
363 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
365 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
366 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
367 /* 0x1d000-0x1d1ff */
368 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
369 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
370 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
371 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
373 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
374 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
375 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */
/* Page index for nonspacing_table_data.  uc_width indexes this array with
   uc >> 9, i.e. one entry per 512 code points; each row below holds 8
   entries and therefore spans 0x1000 code points, and the 240 entries
   together span 0x0000-0x1dfff.  Entries 0..15 are page numbers inside
   nonspacing_table_data; -1 presumably means that no bitmap page is stored
   for that range (the test for it is not visible in this listing).  */
377 static const signed char nonspacing_table_ind[240] = {
378 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */
379 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */
380 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
381 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
382 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
383 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
384 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
385 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
386 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
387 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
388 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
389 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
390 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
391 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
392 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
393 -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
394 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
395 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
396 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
397 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
398 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
399 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
400 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
401 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
402 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
403 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
404 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
405 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
406 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
407 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */
410 /* Determine number of column positions required for UC. */
/* ENCODING is an encoding name; it is consulted only in the last test,
   through is_cjk_encoding.  Callers in this file keep the result only when
   it is >= 0 ("ignore control characters"), so a negative value presumably
   flags a control character -- the return statements themselves are on
   lines not visible in this listing.  */
412 uc_width (unsigned int uc, const char *encoding)
414 /* Test for non-spacing or control character. */
/* uc >> 9 is the number of the 512-code-point page that contains uc;
   nonspacing_table_ind turns it into a page number inside
   nonspacing_table_data.  NOTE(review): that index table has 240 entries,
   some of them -1; the range check on uc >> 9 and the test for a
   non-negative ind are presumably on lines not visible here -- confirm.  */
417 int ind = nonspacing_table_ind[uc >> 9];
/* A page is 64 bytes: (uc >> 3) & 63 selects the byte inside the page and
   uc & 7 the bit that belongs to uc.  */
419 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
/* Among the non-spacing hits, code points 1..0x9f are singled out
   (presumably the control characters; the values returned are not
   visible).  */
421 if (uc > 0 && uc < 0xa0)
/* 0xe0000 >> 9 is 1792, beyond the 240 entries of the index table, so this
   page is handled separately; the condition on the following line accepts
   U+E0001 and U+E0020..U+E007F.  */
427 else if ((uc >> 9) == (0xe0000 >> 9))
430 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
434 /* Test for double-width character.
435 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
436 * and "grep '^....;[^WF]' EastAsianWidth.txt"
439 && ((uc < 0x1160) /* Hangul Jamo */
440 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
442 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
443 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
444 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
445 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
446 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
447 || (uc >= 0xffe0 && uc < 0xffe7)
448 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
449 || (uc >= 0x30000 && uc <= 0x3fffd)
452 /* In ancient CJK encodings, Cyrillic and most other characters are
453 double-width as well. */
/* The rule covers U+00A1..U+FF60 with the single exception U+20A9, and
   applies only when is_cjk_encoding accepts ENCODING.  */
454 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
455 && is_cjk_encoding (encoding))
461 /* Determine number of column positions required for first N units
462 (or fewer if S ends before this) in S. */
/* S points to N bytes that are decoded with u8_mbtouc; ENCODING is handed
   unchanged to uc_width.  NOTE(review): the accumulator and the return
   statement are on lines not visible in this listing; presumably the
   non-negative widths are summed.  */
465 u8_width (const unsigned char *s, size_t n, const char *encoding)
/* One past the last unit that may be examined.  */
467 const unsigned char *s_end = s + n;
/* Decode one character into uc and step s over the units it occupied.  */
475 s += u8_mbtouc (&uc, s, s_end - s);
478 break; /* end of string reached */
480 w = uc_width (uc, encoding);
481 if (w >= 0) /* ignore control characters in the string */
/* Counterpart of u8_width for 16-bit units: S points to N units that are
   decoded with u16_mbtouc; ENCODING is handed unchanged to uc_width.
   NOTE(review): the accumulator and the return statement are on lines not
   visible in this listing; presumably the non-negative widths are summed.  */
489 u16_width (const unsigned short *s, size_t n, const char *encoding)
/* One past the last unit that may be examined.  */
491 const unsigned short *s_end = s + n;
/* Decode one character into uc and step s over the units it occupied.  */
499 s += u16_mbtouc (&uc, s, s_end - s);
502 break; /* end of string reached */
504 w = uc_width (uc, encoding);
505 if (w >= 0) /* ignore control characters in the string */
/* Counterpart of u8_width for 32-bit units: every unit of S is taken as one
   character, so no decoder is called.  ENCODING is handed unchanged to
   uc_width.  NOTE(review): the accumulator and the return statement are on
   lines not visible in this listing; presumably the non-negative widths are
   summed.  */
513 u32_width (const unsigned int *s, size_t n, const char *encoding)
/* One past the last unit that may be examined.  */
515 const unsigned int *s_end = s + n;
/* Fetch the next character and advance by exactly one unit.  */
520 unsigned int uc = *s++;
524 break; /* end of string reached */
526 w = uc_width (uc, encoding);
527 if (w >= 0) /* ignore control characters in the string */
535 /* Determine the line break points in S, and store the result at p[0..n-1]. */
536 /* We don't support line breaking of complex-context dependent characters
537 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
539 /* Line breaking classification. */
/* Numbering scheme: LBP_BK is 0; the classes numbered 1..19 (LBP_ZW up to
   LBP_PO) are used, minus one, as row and column indices of lbrk_table,
   and their numeric order equals the column order written above that
   table.  The classes numbered 20 and above (CM, SP, CB, SA, AI, XX) have
   no table entry.  The enumerators are listed here in a different order
   than their numeric values.  */
543 /* Values >= 20 are resolved at run time. */
544 LBP_BK = 0, /* mandatory break */
545 /*LBP_CR, carriage return - not used here because it's a DOSism */
546 /*LBP_LF, line feed - not used here because it's a DOSism */
547 LBP_CM = 20, /* attached characters and combining marks */
548 /*LBP_SG, surrogates - not used here because they are not characters */
549 LBP_ZW = 1, /* zero width space */
550 LBP_IN = 2, /* inseparable */
551 LBP_GL = 3, /* non-breaking (glue) */
552 LBP_CB = 22, /* contingent break opportunity */
553 LBP_SP = 21, /* space */
554 LBP_BA = 4, /* break opportunity after */
555 LBP_BB = 5, /* break opportunity before */
556 LBP_B2 = 6, /* break opportunity before and after */
557 LBP_HY = 7, /* hyphen */
558 LBP_NS = 8, /* non starter */
559 LBP_OP = 9, /* opening punctuation */
560 LBP_CL = 10, /* closing punctuation */
561 LBP_QU = 11, /* ambiguous quotation */
562 LBP_EX = 12, /* exclamation/interrogation */
563 LBP_ID = 13, /* ideographic */
564 LBP_NU = 14, /* numeric */
565 LBP_IS = 15, /* infix separator (numeric) */
566 LBP_SY = 16, /* symbols allowing breaks */
567 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
568 LBP_PR = 18, /* prefix (numeric) */
569 LBP_PO = 19, /* postfix (numeric) */
570 LBP_SA = 23, /* complex context (South East Asian) */
571 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
572 LBP_XX = 25 /* unknown */
575 #include "lbrkprop.h"
/* Looks up the line breaking class of UC in the three-level table
   lbrkprop (presumably declared in lbrkprop.h together with the
   lbrkprop_header_* constants -- confirm).  Each level yields the offset
   used in the next one.  NOTE(review): the tests on lookup1/lookup2 and the
   value returned when a lookup misses are on lines not visible in this
   listing.  */
577 static inline unsigned char
578 lbrkprop_lookup (unsigned int uc)
/* Level 1: the high bits of uc, bounded by lbrkprop_header_1.  */
580 unsigned int index1 = uc >> lbrkprop_header_0;
581 if (index1 < lbrkprop_header_1)
583 int lookup1 = lbrkprop.level1[index1];
/* Level 2: middle bits, shifted by header_2 and masked with header_3,
   added to the offset obtained from level 1.  */
586 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
587 int lookup2 = lbrkprop.level2[lookup1 + index2];
/* Level 3: low bits masked with header_4, added to the level 2 offset;
   the entry found is the class.  */
590 unsigned int index3 = uc & lbrkprop_header_4;
591 return lbrkprop.level3[lookup2 + index3];
598 /* Table indexed by two line breaking classifications. */
/* The *_possible_linebreaks functions read it as
   lbrk_table [last_prop-1] [prop-1]: the row is the class of the previous
   non-space character, the column the class of the current character, both
   shifted by one because class 0 (LBP_BK) has no entry.  D, I and P are
   short-lived helper names for the three cell values.  */
599 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
600 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
601 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
602 static const unsigned char lbrk_table[19][19] = {
604 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
605 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
606 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
607 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
608 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
609 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
610 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
611 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
612 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
613 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
614 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
615 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
616 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
617 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
618 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
619 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
620 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
621 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
622 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
623 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
627 /* Note: The (B2,B2) entry should probably be D instead of P. */
628 /* Note: The (PR,ID) entry should probably be D instead of I. */
/* Classifies the positions of S (N bytes, decoded with u8_mbtouc) and
   stores one verdict per unit in P.  ENCODING only decides how the
   ambiguous class LBP_AI is resolved: LBP_ID for the encodings accepted by
   is_cjk_encoding, LBP_AL otherwise.
   NOTE(review): loop headers, case labels and the declaration of q are on
   lines not visible in this listing; the comments added below describe the
   visible statements only.  */
631 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
633 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
634 const unsigned char *s_end = s + n;
635 int last_prop = LBP_BK; /* line break property of last non-space character */
636 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
637 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
639 /* Don't break inside multibyte characters. */
/* All N slots start out as UC_BREAK_PROHIBITED and are overwritten
   selectively below.  */
640 memset (p, UC_BREAK_PROHIBITED, n);
/* count is the number of units consumed for uc (presumably used to advance
   s and p on lines not visible here).  */
645 int count = u8_mbtouc (&uc, s, s_end - s);
646 int prop = lbrkprop_lookup (uc);
650 /* Mandatory break. */
651 *p = UC_BREAK_MANDATORY;
660 /* Resolve property values whose behaviour is not fixed. */
664 /* Resolve ambiguous. */
665 prop = LBP_AI_REPLACEMENT;
668 /* This is arbitrary. */
672 /* We don't handle complex scripts yet.
673 Treat LBP_SA like LBP_XX. */
675 /* This is arbitrary. */
680 /* Deal with combining characters. */
684 /* Don't break just before a combining character. */
685 *p = UC_BREAK_PROHIBITED;
686 /* A combining character turns a preceding space into LBP_AL. */
687 if (seen_space != NULL)
690 seen_space = seen_space2;
692 goto lookup_via_table;
695 else if (prop == LBP_SP)
697 /* Don't break just before a space. */
698 *p = UC_BREAK_PROHIBITED;
/* Shift the space history: the previous "space seen" marker becomes the
   "two spaces seen" marker.  */
699 seen_space2 = seen_space;
705 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The quotient is the number of rows of lbrk_table (19).  prop is an int
   and the quotient a size_t; because prop >= 1 is tested first, the
   implicit conversion of prop to an unsigned type is harmless here.  */
706 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
709 if (last_prop == LBP_BK)
711 /* Don't break at the beginning of a line. */
712 *q = UC_BREAK_PROHIBITED;
/* Row: class of the previous non-space character; column: class of the
   current character.  The three assignments that follow presumably belong
   to the table values D, I and P in that order (case labels not visible);
   the middle one allows a break only when a space was seen.  */
716 switch (lbrk_table [last_prop-1] [prop-1])
719 *q = UC_BREAK_POSSIBLE;
722 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
725 *q = UC_BREAK_PROHIBITED;
/* Classifies the positions of S (N 16-bit units, decoded with u16_mbtouc)
   and stores the verdicts in P.  ENCODING only decides how the ambiguous
   class LBP_AI is resolved: LBP_ID for the encodings accepted by
   is_cjk_encoding, LBP_AL otherwise.
   NOTE(review): loop headers, case labels and the declaration of q are on
   lines not visible in this listing; the comments added below describe the
   visible statements only.  */
743 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
745 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
746 const unsigned short *s_end = s + n;
747 int last_prop = LBP_BK; /* line break property of last non-space character */
748 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
749 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
751 /* Don't break inside multibyte characters. */
/* The first N bytes of P start out as UC_BREAK_PROHIBITED and are
   overwritten selectively below.  */
752 memset (p, UC_BREAK_PROHIBITED, n);
/* count is the number of units consumed for uc (presumably used to advance
   s and p on lines not visible here).  */
757 int count = u16_mbtouc (&uc, s, s_end - s);
758 int prop = lbrkprop_lookup (uc);
762 /* Mandatory break. */
763 *p = UC_BREAK_MANDATORY;
772 /* Resolve property values whose behaviour is not fixed. */
776 /* Resolve ambiguous. */
777 prop = LBP_AI_REPLACEMENT;
780 /* This is arbitrary. */
784 /* We don't handle complex scripts yet.
785 Treat LBP_SA like LBP_XX. */
787 /* This is arbitrary. */
792 /* Deal with combining characters. */
796 /* Don't break just before a combining character. */
797 *p = UC_BREAK_PROHIBITED;
798 /* A combining character turns a preceding space into LBP_AL. */
799 if (seen_space != NULL)
802 seen_space = seen_space2;
804 goto lookup_via_table;
807 else if (prop == LBP_SP)
809 /* Don't break just before a space. */
810 *p = UC_BREAK_PROHIBITED;
/* Shift the space history: the previous "space seen" marker becomes the
   "two spaces seen" marker.  */
811 seen_space2 = seen_space;
817 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The quotient is the number of rows of lbrk_table (19).  prop is an int
   and the quotient a size_t; because prop >= 1 is tested first, the
   implicit conversion of prop to an unsigned type is harmless here.  */
818 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
821 if (last_prop == LBP_BK)
823 /* Don't break at the beginning of a line. */
824 *q = UC_BREAK_PROHIBITED;
/* Row: class of the previous non-space character; column: class of the
   current character.  The three assignments that follow presumably belong
   to the table values D, I and P in that order (case labels not visible);
   the middle one allows a break only when a space was seen.  */
828 switch (lbrk_table [last_prop-1] [prop-1])
831 *q = UC_BREAK_POSSIBLE;
834 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
837 *q = UC_BREAK_PROHIBITED;
/* Classifies the positions of S (N 32-bit units, each taken as one
   character) and stores the verdicts in P.  ENCODING only decides how the
   ambiguous class LBP_AI is resolved: LBP_ID for the encodings accepted by
   is_cjk_encoding, LBP_AL otherwise.  No memset of P is visible in this
   variant.
   NOTE(review): loop headers, case labels and the declaration of q are on
   lines not visible in this listing; the comments added below describe the
   visible statements only.  */
855 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
857 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
858 const unsigned int *s_end = s + n;
859 int last_prop = LBP_BK; /* line break property of last non-space character */
860 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
861 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
/* The current unit is the character itself; no decoder call is needed.  */
865 unsigned int uc = *s;
866 int prop = lbrkprop_lookup (uc);
870 /* Mandatory break. */
871 *p = UC_BREAK_MANDATORY;
880 /* Resolve property values whose behaviour is not fixed. */
884 /* Resolve ambiguous. */
885 prop = LBP_AI_REPLACEMENT;
888 /* This is arbitrary. */
892 /* We don't handle complex scripts yet.
893 Treat LBP_SA like LBP_XX. */
895 /* This is arbitrary. */
900 /* Deal with combining characters. */
904 /* Don't break just before a combining character. */
905 *p = UC_BREAK_PROHIBITED;
906 /* A combining character turns a preceding space into LBP_AL. */
907 if (seen_space != NULL)
910 seen_space = seen_space2;
912 goto lookup_via_table;
915 else if (prop == LBP_SP)
917 /* Don't break just before a space. */
918 *p = UC_BREAK_PROHIBITED;
/* Shift the space history: the previous "space seen" marker becomes the
   "two spaces seen" marker.  */
919 seen_space2 = seen_space;
925 /* prop must be usable as an index for table 7.3 of UTR #14. */
/* The quotient is the number of rows of lbrk_table (19).  prop is an int
   and the quotient a size_t; because prop >= 1 is tested first, the
   implicit conversion of prop to an unsigned type is harmless here.  */
926 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
929 if (last_prop == LBP_BK)
931 /* Don't break at the beginning of a line. */
932 *q = UC_BREAK_PROHIBITED;
/* Row: class of the previous non-space character; column: class of the
   current character.  The three assignments that follow presumably belong
   to the table values D, I and P in that order (case labels not visible);
   the middle one allows a break only when a space was seen.  */
936 switch (lbrk_table [last_prop-1] [prop-1])
939 *q = UC_BREAK_POSSIBLE;
942 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
945 *q = UC_BREAK_PROHIBITED;
963 /* Choose the best line breaks, assuming the uc_width function.
964 Return the column after the end of the string. */
/* Visible contract: u8_possible_linebreaks first fills p; O, when not NULL,
   supplies per-position overrides (entries equal to UC_BREAK_UNDEFINED are
   skipped); WIDTH is the column limit that last_column + piece_width is
   checked against; START_COLUMN seeds last_column; AT_END_COLUMNS is added
   in the check made after the final piece.
   NOTE(review): the last parameter (p) and several statements, including
   the updates of last_p and piece_width, are on lines not visible in this
   listing.  */
967 u8_width_linebreaks (const unsigned char *s, size_t n,
968 int width, int start_column, int at_end_columns,
969 const char *o, const char *encoding,
972 const unsigned char *s_end;
977 u8_possible_linebreaks (s, n, encoding, p);
981 last_column = start_column;
/* count is the number of units consumed for uc.  */
986 int count = u8_mbtouc (&uc, s, s_end - s);
988 /* Respect the override. */
989 if (o != NULL && *o != UC_BREAK_UNDEFINED)
992 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
994 /* An atomic piece of text ends here. */
/* The piece just finished does not fit: break at the opportunity
   remembered in last_p, if there is one.  */
995 if (last_p != NULL && last_column + piece_width > width)
997 /* Insert a line break. */
998 *last_p = UC_BREAK_POSSIBLE;
1003 if (*p == UC_BREAK_MANDATORY)
1005 /* uc is a line break character. */
1006 /* Start a new piece at column 0. */
1013 /* uc is not a line break character. */
1016 if (*p == UC_BREAK_POSSIBLE)
1018 /* Start a new piece. */
1020 last_column += piece_width;
1022 /* No line break for the moment, may be turned into
1023 UC_BREAK_POSSIBLE later, via last_p. */
1026 *p = UC_BREAK_PROHIBITED;
1028 w = uc_width (uc, encoding);
1029 if (w >= 0) /* ignore control characters in the string */
1039 /* The last atomic piece of text ends here. */
1040 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1042 /* Insert a line break. */
1043 *last_p = UC_BREAK_POSSIBLE;
/* Column reached after the last piece; at_end_columns is not included.  */
1047 return last_column + piece_width;
/* 16-bit counterpart of u8_width_linebreaks.  Visible contract:
   u16_possible_linebreaks first fills p; O, when not NULL, supplies
   per-position overrides (entries equal to UC_BREAK_UNDEFINED are skipped);
   WIDTH is the column limit that last_column + piece_width is checked
   against; START_COLUMN seeds last_column; AT_END_COLUMNS is added in the
   check made after the final piece.
   NOTE(review): the last parameter (p) and several statements, including
   the updates of last_p and piece_width, are on lines not visible in this
   listing.  */
1051 u16_width_linebreaks (const unsigned short *s, size_t n,
1052 int width, int start_column, int at_end_columns,
1053 const char *o, const char *encoding,
1056 const unsigned short *s_end;
1061 u16_possible_linebreaks (s, n, encoding, p);
1065 last_column = start_column;
/* count is the number of units consumed for uc.  */
1070 int count = u16_mbtouc (&uc, s, s_end - s);
1072 /* Respect the override. */
1073 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1076 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1078 /* An atomic piece of text ends here. */
/* The piece just finished does not fit: break at the opportunity
   remembered in last_p, if there is one.  */
1079 if (last_p != NULL && last_column + piece_width > width)
1081 /* Insert a line break. */
1082 *last_p = UC_BREAK_POSSIBLE;
1087 if (*p == UC_BREAK_MANDATORY)
1089 /* uc is a line break character. */
1090 /* Start a new piece at column 0. */
1097 /* uc is not a line break character. */
1100 if (*p == UC_BREAK_POSSIBLE)
1102 /* Start a new piece. */
1104 last_column += piece_width;
1106 /* No line break for the moment, may be turned into
1107 UC_BREAK_POSSIBLE later, via last_p. */
1110 *p = UC_BREAK_PROHIBITED;
1112 w = uc_width (uc, encoding);
1113 if (w >= 0) /* ignore control characters in the string */
1123 /* The last atomic piece of text ends here. */
1124 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1126 /* Insert a line break. */
1127 *last_p = UC_BREAK_POSSIBLE;
/* Column reached after the last piece; at_end_columns is not included.  */
1131 return last_column + piece_width;
/* 32-bit counterpart of u8_width_linebreaks; every unit of S is taken as
   one character.  Visible contract: u32_possible_linebreaks first fills p;
   O, when not NULL, supplies per-position overrides (entries equal to
   UC_BREAK_UNDEFINED are skipped); WIDTH is the column limit that
   last_column + piece_width is checked against; START_COLUMN seeds
   last_column; AT_END_COLUMNS is added in the check made after the final
   piece.
   NOTE(review): the last parameter (p) and several statements, including
   the updates of last_p and piece_width, are on lines not visible in this
   listing.  */
1135 u32_width_linebreaks (const unsigned int *s, size_t n,
1136 int width, int start_column, int at_end_columns,
1137 const char *o, const char *encoding,
1140 const unsigned int *s_end;
1145 u32_possible_linebreaks (s, n, encoding, p);
1149 last_column = start_column;
/* The current unit is the character itself; no decoder call is needed.  */
1153 unsigned int uc = *s;
1155 /* Respect the override. */
1156 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1159 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1161 /* An atomic piece of text ends here. */
/* The piece just finished does not fit: break at the opportunity
   remembered in last_p, if there is one.  */
1162 if (last_p != NULL && last_column + piece_width > width)
1164 /* Insert a line break. */
1165 *last_p = UC_BREAK_POSSIBLE;
1170 if (*p == UC_BREAK_MANDATORY)
1172 /* uc is a line break character. */
1173 /* Start a new piece at column 0. */
1180 /* uc is not a line break character. */
1183 if (*p == UC_BREAK_POSSIBLE)
1185 /* Start a new piece. */
1187 last_column += piece_width;
1189 /* No line break for the moment, may be turned into
1190 UC_BREAK_POSSIBLE later, via last_p. */
1193 *p = UC_BREAK_PROHIBITED;
1195 w = uc_width (uc, encoding);
1196 if (w >= 0) /* ignore control characters in the string */
1206 /* The last atomic piece of text ends here. */
1207 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1209 /* Insert a line break. */
1210 *last_p = UC_BREAK_POSSIBLE;
/* Column reached after the last piece; at_end_columns is not included.  */
1214 return last_column + piece_width;
1222 /* Read the contents of an input stream, and return it, terminated with a NUL
/* Reads STREAM in chunks of BUFSIZE bytes into a buffer that is grown with
   realloc.  NOTE(review): declarations, the NUL termination and the return
   statement are on lines not visible in this listing.  */
1225 read_file (FILE *stream)
/* Chunk size of each fread call and minimum headroom kept in buf.  */
1227 #define BUFSIZE 4096
/* NOTE(review): feof only turns true after a read has hit end of file, so
   this loop relies on the fread/ferror handling inside it.  */
1233 while (! feof (stream))
1235 if (size + BUFSIZE > alloc)
/* Grow by half, but at least enough for one more full chunk.  */
1237 alloc = alloc + alloc / 2;
1238 if (alloc < size + BUFSIZE)
1239 alloc = size + BUFSIZE;
/* NOTE(review): assigning the result of realloc straight to buf loses the
   old block when realloc fails; that is tolerable only if the
   out-of-memory path terminates the program (not visible here) --
   confirm.  */
1240 buf = realloc (buf, alloc);
1243 fprintf (stderr, "out of memory\n");
/* Append up to BUFSIZE bytes behind the size bytes already stored.  */
1247 count = fread (buf + size, 1, BUFSIZE, stream);
1250 if (ferror (stream))
/* Final resize to size + 1 bytes, presumably to hold the terminating NUL
   mentioned in the comment that precedes this function.  */
1259 buf = realloc (buf, size + 1);
1262 fprintf (stderr, "out of memory\n");
/* Test driver with two modes, both reading all of stdin via read_file and
   writing to stdout.  NOTE(review): the test that selects the mode
   (presumably on argc), the switch headers and the return statements are on
   lines not visible in this listing.  */
1271 main (int argc, char * argv[])
1275 /* Display all the break opportunities in the input string. */
1276 char *input = read_file (stdin);
/* NOTE(review): strlen stops at the first NUL byte of the input and returns
   size_t; storing it in an int truncates for inputs larger than INT_MAX.  */
1277 int length = strlen (input);
/* NOTE(review): no check of the malloc result is visible before breaks is
   used -- confirm.  */
1278 char *breaks = malloc (length);
1281 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
/* Emit a marker according to breaks[i], then the input byte itself.  */
1283 for (i = 0; i < length; i++)
1287 case UC_BREAK_POSSIBLE:
1288 /* U+2027 in UTF-8 encoding */
1289 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1291 case UC_BREAK_MANDATORY:
1292 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1293 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1295 case UC_BREAK_PROHIBITED:
1300 putc (input[i], stdout);
1309 /* Insert line breaks for a given width. */
/* NOTE(review): atoi gives no error indication; a non-numeric argv[1]
   yields width 0.  */
1310 int width = atoi (argv[1]);
1311 char *input = read_file (stdin);
/* NOTE(review): same size_t-to-int narrowing as in the first mode.  */
1312 int length = strlen (input);
/* NOTE(review): no check of the malloc result is visible here either.  */
1313 char *breaks = malloc (length);
/* start_column 0, at_end_columns 0, no override array.  */
1316 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1318 for (i = 0; i < length; i++)
1322 case UC_BREAK_POSSIBLE:
/* A chosen break is rendered as a newline in front of the byte.  */
1323 putc ('\n', stdout);
1325 case UC_BREAK_MANDATORY:
1327 case UC_BREAK_PROHIBITED:
1332 putc (input[i], stdout);
1346 /* Now the same thing with an arbitrary encoding.
1348 We convert the input string to Unicode.
1350 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1351 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1352 \U0000FFFF. UTF-16 and variants support only characters up to
1353 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1354 UCS-4 specification leaves doubts about endianness and byte order mark.
1355 glibc currently interprets it as big endian without byte order mark,
1356 but this is not backed by an RFC. So we use UTF-8. It supports
1357 characters up to \U7FFFFFFF and is unambiguously defined. */
1364 /* Luckily, the encoding's name is platform independent. */
/* Used as the target encoding in the iconv_open calls of the mbs_*
   functions in this file.  */
1365 #define UTF8_NAME "UTF-8"
1367 /* Return the length of a string after conversion through an iconv_t. */
/* CD is the conversion descriptor handed to iconv; S points to the input
   bytes and N is their number.  The converted bytes themselves are thrown
   away, only their count is accumulated.  Returns (size_t)(-1) as soon as
   iconv fails for a reason other than E2BIG.
   NOTE(review): the loop header around the first iconv call, the
   declarations of insize and count, and the final return are not visible
   in this excerpt.  */
1369 iconv_string_length (iconv_t cd, const char *s, size_t n)
1371 #define TMPBUFSIZE 4096
/* Scratch area that receives the converted bytes.  */
1373 char tmpbuf[TMPBUFSIZE];
1374 const char *inptr = s;
/* Point the output window at the start of the scratch area before
   converting.  */
1378 char *outptr = tmpbuf;
1379 size_t outsize = TMPBUFSIZE;
1380 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* E2BIG only says that the scratch area is full, which is expected here;
   every other failure is reported to the caller.  */
1381 if (res == (size_t)(-1) && errno != E2BIG)
1382 return (size_t)(-1);
/* Add the number of bytes produced by this call.  */
1383 count += outptr - tmpbuf;
1385 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1386 #if defined _LIBICONV_VERSION \
1387 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* With a NULL input, iconv writes whatever is needed to get back to the
   initial shift state; those bytes belong to the result length too.  */
1389 char *outptr = tmpbuf;
1390 size_t outsize = TMPBUFSIZE;
1391 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1392 if (res == (size_t)(-1))
1393 return (size_t)(-1);
1394 count += outptr - tmpbuf;
1396 /* Return to the initial state. */
1397 iconv (cd, NULL, NULL, NULL, NULL);
/* Convert the N bytes at S through CD into the buffer T, and record for
   every input offset at which a conversion step starts the offset in T where
   its output begins: offtable[i] is that output offset, and (size_t)(-1)
   for all other i.  M is the number of output bytes the conversion is
   expected to produce.
   NOTE(review): the bodies of the error checks below are not visible in
   this excerpt; presumably they abort -- confirm against the full file.  */
1404 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1405 size_t *offtable, char *t, size_t m)
1412 /* Avoid glibc-2.1 bug. */
1413 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1414 const size_t extra = 1;
1416 const size_t extra = 0;
/* Start with "no conversion step begins here" for every input offset.  */
1419 for (i = 0; i < n; i++)
1420 offtable[i] = (size_t)(-1);
/* Output space offered to iconv: the expected M bytes plus the slack
   needed by the workaround above.  */
1425 outsize = m + extra;
1426 while (inptr < s_end)
1428 const char *saved_inptr;
/* Remember where in T the output for the current input offset starts.  */
1432 offtable[inptr - s] = outptr - t;
1434 saved_inptr = inptr;
/* Offer iconv one byte, then two, and so on.  iconv fails with EINVAL
   while the offered bytes are only an incomplete multibyte sequence, so the
   first call that does not fail this way has converted something.
   NOTE(review): the statement leaving this loop is not visible here.  */
1436 for (insize = 1; inptr + insize <= s_end; insize++)
1438 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1439 if (!(res == (size_t)(-1) && errno == EINVAL))
1441 /* We expect that no input bytes have been consumed so far. */
1442 if (inptr != saved_inptr)
1445 /* After we verified the convertibility and computed the translation's
1446 size m, there shouldn't be any conversion error here. */
1447 if (res == (size_t)(-1))
1450 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1451 #if defined _LIBICONV_VERSION \
1452 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* Let iconv write what is needed to return to the initial shift state.  */
1453 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1456 /* We should have produced exactly m output bytes. */
/* outsize started at m + extra, so exactly `extra' bytes must be left.  */
1457 if (outsize != extra)
1461 #endif /* HAVE_ICONV */
1465 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
1466 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
/* S points to the bytes to examine and N is their number; the loop is
   bounded by N alone, so S need not be NUL terminated.
   NOTE(review): the return statements are not visible in this excerpt.  */
1468 is_all_ascii (const char *s, size_t n)
1470 for (; n > 0; s++, n--)
/* Look at the byte as an unsigned value, whatever the signedness of
   char.  */
1472 unsigned char c = (unsigned char) *s;
/* A byte that is neither printable nor white space according to
   c_isprint / c_isspace disqualifies the string.
   NOTE(review): presumably these are the locale-independent ASCII
   predicates, which would also reject control bytes such as ESC and thereby
   ISO-2022 text -- confirm against c-ctype.h.  */
1474 if (!(c_isprint (c) || c_isspace (c)))
1480 #endif /* C_CTYPE_ASCII */
/* Determine the line break opportunities in the N bytes at S, which are in
   the encoding named ENCODING, and store one break class per input byte
   through P.
   Three strategies are visible below:
     - UTF-8 input is handed to u8_possible_linebreaks directly;
     - other input is converted to UTF-8 with iconv, analysed there, and the
       result is mapped back through an offset table;
     - if no conversion is possible, pure ASCII input is analysed as UTF-8,
       and for anything else only existing newlines are reported.
   NOTE(review): the last line of the signature (presumably declaring P),
   the declaration of to_utf8 and the else/#if scaffolding are not visible
   in this excerpt.  */
1483 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
/* Fast path: no conversion is needed for UTF-8 input.  */
1488 if (is_utf8_encoding (encoding))
1489 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1494 /* Avoid glibc-2.1 bug with EUC-KR. */
1495 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1496 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1497 to_utf8 = (iconv_t)(-1);
1500 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1502 # if defined __sun && !defined _LIBICONV_VERSION
1503 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1504 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1505 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1506 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1507 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1508 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1509 to_utf8 = (iconv_t)(-1);
1512 to_utf8 = iconv_open (UTF8_NAME, encoding);
1513 if (to_utf8 != (iconv_t)(-1))
1515 /* Determine the length of the resulting UTF-8 string. */
1516 size_t m = iconv_string_length (to_utf8, s, n);
/* (size_t)(-1) means the input cannot be converted.  */
1517 if (m != (size_t)(-1))
1519 /* Convert the string to UTF-8 and build a translation table
1520 from offsets into s to offsets into the translated string. */
/* A single allocation holds three consecutive areas: n offsets, the m
   bytes of UTF-8 text and m break classes.
   NOTE(review): xsum3 / xtimes / size_in_bounds_p are presumably the
   overflow-checking size helpers from xsize.h -- confirm.  */
1521 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1523 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1526 size_t *offtable = (size_t *) memory;
/* t, the UTF-8 text, starts right behind the n offsets.  */
1527 char *t = (char *) (offtable + n);
/* q, the break classes for t, starts right behind the m text bytes.  */
1528 char *q = (char *) (t + m);
1531 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1533 /* Determine the possible line breaks of the UTF-8 string. */
/* Note that the caller's encoding name is passed on, not UTF8_NAME.  */
1534 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1536 /* Translate the result back to the original string. */
/* Bytes without an entry in offtable keep UC_BREAK_PROHIBITED.  */
1537 memset (p, UC_BREAK_PROHIBITED, n);
1538 for (i = 0; i < n; i++)
1539 if (offtable[i] != (size_t)(-1))
1540 p[i] = q[offtable[i]];
/* NOTE(review): two iconv_close calls are visible; presumably one for the
   successful path and one for the path on which the length computation or
   the allocation failed -- the surrounding control flow is not shown.  */
1543 iconv_close (to_utf8);
1547 iconv_close (to_utf8);
1550 /* Impossible to convert. */
1552 if (is_all_ascii (s, n))
1554 /* ASCII is a subset of UTF-8. */
1555 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1559 /* We have a non-ASCII string and cannot convert it.
1560 Don't produce line breaks except those already present in the
1561 input string. All we assume here is that the encoding is
1562 minimally ASCII compatible. */
1564 const char *s_end = s + n;
/* Per byte: a newline is a mandatory break, nothing else may break.  */
1567 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
/* Choose line breaks in the N bytes at S, which are in the encoding named
   ENCODING, for the given WIDTH, START_COLUMN and AT_END_COLUMNS, and store
   one break class per input byte through P.  O, when not NULL, is an array
   of per-byte overrides.
   The same three strategies as in mbs_possible_linebreaks are visible:
   direct use of u8_width_linebreaks for UTF-8, conversion to UTF-8 with an
   offset table for other encodings, and a fallback when no conversion is
   possible.  In the last case, when the input is not pure ASCII, the
   function returns START_COLUMN unchanged.
   NOTE(review): the last line of the signature (presumably declaring P),
   several declarations and the else/#if scaffolding are not visible in this
   excerpt.  */
1576 mbs_width_linebreaks (const char *s, size_t n,
1577 int width, int start_column, int at_end_columns,
1578 const char *o, const char *encoding,
/* NOTE(review): the condition guarding this early return is not visible;
   presumably it is taken for empty input -- confirm.  */
1582 return start_column;
/* Fast path: no conversion is needed for UTF-8 input.  */
1583 if (is_utf8_encoding (encoding))
1584 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1589 /* Avoid glibc-2.1 bug with EUC-KR. */
1590 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1591 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1592 to_utf8 = (iconv_t)(-1);
1595 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1597 # if defined __sun && !defined _LIBICONV_VERSION
1598 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1599 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1600 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1601 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1602 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1603 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1604 to_utf8 = (iconv_t)(-1);
1607 to_utf8 = iconv_open (UTF8_NAME, encoding);
1608 if (to_utf8 != (iconv_t)(-1))
1610 /* Determine the length of the resulting UTF-8 string. */
1611 size_t m = iconv_string_length (to_utf8, s, n);
/* (size_t)(-1) means the input cannot be converted.  */
1612 if (m != (size_t)(-1))
1614 /* Convert the string to UTF-8 and build a translation table
1615 from offsets into s to offsets into the translated string. */
/* A single allocation holds n offsets, the m bytes of UTF-8 text, m break
   classes and, only when overrides were supplied, m translated overrides.
   NOTE(review): xsum4 / xtimes / size_in_bounds_p are presumably the
   overflow-checking size helpers from xsize.h -- confirm.  */
1616 size_t memory_size =
1617 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1618 (o != NULL ? m : 0));
1621 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1624 size_t *offtable = (size_t *) memory;
/* t, the UTF-8 text, starts right behind the n offsets.  */
1625 char *t = (char *) (offtable + n);
/* q, the break classes for t, starts right behind the m text bytes.  */
1626 char *q = (char *) (t + m);
/* o8, the overrides expressed in UTF-8 offsets, follows q when needed.  */
1627 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1631 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1633 /* Translate the overrides to the UTF-8 string. */
/* Every UTF-8 offset starts out without an override; then each override
   whose input offset has an entry in offtable is copied to the
   corresponding UTF-8 offset.  */
1636 memset (o8, UC_BREAK_UNDEFINED, m);
1637 for (i = 0; i < n; i++)
1638 if (offtable[i] != (size_t)(-1))
1639 o8[offtable[i]] = o[i];
1642 /* Determine the line breaks of the UTF-8 string. */
/* NOTE(review): the use of this call's return value is not visible on this
   line; presumably it is stored and returned as the final column.  */
1644 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1646 /* Translate the result back to the original string. */
/* Bytes without an entry in offtable keep UC_BREAK_PROHIBITED.  */
1647 memset (p, UC_BREAK_PROHIBITED, n);
1648 for (i = 0; i < n; i++)
1649 if (offtable[i] != (size_t)(-1))
1650 p[i] = q[offtable[i]];
/* NOTE(review): two iconv_close calls are visible; presumably one for the
   successful path and one for the path on which the length computation or
   the allocation failed -- the surrounding control flow is not shown.  */
1653 iconv_close (to_utf8);
1657 iconv_close (to_utf8);
1660 /* Impossible to convert. */
1662 if (is_all_ascii (s, n))
1664 /* ASCII is a subset of UTF-8. */
1665 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1668 /* We have a non-ASCII string and cannot convert it.
1669 Don't produce line breaks except those already present in the
1670 input string. All we assume here is that the encoding is
1671 minimally ASCII compatible. */
1673 const char *s_end = s + n;
/* Per byte: a break is mandatory where the override says so or where the
   input has a newline; nothing else may break.  */
1676 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1677 ? UC_BREAK_MANDATORY
1678 : UC_BREAK_PROHIBITED);
1684 /* We cannot compute widths in this case. */
1685 return start_column;
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.

   STREAM is read with fread until end of file is reached.  The result is a
   freshly allocated buffer holding the data followed by one NUL byte; the
   caller owns it and releases it with free.  If reading fails or memory runs
   out, a message is written to stderr and the process exits with status 1,
   so this function never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;     /* number of bytes allocated for buf */
  size_t size = 0;      /* number of bytes read so far */
  char *grown;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits behind the data read so far.  */
      if (size + BUFSIZE > alloc)
        {
          /* Grow by 50%, but at least by enough for one more chunk.  */
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          /* Go through a temporary so that buf is only replaced once
             realloc is known to have succeeded.  */
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Nothing was read: either end of file, in which case the loop
             condition terminates the loop, or a read error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Trim the buffer to the data plus the terminating NUL byte.  This is
     also what allocates the single byte needed for an empty stream.  */
  grown = realloc (buf, size + 1);
  if (grown == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  grown[size] = '\0';
  return grown;
#undef BUFSIZE
}
/* Test driver for the multibyte entry points.
   Reads all of stdin through read_file and writes every input byte back to
   stdout, with the text interpreted in the encoding reported by
   locale_charset.  Two modes are visible below: one based on
   mbs_possible_linebreaks, the other inserting newlines for the width given
   in argv[1] using mbs_width_linebreaks.
   NOTE(review): the statement that selects between the two modes, the
   switch headers, the output for most break classes in the first mode and
   the return statements are not visible in this excerpt; presumably the
   mode is chosen from argc -- confirm against the full file.  */
1745 main (int argc, char * argv[])
/* Adopt the character type settings of the environment, so that
   locale_charset reflects the user's locale.  */
1747 setlocale (LC_CTYPE, "");
1750 /* Display all the break opportunities in the input string. */
1751 char *input = read_file (stdin);
1752 int length = strlen (input);
/* One break-class byte per input byte.
   NOTE(review): no NULL check of the malloc result is visible here.  */
1753 char *breaks = malloc (length);
1756 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1758 for (i = 0; i < length; i++)
1762 case UC_BREAK_POSSIBLE:
1765 case UC_BREAK_MANDATORY:
1767 case UC_BREAK_PROHIBITED:
/* The input byte itself is written after any marker.  */
1772 putc (input[i], stdout);
1781 /* Insert line breaks for a given width. */
/* atoi reports no errors: an argument that is not a number yields 0.  */
1782 int width = atoi (argv[1]);
1783 char *input = read_file (stdin);
1784 int length = strlen (input);
1785 char *breaks = malloc (length);
/* Start column 0, no columns reserved at the end, no override array.  */
1788 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1790 for (i = 0; i < length; i++)
/* A position reported as UC_BREAK_POSSIBLE becomes a newline in the
   output.  */
1794 case UC_BREAK_POSSIBLE:
1795 putc ('\n', stdout);
1797 case UC_BREAK_MANDATORY:
1799 case UC_BREAK_PROHIBITED:
1804 putc (input[i], stdout);