1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "linebreak.h"
32 u32_mbtouc_unsafe (unsigned int *puc, const unsigned int *s, size_t n)
/* Help GCC to generate good code for string comparisons with string
   literals.

   STREQ (s1, s2, c0, c1, ..., c8) is true when the NUL-terminated strings
   S1 and S2 are equal.  C0..C8 must spell out the first nine characters of
   S2, with 0 in every position at or after S2's terminating NUL.

   With an optimizing GCC the test is unrolled into one character comparison
   per position, stopping at the first mismatch or at the NUL; only strings
   longer than nine characters reach strcmp.  Otherwise STREQ is a plain
   strcmp and the character arguments are ignored.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

/* Compare everything from offset 9 onwards.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

/* streqN: S1 and S2 are known to agree on their first N characters.
   Compare S1[N] against the expected character; equal NULs mean the
   strings are equal, otherwise continue with position N + 1.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  return s28 == 0 ? 1 : streq9 (s1, s2);
}

static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  return s27 == 0 ? 1 : streq8 (s1, s2, s28);
}

static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  return s26 == 0 ? 1 : streq7 (s1, s2, s27, s28);
}

static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  return s25 == 0 ? 1 : streq6 (s1, s2, s26, s27, s28);
}

static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  return s24 == 0 ? 1 : streq5 (s1, s2, s25, s26, s27, s28);
}

static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  return s23 == 0 ? 1 : streq4 (s1, s2, s24, s25, s26, s27, s28);
}

static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] != s22)
    return 0;
  return s22 == 0 ? 1 : streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}

static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  return s21 == 0 ? 1 : streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}

static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  return s20 == 0 ? 1 : streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)

#endif

/* Return 1 if ENCODING names one of the legacy CJK encodings listed below,
   0 otherwise.  The comparison is exact and case-sensitive.  ENCODING must
   be a NUL-terminated string.  */
static int
is_cjk_encoding (const char *encoding)
{
  if (0
      /* Legacy Japanese encodings */
      || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
      /* Legacy Chinese encodings */
      || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
      || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
      || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
      || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
      /* Legacy Korean encodings */
      || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
      || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
      || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
    return 1;
  return 0;
}

/* Return 1 if ENCODING is exactly "UTF-8", 0 otherwise.  ENCODING must be
   a NUL-terminated string.  */
static int
is_utf8_encoding (const char *encoding)
{
  if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
    return 1;
  return 0;
}
215 /* Determine number of column positions required for UC. */
216 int uc_width (unsigned int uc, const char *encoding);
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap covering 512
 * consecutive code points; the block for a code point UC is selected
 * through nonspacing_table_ind[UC >> 9].  Within block IND, UC's flag is
 *   (nonspacing_table_data[64*IND + ((UC >> 3) & 63)] >> (UC & 7)) & 1.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* First-level index for the non-spacing bitmap.  Entry [uc >> 9] gives the
   number of the 64-byte block inside nonspacing_table_data that covers the
   512 code points starting at (uc >> 9) << 9.  A negative entry means that
   no bitmap block exists for that range.  The table has 240 entries, so it
   covers code points below 240 * 512 = 0x1e000.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
407 /* Determine number of column positions required for UC. */
409 uc_width (unsigned int uc, const char *encoding)
411 /* Test for non-spacing or control character. */
414 int ind = nonspacing_table_ind[uc >> 9];
416 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
418 if (uc > 0 && uc < 0xa0)
424 else if ((uc >> 9) == (0xe0000 >> 9))
427 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
431 /* Test for double-width character.
432 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
433 * and "grep '^....;[^WF]' EastAsianWidth.txt"
434 */
436 && ((uc < 0x1160) /* Hangul Jamo */
437 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
439 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
440 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
441 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
442 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
443 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
444 || (uc >= 0xffe0 && uc < 0xffe7)
445 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
446 || (uc >= 0x30000 && uc <= 0x3fffd)
449 /* In ancient CJK encodings, Cyrillic and most other characters are
450 double-width as well. */
451 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
452 && is_cjk_encoding (encoding))
458 /* Determine number of column positions required for first N units
459 (or fewer if S ends before this) in S. */
462 u8_width (const unsigned char *s, size_t n, const char *encoding)
464 const unsigned char *s_end = s + n;
472 s += u8_mbtouc_unsafe (&uc, s, s_end - s);
475 break; /* end of string reached */
477 w = uc_width (uc, encoding);
478 if (w >= 0) /* ignore control characters in the string */
486 u16_width (const unsigned short *s, size_t n, const char *encoding)
488 const unsigned short *s_end = s + n;
496 s += u16_mbtouc_unsafe (&uc, s, s_end - s);
499 break; /* end of string reached */
501 w = uc_width (uc, encoding);
502 if (w >= 0) /* ignore control characters in the string */
510 u32_width (const unsigned int *s, size_t n, const char *encoding)
512 const unsigned int *s_end = s + n;
517 unsigned int uc = *s++;
521 break; /* end of string reached */
523 w = uc_width (uc, encoding);
524 if (w >= 0) /* ignore control characters in the string */
532 /* Determine the line break points in S, and store the result at p[0..n-1]. */
533 /* We don't support line breaking of complex-context dependent characters
534 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
/* Line breaking classification.

   The numeric values matter:
   - LBP_BK (0) is the mandatory break; it is also the initial value of the
     "previous class" at the start of a line.
   - Values 1..19 are used, minus one, as row and column indices into
     lbrk_table.
   - Values >= 20 are resolved at run time.

   Classes deliberately not defined here:
   - LBP_CR (carriage return) - not used here because it's a DOSism
   - LBP_LF (line feed) - not used here because it's a DOSism
   - LBP_SG (surrogates) - not used here because they are not characters  */
enum
{
  LBP_BK =  0, /* mandatory break */

  /* Classes that index lbrk_table.  */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time. */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
572 #include "lbrkprop.h"
574 static inline unsigned char
575 lbrkprop_lookup (unsigned int uc)
577 unsigned int index1 = uc >> lbrkprop_header_0;
578 if (index1 < lbrkprop_header_1)
580 int lookup1 = lbrkprop.level1[index1];
583 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
584 int lookup2 = lbrkprop.level2[lookup1 + index2];
587 unsigned int index3 = uc & lbrkprop_header_4;
588 return lbrkprop.level3[lookup2 + index3];
/* Table indexed by two line breaking classifications.

   It is consulted as lbrk_table[last_prop - 1][prop - 1]: the row is the
   class of the preceding non-space character, the column is the class of
   the current character.  Both must be in the range 1..19 (LBP_ZW through
   LBP_PO).  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break, '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
  /* Columns: class of the current character.  */
  /*         ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
  /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
  /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
  /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
  /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
  /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
  /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
  /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
  /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
  /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
  /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
  /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
  /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
  /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* Rows: class of the preceding non-space character.  */
};
/* D, I and P stay defined: the table lookups below compare against them.  */
/* Note: The (B2,B2) entry should probably be D instead of P. */
/* Note: The (PR,ID) entry should probably be D instead of I. */
628 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
630 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
631 const unsigned char *s_end = s + n;
632 int last_prop = LBP_BK; /* line break property of last non-space character */
633 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
634 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
636 /* Don't break inside multibyte characters. */
637 memset (p, UC_BREAK_PROHIBITED, n);
642 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
643 int prop = lbrkprop_lookup (uc);
647 /* Mandatory break. */
648 *p = UC_BREAK_MANDATORY;
657 /* Resolve property values whose behaviour is not fixed. */
661 /* Resolve ambiguous. */
662 prop = LBP_AI_REPLACEMENT;
665 /* This is arbitrary. */
669 /* We don't handle complex scripts yet.
670 Treat LBP_SA like LBP_XX. */
672 /* This is arbitrary. */
677 /* Deal with combining characters. */
681 /* Don't break just before a combining character. */
682 *p = UC_BREAK_PROHIBITED;
683 /* A combining character turns a preceding space into LBP_AL. */
684 if (seen_space != NULL)
687 seen_space = seen_space2;
689 goto lookup_via_table;
692 else if (prop == LBP_SP)
694 /* Don't break just before a space. */
695 *p = UC_BREAK_PROHIBITED;
696 seen_space2 = seen_space;
702 /* prop must be usable as an index for table 7.3 of UTR #14. */
703 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
706 if (last_prop == LBP_BK)
708 /* Don't break at the beginning of a line. */
709 *q = UC_BREAK_PROHIBITED;
713 switch (lbrk_table [last_prop-1] [prop-1])
716 *q = UC_BREAK_POSSIBLE;
719 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
722 *q = UC_BREAK_PROHIBITED;
740 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
742 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
743 const unsigned short *s_end = s + n;
744 int last_prop = LBP_BK; /* line break property of last non-space character */
745 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
746 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
748 /* Don't break inside multibyte characters. */
749 memset (p, UC_BREAK_PROHIBITED, n);
754 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
755 int prop = lbrkprop_lookup (uc);
759 /* Mandatory break. */
760 *p = UC_BREAK_MANDATORY;
769 /* Resolve property values whose behaviour is not fixed. */
773 /* Resolve ambiguous. */
774 prop = LBP_AI_REPLACEMENT;
777 /* This is arbitrary. */
781 /* We don't handle complex scripts yet.
782 Treat LBP_SA like LBP_XX. */
784 /* This is arbitrary. */
789 /* Deal with combining characters. */
793 /* Don't break just before a combining character. */
794 *p = UC_BREAK_PROHIBITED;
795 /* A combining character turns a preceding space into LBP_AL. */
796 if (seen_space != NULL)
799 seen_space = seen_space2;
801 goto lookup_via_table;
804 else if (prop == LBP_SP)
806 /* Don't break just before a space. */
807 *p = UC_BREAK_PROHIBITED;
808 seen_space2 = seen_space;
814 /* prop must be usable as an index for table 7.3 of UTR #14. */
815 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
818 if (last_prop == LBP_BK)
820 /* Don't break at the beginning of a line. */
821 *q = UC_BREAK_PROHIBITED;
825 switch (lbrk_table [last_prop-1] [prop-1])
828 *q = UC_BREAK_POSSIBLE;
831 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
834 *q = UC_BREAK_PROHIBITED;
852 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
854 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
855 const unsigned int *s_end = s + n;
856 int last_prop = LBP_BK; /* line break property of last non-space character */
857 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
858 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
862 unsigned int uc = *s;
863 int prop = lbrkprop_lookup (uc);
867 /* Mandatory break. */
868 *p = UC_BREAK_MANDATORY;
877 /* Resolve property values whose behaviour is not fixed. */
881 /* Resolve ambiguous. */
882 prop = LBP_AI_REPLACEMENT;
885 /* This is arbitrary. */
889 /* We don't handle complex scripts yet.
890 Treat LBP_SA like LBP_XX. */
892 /* This is arbitrary. */
897 /* Deal with combining characters. */
901 /* Don't break just before a combining character. */
902 *p = UC_BREAK_PROHIBITED;
903 /* A combining character turns a preceding space into LBP_AL. */
904 if (seen_space != NULL)
907 seen_space = seen_space2;
909 goto lookup_via_table;
912 else if (prop == LBP_SP)
914 /* Don't break just before a space. */
915 *p = UC_BREAK_PROHIBITED;
916 seen_space2 = seen_space;
922 /* prop must be usable as an index for table 7.3 of UTR #14. */
923 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
926 if (last_prop == LBP_BK)
928 /* Don't break at the beginning of a line. */
929 *q = UC_BREAK_PROHIBITED;
933 switch (lbrk_table [last_prop-1] [prop-1])
936 *q = UC_BREAK_POSSIBLE;
939 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
942 *q = UC_BREAK_PROHIBITED;
960 /* Choose the best line breaks, assuming the uc_width function.
961 Return the column after the end of the string. */
964 u8_width_linebreaks (const unsigned char *s, size_t n,
965 int width, int start_column, int at_end_columns,
966 const char *o, const char *encoding,
969 const unsigned char *s_end;
974 u8_possible_linebreaks (s, n, encoding, p);
978 last_column = start_column;
983 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
985 /* Respect the override. */
986 if (o != NULL && *o != UC_BREAK_UNDEFINED)
989 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
991 /* An atomic piece of text ends here. */
992 if (last_p != NULL && last_column + piece_width > width)
994 /* Insert a line break. */
995 *last_p = UC_BREAK_POSSIBLE;
1000 if (*p == UC_BREAK_MANDATORY)
1002 /* uc is a line break character. */
1003 /* Start a new piece at column 0. */
1010 /* uc is not a line break character. */
1013 if (*p == UC_BREAK_POSSIBLE)
1015 /* Start a new piece. */
1017 last_column += piece_width;
1019 /* No line break for the moment, may be turned into
1020 UC_BREAK_POSSIBLE later, via last_p. */
1023 *p = UC_BREAK_PROHIBITED;
1025 w = uc_width (uc, encoding);
1026 if (w >= 0) /* ignore control characters in the string */
1036 /* The last atomic piece of text ends here. */
1037 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1039 /* Insert a line break. */
1040 *last_p = UC_BREAK_POSSIBLE;
1044 return last_column + piece_width;
1048 u16_width_linebreaks (const unsigned short *s, size_t n,
1049 int width, int start_column, int at_end_columns,
1050 const char *o, const char *encoding,
1053 const unsigned short *s_end;
1058 u16_possible_linebreaks (s, n, encoding, p);
1062 last_column = start_column;
1067 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
1069 /* Respect the override. */
1070 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1073 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1075 /* An atomic piece of text ends here. */
1076 if (last_p != NULL && last_column + piece_width > width)
1078 /* Insert a line break. */
1079 *last_p = UC_BREAK_POSSIBLE;
1084 if (*p == UC_BREAK_MANDATORY)
1086 /* uc is a line break character. */
1087 /* Start a new piece at column 0. */
1094 /* uc is not a line break character. */
1097 if (*p == UC_BREAK_POSSIBLE)
1099 /* Start a new piece. */
1101 last_column += piece_width;
1103 /* No line break for the moment, may be turned into
1104 UC_BREAK_POSSIBLE later, via last_p. */
1107 *p = UC_BREAK_PROHIBITED;
1109 w = uc_width (uc, encoding);
1110 if (w >= 0) /* ignore control characters in the string */
1120 /* The last atomic piece of text ends here. */
1121 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1123 /* Insert a line break. */
1124 *last_p = UC_BREAK_POSSIBLE;
1128 return last_column + piece_width;
1132 u32_width_linebreaks (const unsigned int *s, size_t n,
1133 int width, int start_column, int at_end_columns,
1134 const char *o, const char *encoding,
1137 const unsigned int *s_end;
1142 u32_possible_linebreaks (s, n, encoding, p);
1146 last_column = start_column;
1150 unsigned int uc = *s;
1152 /* Respect the override. */
1153 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1156 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1158 /* An atomic piece of text ends here. */
1159 if (last_p != NULL && last_column + piece_width > width)
1161 /* Insert a line break. */
1162 *last_p = UC_BREAK_POSSIBLE;
1167 if (*p == UC_BREAK_MANDATORY)
1169 /* uc is a line break character. */
1170 /* Start a new piece at column 0. */
1177 /* uc is not a line break character. */
1180 if (*p == UC_BREAK_POSSIBLE)
1182 /* Start a new piece. */
1184 last_column += piece_width;
1186 /* No line break for the moment, may be turned into
1187 UC_BREAK_POSSIBLE later, via last_p. */
1190 *p = UC_BREAK_PROHIBITED;
1192 w = uc_width (uc, encoding);
1193 if (w >= 0) /* ignore control characters in the string */
1203 /* The last atomic piece of text ends here. */
1204 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1206 /* Insert a line break. */
1207 *last_p = UC_BREAK_POSSIBLE;
1211 return last_column + piece_width;
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.

   STREAM is read until end of file.  The result is a freshly allocated
   buffer that the caller owns and should release with free().  Because the
   result is NUL-terminated, NUL bytes inside the input make it look shorter
   to string functions.

   On memory exhaustion or a read error a message is written to stderr and
   the process exits with status 1, so the function never returns NULL.  */
char *
read_file (FILE *stream)
{
  enum { chunk_size = 4096 };
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;

  for (;;)
    {
      size_t count;

      /* Make sure a full chunk fits after the data read so far.  Grow by
         a factor of 1.5 so the number of reallocations stays logarithmic.  */
      if (size + chunk_size > alloc)
        {
          char *grown;

          alloc = alloc + alloc / 2;
          if (alloc < size + chunk_size)
            alloc = size + chunk_size;
          /* Use a temporary so the old buffer is not lost on failure.  */
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }

      count = fread (buf + size, 1, chunk_size, stream);
      size += count;

      /* A short read means end of file or an error; tell them apart.  */
      if (count < chunk_size)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  /* The loop guarantees alloc > size, so the terminator fits.  */
  buf[size] = '\0';

  /* Give back the unused tail.  If shrinking fails the larger buffer is
     still valid, so keep it rather than aborting.  */
  {
    char *trimmed = realloc (buf, size + 1);
    if (trimmed != NULL)
      buf = trimmed;
  }
  return buf;
}
1268 main (int argc, char * argv[])
1272 /* Display all the break opportunities in the input string. */
1273 char *input = read_file (stdin);
1274 int length = strlen (input);
1275 char *breaks = malloc (length);
1278 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1280 for (i = 0; i < length; i++)
1284 case UC_BREAK_POSSIBLE:
1285 /* U+2027 in UTF-8 encoding */
1286 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1288 case UC_BREAK_MANDATORY:
1289 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1290 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1292 case UC_BREAK_PROHIBITED:
1297 putc (input[i], stdout);
1306 /* Insert line breaks for a given width. */
1307 int width = atoi (argv[1]);
1308 char *input = read_file (stdin);
1309 int length = strlen (input);
1310 char *breaks = malloc (length);
1313 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1315 for (i = 0; i < length; i++)
1319 case UC_BREAK_POSSIBLE:
1320 putc ('\n', stdout);
1322 case UC_BREAK_MANDATORY:
1324 case UC_BREAK_PROHIBITED:
1329 putc (input[i], stdout);
1343 /* Now the same thing with an arbitrary encoding.
1345 We convert the input string to Unicode.
1347 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1348 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1349 \U0000FFFF. UTF-16 and variants support only characters up to
1350 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1351 The UCS-4 specification leaves doubts about endianness and byte order mark.
1352 glibc currently interprets it as big endian without byte order mark,
1353 but this is not backed by an RFC. So we use UTF-8. It supports
1354 characters up to \U7FFFFFFF and is unambiguously defined. */
1361 /* Luckily, the encoding's name is platform independent. */
1362 #define UTF8_NAME "UTF-8"
1364 /* Return the length of a string after conversion through an iconv_t. */
1366 iconv_string_length (iconv_t cd, const char *s, size_t n)
1368 #define TMPBUFSIZE 4096
1370 char tmpbuf[TMPBUFSIZE];
1371 const char *inptr = s;
1375 char *outptr = tmpbuf;
1376 size_t outsize = TMPBUFSIZE;
1377 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1378 if (res == (size_t)(-1) && errno != E2BIG)
1379 return (size_t)(-1);
1380 count += outptr - tmpbuf;
1382 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1383 #if defined _LIBICONV_VERSION \
1384 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1386 char *outptr = tmpbuf;
1387 size_t outsize = TMPBUFSIZE;
1388 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1389 if (res == (size_t)(-1))
1390 return (size_t)(-1);
1391 count += outptr - tmpbuf;
1393 /* Return to the initial state. */
1394 iconv (cd, NULL, NULL, NULL, NULL);
1401 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1402 size_t *offtable, char *t, size_t m)
1409 /* Avoid glibc-2.1 bug. */
1410 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1411 const size_t extra = 1;
1413 const size_t extra = 0;
1416 for (i = 0; i < n; i++)
1417 offtable[i] = (size_t)(-1);
1422 outsize = m + extra;
1423 while (inptr < s_end)
1425 const char *saved_inptr;
1429 offtable[inptr - s] = outptr - t;
1431 saved_inptr = inptr;
1433 for (insize = 1; inptr + insize <= s_end; insize++)
1435 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1436 if (!(res == (size_t)(-1) && errno == EINVAL))
1438 /* We expect that no input bytes have been consumed so far. */
1439 if (inptr != saved_inptr)
1442 /* After we verified the convertibility and computed the translation's
1443 size m, there shouldn't be any conversion error here. */
1444 if (res == (size_t)(-1))
1447 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1448 #if defined _LIBICONV_VERSION \
1449 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1450 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1453 /* We should have produced exactly m output bytes. */
1454 if (outsize != extra)
1458 #endif /* HAVE_ICONV */
1462 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
1463 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
1465 is_all_ascii (const char *s, size_t n)
1467 for (; n > 0; s++, n--)
1469 unsigned char c = (unsigned char) *s;
1471 if (!(c_isprint (c) || c_isspace (c)))
1477 #endif /* C_CTYPE_ASCII */
1480 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1485 if (is_utf8_encoding (encoding))
1486 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1491 /* Avoid glibc-2.1 bug with EUC-KR. */
1492 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1493 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1494 to_utf8 = (iconv_t)(-1);
1497 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1499 # if defined __sun && !defined _LIBICONV_VERSION
1500 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1501 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1502 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1503 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1504 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1505 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1506 to_utf8 = (iconv_t)(-1);
1509 to_utf8 = iconv_open (UTF8_NAME, encoding);
1510 if (to_utf8 != (iconv_t)(-1))
1512 /* Determine the length of the resulting UTF-8 string. */
1513 size_t m = iconv_string_length (to_utf8, s, n);
1514 if (m != (size_t)(-1))
1516 /* Convert the string to UTF-8 and build a translation table
1517 from offsets into s to offsets into the translated string. */
1518 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1520 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1523 size_t *offtable = (size_t *) memory;
1524 char *t = (char *) (offtable + n);
1525 char *q = (char *) (t + m);
1528 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1530 /* Determine the possible line breaks of the UTF-8 string. */
1531 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1533 /* Translate the result back to the original string. */
1534 memset (p, UC_BREAK_PROHIBITED, n);
1535 for (i = 0; i < n; i++)
1536 if (offtable[i] != (size_t)(-1))
1537 p[i] = q[offtable[i]];
1540 iconv_close (to_utf8);
1544 iconv_close (to_utf8);
1547 /* Impossible to convert. */
1549 if (is_all_ascii (s, n))
1551 /* ASCII is a subset of UTF-8. */
1552 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1556 /* We have a non-ASCII string and cannot convert it.
1557 Don't produce line breaks except those already present in the
1558 input string. All we assume here is that the encoding is
1559 minimally ASCII compatible. */
1561 const char *s_end = s + n;
1564 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1573 mbs_width_linebreaks (const char *s, size_t n,
1574 int width, int start_column, int at_end_columns,
1575 const char *o, const char *encoding,
1579 return start_column;
1580 if (is_utf8_encoding (encoding))
1581 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1586 /* Avoid glibc-2.1 bug with EUC-KR. */
1587 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1588 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1589 to_utf8 = (iconv_t)(-1);
1592 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1594 # if defined __sun && !defined _LIBICONV_VERSION
1595 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1596 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1597 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1598 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1599 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1600 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1601 to_utf8 = (iconv_t)(-1);
1604 to_utf8 = iconv_open (UTF8_NAME, encoding);
1605 if (to_utf8 != (iconv_t)(-1))
1607 /* Determine the length of the resulting UTF-8 string. */
1608 size_t m = iconv_string_length (to_utf8, s, n);
1609 if (m != (size_t)(-1))
1611 /* Convert the string to UTF-8 and build a translation table
1612 from offsets into s to offsets into the translated string. */
1613 size_t memory_size =
1614 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1615 (o != NULL ? m : 0));
1618 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1621 size_t *offtable = (size_t *) memory;
1622 char *t = (char *) (offtable + n);
1623 char *q = (char *) (t + m);
1624 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1628 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1630 /* Translate the overrides to the UTF-8 string. */
1633 memset (o8, UC_BREAK_UNDEFINED, m);
1634 for (i = 0; i < n; i++)
1635 if (offtable[i] != (size_t)(-1))
1636 o8[offtable[i]] = o[i];
1639 /* Determine the line breaks of the UTF-8 string. */
1641 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1643 /* Translate the result back to the original string. */
1644 memset (p, UC_BREAK_PROHIBITED, n);
1645 for (i = 0; i < n; i++)
1646 if (offtable[i] != (size_t)(-1))
1647 p[i] = q[offtable[i]];
1650 iconv_close (to_utf8);
1654 iconv_close (to_utf8);
1657 /* Impossible to convert. */
1659 if (is_all_ascii (s, n))
1661 /* ASCII is a subset of UTF-8. */
1662 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1665 /* We have a non-ASCII string and cannot convert it.
1666 Don't produce line breaks except those already present in the
1667 input string. All we assume here is that the encoding is
1668 minimally ASCII compatible. */
1670 const char *s_end = s + n;
1673 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1674 ? UC_BREAK_MANDATORY
1675 : UC_BREAK_PROHIBITED);
1681 /* We cannot compute widths in this case. */
1682 return start_column;
1693 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
1696 read_file (FILE *stream)
1698 #define BUFSIZE 4096
1704 while (! feof (stream))
1706 if (size + BUFSIZE > alloc)
1708 alloc = alloc + alloc / 2;
1709 if (alloc < size + BUFSIZE)
1710 alloc = size + BUFSIZE;
1711 buf = realloc (buf, alloc);
1714 fprintf (stderr, "out of memory\n");
1718 count = fread (buf + size, 1, BUFSIZE, stream);
1721 if (ferror (stream))
1730 buf = realloc (buf, size + 1);
1733 fprintf (stderr, "out of memory\n");
1742 main (int argc, char * argv[])
1744 setlocale (LC_CTYPE, "");
1747 /* Display all the break opportunities in the input string. */
1748 char *input = read_file (stdin);
1749 int length = strlen (input);
1750 char *breaks = malloc (length);
1753 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1755 for (i = 0; i < length; i++)
1759 case UC_BREAK_POSSIBLE:
1762 case UC_BREAK_MANDATORY:
1764 case UC_BREAK_PROHIBITED:
1769 putc (input[i], stdout);
1778 /* Insert line breaks for a given width. */
1779 int width = atoi (argv[1]);
1780 char *input = read_file (stdin);
1781 int length = strlen (input);
1782 char *breaks = malloc (length);
1785 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1787 for (i = 0; i < length; i++)
1791 case UC_BREAK_POSSIBLE:
1792 putc ('\n', stdout);
1794 case UC_BREAK_MANDATORY:
1796 case UC_BREAK_PROHIBITED:
1801 putc (input[i], stdout);