1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24 #include "linebreak.h"
30 #include "utf8-ucs4.h"
32 #include "utf16-ucs4.h"
36 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
44 /* Help GCC to generate good code for string comparisons with
46 #if defined (__GNUC__) && defined (__OPTIMIZE__)
49 streq9 (const char *s1, const char *s2)
51 return strcmp (s1 + 9, s2 + 9) == 0;
55 streq8 (const char *s1, const char *s2, char s28)
62 return streq9 (s1, s2);
69 streq7 (const char *s1, const char *s2, char s27, char s28)
76 return streq8 (s1, s2, s28);
83 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
90 return streq7 (s1, s2, s27, s28);
97 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
104 return streq6 (s1, s2, s26, s27, s28);
111 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
118 return streq5 (s1, s2, s25, s26, s27, s28);
125 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
132 return streq4 (s1, s2, s24, s25, s26, s27, s28);
139 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
146 return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
153 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
160 return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
167 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
174 return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
180 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
181 streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
185 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
186 (strcmp (s1, s2) == 0)
192 is_cjk_encoding (const char *encoding)
195 /* Legacy Japanese encodings */
196 || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
197 /* Legacy Chinese encodings */
198 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
199 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
200 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
201 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
202 /* Legacy Korean encodings */
203 || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
204 || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
205 || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
211 is_utf8_encoding (const char *encoding)
213 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
219 /* Determine number of column positions required for UC.
   Forward declaration; the definition appears after the non-spacing
   tables.  ENCODING is handed to is_cjk_encoding by the definition.
   The callers in this file skip a negative result, which they treat as
   "control character".  */
220 int uc_width (unsigned int uc, const char *encoding);
223 * Non-spacing attribute table.
225 * - Non-spacing characters; generated from PropList.txt or
226 * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
227 * - Format control characters; generated from
228 * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
229 * - Zero width characters; generated from
230 * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
232 static const unsigned char nonspacing_table_data[16*64] = {
234 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
235 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
236 0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
237 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
238 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
239 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
240 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
241 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
243 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
244 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
245 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
246 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
247 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
248 0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
249 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
250 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
252 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
253 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
254 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
255 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
256 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
257 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
258 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
259 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
261 0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
262 0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
263 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
264 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
265 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
266 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
267 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
268 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
270 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
271 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
272 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
273 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
274 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
275 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
276 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
277 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
279 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
280 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
281 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
282 0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
283 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
284 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
285 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
286 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
288 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
289 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
290 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
291 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
292 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
293 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
294 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
295 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
297 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
298 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
299 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
300 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
301 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
302 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
303 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
304 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
306 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
307 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
308 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
309 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
310 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
311 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
312 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
313 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
315 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
316 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
317 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
318 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
319 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
320 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
321 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
322 0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
324 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
325 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
326 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
327 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
328 0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
329 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
330 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
331 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
333 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
334 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
335 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
336 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
337 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
338 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
339 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
340 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
342 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
343 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
344 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
345 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
346 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
347 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
348 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
349 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
351 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
352 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
353 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
354 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
355 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
356 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
357 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
358 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
360 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
361 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
362 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
363 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
365 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
366 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
367 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
368 /* 0x1d000-0x1d1ff */
369 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
370 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
371 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
373 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
374 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
375 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
376 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */
/* Index table for nonspacing_table_data.
   uc_width reads entry [uc >> 9], so there is one entry per range of 512
   code points (eight entries per row below, each row covering 0x1000 code
   points).  A non-negative entry IND selects the 64-byte bitmap that
   starts at nonspacing_table_data[64*IND].
   NOTE(review): -1 presumably marks ranges that have no bitmap; the test
   that skips such entries is not visible next to the lookup -- confirm.  */
378 static const signed char nonspacing_table_ind[240] = {
379 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */
380 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */
381 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
382 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
383 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
384 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
385 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
386 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
387 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
388 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
389 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
390 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
391 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
392 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
393 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
394 -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
395 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
396 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
397 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
398 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
399 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
400 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
401 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
402 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
403 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
404 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
405 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
406 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
407 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
408 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */
411 /* Determine number of column positions required for UC.
   ENCODING is consulted, through is_cjk_encoding, for code points from
   U+00A1 up to but excluding U+FF61, other than U+20A9.  */
413 uc_width (unsigned int uc, const char *encoding)
415 /* Test for non-spacing or control character. */
/* nonspacing_table_ind holds one entry per range of 512 code points.  */
418 int ind = nonspacing_table_ind[uc >> 9];
/* The selected 64-byte block is a bitmap: byte (uc >> 3) & 63 within the
   block, bit uc & 7 within that byte.  */
420 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
/* Code points U+0001..U+009F.  */
422 if (uc > 0 && uc < 0xa0)
/* Code points in the 512-wide range that contains U+E0000.  */
428 else if ((uc >> 9) == (0xe0000 >> 9))
/* This arm is true only for U+E0001 and for U+E0020..U+E007F.  */
431 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
435 /* Test for double-width character.
436 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
437 * and "grep '^....;[^WF]' EastAsianWidth.txt"
440 && ((uc < 0x1160) /* Hangul Jamo */
441 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
443 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
444 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
445 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
446 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
447 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
448 || (uc >= 0xffe0 && uc < 0xffe7)
449 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
450 || (uc >= 0x30000 && uc <= 0x3fffd)
453 /* In ancient CJK encodings, Cyrillic and most other characters are
454 double-width as well. */
455 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
456 && is_cjk_encoding (encoding))
462 /* Determine number of column positions required for first N units
463 (or fewer if S ends before this) in S.
   S is decoded one character at a time with u8_mbtouc, and each character
   is measured with uc_width under ENCODING.  Characters for which
   uc_width returns a negative value are ignored.  */
466 u8_width (const unsigned char *s, size_t n, const char *encoding)
468 const unsigned char *s_end = s + n;
/* Decode the next character and advance S by the value u8_mbtouc
   returns.  */
476 s += u8_mbtouc (&uc, s, s_end - s);
479 break; /* end of string reached */
481 w = uc_width (uc, encoding);
482 if (w >= 0) /* ignore control characters in the string */
/* Determine number of column positions required for the first N 16-bit
   units (or fewer if S ends before this) in S.
   S is decoded one character at a time with u16_mbtouc, and each
   character is measured with uc_width under ENCODING.  Characters for
   which uc_width returns a negative value are ignored.  */
490 u16_width (const unsigned short *s, size_t n, const char *encoding)
492 const unsigned short *s_end = s + n;
/* Decode the next character and advance S by the value u16_mbtouc
   returns.  */
500 s += u16_mbtouc (&uc, s, s_end - s);
503 break; /* end of string reached */
505 w = uc_width (uc, encoding);
506 if (w >= 0) /* ignore control characters in the string */
/* Determine number of column positions required for the first N units
   (or fewer if S ends before this) in S.
   Every unit of S is taken as one character and measured with uc_width
   under ENCODING.  Characters for which uc_width returns a negative
   value are ignored.  */
514 u32_width (const unsigned int *s, size_t n, const char *encoding)
516 const unsigned int *s_end = s + n;
/* No decoding step: one unit is one character.  */
521 unsigned int uc = *s++;
525 break; /* end of string reached */
527 w = uc_width (uc, encoding);
528 if (w >= 0) /* ignore control characters in the string */
536 /* Determine the line break points in S, and store the result at p[0..n-1]. */
537 /* We don't support line breaking of complex-context dependent characters
538 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
540 /* Line breaking classification. */
544 /* Values >= 20 are resolved at run time. */
545 LBP_BK = 0, /* mandatory break */
546 /*LBP_CR, carriage return - not used here because it's a DOSism */
547 /*LBP_LF, line feed - not used here because it's a DOSism */
548 LBP_CM = 20, /* attached characters and combining marks */
549 /*LBP_SG, surrogates - not used here because they are not characters */
550 LBP_ZW = 1, /* zero width space */
551 LBP_IN = 2, /* inseparable */
552 LBP_GL = 3, /* non-breaking (glue) */
553 LBP_CB = 22, /* contingent break opportunity */
554 LBP_SP = 21, /* space */
555 LBP_BA = 4, /* break opportunity after */
556 LBP_BB = 5, /* break opportunity before */
557 LBP_B2 = 6, /* break opportunity before and after */
558 LBP_HY = 7, /* hyphen */
559 LBP_NS = 8, /* non starter */
560 LBP_OP = 9, /* opening punctuation */
561 LBP_CL = 10, /* closing punctuation */
562 LBP_QU = 11, /* ambiguous quotation */
563 LBP_EX = 12, /* exclamation/interrogation */
564 LBP_ID = 13, /* ideographic */
565 LBP_NU = 14, /* numeric */
566 LBP_IS = 15, /* infix separator (numeric) */
567 LBP_SY = 16, /* symbols allowing breaks */
568 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
569 LBP_PR = 18, /* prefix (numeric) */
570 LBP_PO = 19, /* postfix (numeric) */
571 LBP_SA = 23, /* complex context (South East Asian) */
572 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
573 LBP_XX = 25 /* unknown */
576 #include "lbrkprop.h"
/* Return the line breaking property stored for UC in the three-level
   table `lbrkprop' (presumably provided by "lbrkprop.h", included just
   above).
   NOTE(review): confirm what is returned for code points that the table
   does not cover; that path is not shown beside the lookups.  */
578 static inline unsigned char
579 lbrkprop_lookup (unsigned int uc)
/* Level 1 is indexed by UC shifted right by lbrkprop_header_0.  */
581 unsigned int index1 = uc >> lbrkprop_header_0;
/* Only indices below lbrkprop_header_1 are looked up.  */
582 if (index1 < lbrkprop_header_1)
/* The level-1 entry is used as a base offset into level2.  */
584 int lookup1 = lbrkprop.level1[index1];
/* Level 2 is indexed by UC shifted right by lbrkprop_header_2 and masked
   with lbrkprop_header_3.  */
587 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
/* The level-2 entry is in turn a base offset into level3.  */
588 int lookup2 = lbrkprop.level2[lookup1 + index2];
/* Level 3 is indexed by UC masked with lbrkprop_header_4; the entry found
   there is the result.  */
591 unsigned int index3 = uc & lbrkprop_header_4;
592 return lbrkprop.level3[lookup2 + index3];
599 /* Table indexed by two line breaking classifications. */
600 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
601 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
602 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
603 static const unsigned char lbrk_table[19][19] = {
605 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
606 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
607 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
608 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
609 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
610 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
611 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
612 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
613 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
614 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
615 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
616 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
617 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
618 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
619 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
620 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
621 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
622 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
623 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
624 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
628 /* Note: The (B2,B2) entry should probably be D instead of P. */
629 /* Note: The (PR,ID) entry should probably be D instead of I. */
/* Determine the line break points in the UTF-8 string S of N units and
   store one UC_BREAK_* value per unit in P.
   ENCODING decides how ambiguous (LBP_AI) characters are resolved: as
   LBP_ID when is_cjk_encoding accepts it, otherwise as LBP_AL.  */
632 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
634 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
635 const unsigned char *s_end = s + n;
636 int last_prop = LBP_BK; /* line break property of last non-space character */
637 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
638 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
640 /* Don't break inside multibyte characters. */
641 memset (p, UC_BREAK_PROHIBITED, n);
/* Decode the next character and look up its line breaking class.  */
646 int count = u8_mbtouc (&uc, s, s_end - s);
647 int prop = lbrkprop_lookup (uc);
651 /* Mandatory break. */
652 *p = UC_BREAK_MANDATORY;
661 /* Resolve property values whose behaviour is not fixed. */
665 /* Resolve ambiguous. */
666 prop = LBP_AI_REPLACEMENT;
669 /* This is arbitrary. */
673 /* We don't handle complex scripts yet.
674 Treat LBP_SA like LBP_XX. */
676 /* This is arbitrary. */
681 /* Deal with combining characters. */
685 /* Don't break just before a combining character. */
686 *p = UC_BREAK_PROHIBITED;
687 /* A combining character turns a preceding space into LBP_AL. */
688 if (seen_space != NULL)
691 seen_space = seen_space2;
693 goto lookup_via_table;
696 else if (prop == LBP_SP)
698 /* Don't break just before a space. */
699 *p = UC_BREAK_PROHIBITED;
700 seen_space2 = seen_space;
706 /* prop must be usable as an index for table 7.3 of UTR #14. */
707 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
710 if (last_prop == LBP_BK)
712 /* Don't break at the beginning of a line. */
713 *q = UC_BREAK_PROHIBITED;
/* The classes used as table indices start at 1 (LBP_ZW), while the table
   rows and columns start at 0; hence the -1 on both indices.  */
717 switch (lbrk_table [last_prop-1] [prop-1])
720 *q = UC_BREAK_POSSIBLE;
/* Here a break is possible only if a space was seen after the last
   non-space character.  */
723 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
726 *q = UC_BREAK_PROHIBITED;
/* Determine the line break points in S, a string of N 16-bit units that
   is decoded with u16_mbtouc, and store one UC_BREAK_* value per unit
   in P.
   ENCODING decides how ambiguous (LBP_AI) characters are resolved: as
   LBP_ID when is_cjk_encoding accepts it, otherwise as LBP_AL.  */
744 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
746 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
747 const unsigned short *s_end = s + n;
748 int last_prop = LBP_BK; /* line break property of last non-space character */
749 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
750 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
752 /* Don't break inside multibyte characters. */
753 memset (p, UC_BREAK_PROHIBITED, n);
/* Decode the next character and look up its line breaking class.  */
758 int count = u16_mbtouc (&uc, s, s_end - s);
759 int prop = lbrkprop_lookup (uc);
763 /* Mandatory break. */
764 *p = UC_BREAK_MANDATORY;
773 /* Resolve property values whose behaviour is not fixed. */
777 /* Resolve ambiguous. */
778 prop = LBP_AI_REPLACEMENT;
781 /* This is arbitrary. */
785 /* We don't handle complex scripts yet.
786 Treat LBP_SA like LBP_XX. */
788 /* This is arbitrary. */
793 /* Deal with combining characters. */
797 /* Don't break just before a combining character. */
798 *p = UC_BREAK_PROHIBITED;
799 /* A combining character turns a preceding space into LBP_AL. */
800 if (seen_space != NULL)
803 seen_space = seen_space2;
805 goto lookup_via_table;
808 else if (prop == LBP_SP)
810 /* Don't break just before a space. */
811 *p = UC_BREAK_PROHIBITED;
812 seen_space2 = seen_space;
818 /* prop must be usable as an index for table 7.3 of UTR #14. */
819 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
822 if (last_prop == LBP_BK)
824 /* Don't break at the beginning of a line. */
825 *q = UC_BREAK_PROHIBITED;
/* The classes used as table indices start at 1 (LBP_ZW), while the table
   rows and columns start at 0; hence the -1 on both indices.  */
829 switch (lbrk_table [last_prop-1] [prop-1])
832 *q = UC_BREAK_POSSIBLE;
/* Here a break is possible only if a space was seen after the last
   non-space character.  */
835 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
838 *q = UC_BREAK_PROHIBITED;
/* Determine the line break points in S, a string of N units in which
   every unit is one character, and store one UC_BREAK_* value per unit
   in P.
   ENCODING decides how ambiguous (LBP_AI) characters are resolved: as
   LBP_ID when is_cjk_encoding accepts it, otherwise as LBP_AL.  */
856 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
858 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
859 const unsigned int *s_end = s + n;
860 int last_prop = LBP_BK; /* line break property of last non-space character */
861 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
862 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
/* No decoding step: the unit itself is the character to classify.  */
866 unsigned int uc = *s;
867 int prop = lbrkprop_lookup (uc);
871 /* Mandatory break. */
872 *p = UC_BREAK_MANDATORY;
881 /* Resolve property values whose behaviour is not fixed. */
885 /* Resolve ambiguous. */
886 prop = LBP_AI_REPLACEMENT;
889 /* This is arbitrary. */
893 /* We don't handle complex scripts yet.
894 Treat LBP_SA like LBP_XX. */
896 /* This is arbitrary. */
901 /* Deal with combining characters. */
905 /* Don't break just before a combining character. */
906 *p = UC_BREAK_PROHIBITED;
907 /* A combining character turns a preceding space into LBP_AL. */
908 if (seen_space != NULL)
911 seen_space = seen_space2;
913 goto lookup_via_table;
916 else if (prop == LBP_SP)
918 /* Don't break just before a space. */
919 *p = UC_BREAK_PROHIBITED;
920 seen_space2 = seen_space;
926 /* prop must be usable as an index for table 7.3 of UTR #14. */
927 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
930 if (last_prop == LBP_BK)
932 /* Don't break at the beginning of a line. */
933 *q = UC_BREAK_PROHIBITED;
/* The classes used as table indices start at 1 (LBP_ZW), while the table
   rows and columns start at 0; hence the -1 on both indices.  */
937 switch (lbrk_table [last_prop-1] [prop-1])
940 *q = UC_BREAK_POSSIBLE;
/* Here a break is possible only if a space was seen after the last
   non-space character.  */
943 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
946 *q = UC_BREAK_PROHIBITED;
964 /* Choose the best line breaks, assuming the uc_width function.
965 Return the column after the end of the string.
   The break opportunities of S (N units) are first computed into P with
   u8_possible_linebreaks.  Each opportunity is then reset to
   UC_BREAK_PROHIBITED and re-enabled through last_p only when
   last_column + piece_width exceeds WIDTH.
   START_COLUMN is the initial value of last_column.
   AT_END_COLUMNS is added to the column count when the final piece is
   tested against WIDTH.
   O, when non-NULL, is an override that is respected wherever it is not
   UC_BREAK_UNDEFINED.
   ENCODING is passed on to u8_possible_linebreaks and uc_width.  */
968 u8_width_linebreaks (const unsigned char *s, size_t n,
969 int width, int start_column, int at_end_columns,
970 const char *o, const char *encoding,
973 const unsigned char *s_end;
978 u8_possible_linebreaks (s, n, encoding, p);
982 last_column = start_column;
/* Decode the character that starts at the current unit.  */
987 int count = u8_mbtouc (&uc, s, s_end - s);
989 /* Respect the override. */
990 if (o != NULL && *o != UC_BREAK_UNDEFINED)
993 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
995 /* An atomic piece of text ends here. */
996 if (last_p != NULL && last_column + piece_width > width)
998 /* Insert a line break. */
999 *last_p = UC_BREAK_POSSIBLE;
1004 if (*p == UC_BREAK_MANDATORY)
1006 /* uc is a line break character. */
1007 /* Start a new piece at column 0. */
1014 /* uc is not a line break character. */
1017 if (*p == UC_BREAK_POSSIBLE)
1019 /* Start a new piece. */
1021 last_column += piece_width;
1023 /* No line break for the moment, may be turned into
1024 UC_BREAK_POSSIBLE later, via last_p. */
1027 *p = UC_BREAK_PROHIBITED;
1029 w = uc_width (uc, encoding);
1030 if (w >= 0) /* ignore control characters in the string */
1040 /* The last atomic piece of text ends here. */
1041 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1043 /* Insert a line break. */
1044 *last_p = UC_BREAK_POSSIBLE;
1048 return last_column + piece_width;
/* Choose the line breaks for S, a string of N 16-bit units, measuring
   characters with uc_width.  Return the column after the end of the
   string.
   The break opportunities are first computed into P with
   u16_possible_linebreaks.  Each opportunity is then reset to
   UC_BREAK_PROHIBITED and re-enabled through last_p only when
   last_column + piece_width exceeds WIDTH.
   START_COLUMN is the initial value of last_column.
   AT_END_COLUMNS is added to the column count when the final piece is
   tested against WIDTH.
   O, when non-NULL, is an override that is respected wherever it is not
   UC_BREAK_UNDEFINED.
   ENCODING is passed on to u16_possible_linebreaks and uc_width.  */
1052 u16_width_linebreaks (const unsigned short *s, size_t n,
1053 int width, int start_column, int at_end_columns,
1054 const char *o, const char *encoding,
1057 const unsigned short *s_end;
1062 u16_possible_linebreaks (s, n, encoding, p);
1066 last_column = start_column;
/* Decode the character that starts at the current unit.  */
1071 int count = u16_mbtouc (&uc, s, s_end - s);
1073 /* Respect the override. */
1074 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1077 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1079 /* An atomic piece of text ends here. */
1080 if (last_p != NULL && last_column + piece_width > width)
1082 /* Insert a line break. */
1083 *last_p = UC_BREAK_POSSIBLE;
1088 if (*p == UC_BREAK_MANDATORY)
1090 /* uc is a line break character. */
1091 /* Start a new piece at column 0. */
1098 /* uc is not a line break character. */
1101 if (*p == UC_BREAK_POSSIBLE)
1103 /* Start a new piece. */
1105 last_column += piece_width;
1107 /* No line break for the moment, may be turned into
1108 UC_BREAK_POSSIBLE later, via last_p. */
1111 *p = UC_BREAK_PROHIBITED;
1113 w = uc_width (uc, encoding);
1114 if (w >= 0) /* ignore control characters in the string */
1124 /* The last atomic piece of text ends here. */
1125 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1127 /* Insert a line break. */
1128 *last_p = UC_BREAK_POSSIBLE;
1132 return last_column + piece_width;
/* Choose the line breaks for S, a string of N units in which every unit
   is one character, measuring characters with uc_width.  Return the
   column after the end of the string.
   The break opportunities are first computed into P with
   u32_possible_linebreaks.  Each opportunity is then reset to
   UC_BREAK_PROHIBITED and re-enabled through last_p only when
   last_column + piece_width exceeds WIDTH.
   START_COLUMN is the initial value of last_column.
   AT_END_COLUMNS is added to the column count when the final piece is
   tested against WIDTH.
   O, when non-NULL, is an override that is respected wherever it is not
   UC_BREAK_UNDEFINED.
   ENCODING is passed on to u32_possible_linebreaks and uc_width.  */
1136 u32_width_linebreaks (const unsigned int *s, size_t n,
1137 int width, int start_column, int at_end_columns,
1138 const char *o, const char *encoding,
1141 const unsigned int *s_end;
1146 u32_possible_linebreaks (s, n, encoding, p);
1150 last_column = start_column;
/* No decoding step: the unit itself is the character.  */
1154 unsigned int uc = *s;
1156 /* Respect the override. */
1157 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1160 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1162 /* An atomic piece of text ends here. */
1163 if (last_p != NULL && last_column + piece_width > width)
1165 /* Insert a line break. */
1166 *last_p = UC_BREAK_POSSIBLE;
1171 if (*p == UC_BREAK_MANDATORY)
1173 /* uc is a line break character. */
1174 /* Start a new piece at column 0. */
1181 /* uc is not a line break character. */
1184 if (*p == UC_BREAK_POSSIBLE)
1186 /* Start a new piece. */
1188 last_column += piece_width;
1190 /* No line break for the moment, may be turned into
1191 UC_BREAK_POSSIBLE later, via last_p. */
1194 *p = UC_BREAK_PROHIBITED;
1196 w = uc_width (uc, encoding);
1197 if (w >= 0) /* ignore control characters in the string */
1207 /* The last atomic piece of text ends here. */
1208 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1210 /* Insert a line break. */
1211 *last_p = UC_BREAK_POSSIBLE;
1215 return last_column + piece_width;
1223 /* Read the contents of an input stream, and return it, terminated with a NUL
1226 read_file (FILE *stream)
1228 #define BUFSIZE 4096
1234 while (! feof (stream))
1236 if (size + BUFSIZE > alloc)
1238 alloc = alloc + alloc / 2;
1239 if (alloc < size + BUFSIZE)
1240 alloc = size + BUFSIZE;
1241 buf = realloc (buf, alloc);
1244 fprintf (stderr, "out of memory\n");
1248 count = fread (buf + size, 1, BUFSIZE, stream);
1251 if (ferror (stream))
1260 buf = realloc (buf, size + 1);
1263 fprintf (stderr, "out of memory\n");
1272 main (int argc, char * argv[])
1276 /* Display all the break opportunities in the input string. */
/* Test driver, first mode.  All of stdin is loaded with read_file and
   handed to u8_possible_linebreaks with the encoding name "UTF-8"; breaks
   has room for one entry per input byte.  The loop then walks the input
   byte by byte: the UC_BREAK_POSSIBLE case writes U+2027, the
   UC_BREAK_MANDATORY case writes U+21B2, and putc (input[i], stdout)
   writes the byte itself, so the break opportunities become visible in
   the output.
   NOTE(review): the test that selects this mode, the switch header and
   the declaration of i are missing from this listing.  length is an int
   although strlen returns size_t, and no check of the malloc result is
   visible; confirm both are acceptable for a test program.  */
1277 char *input = read_file (stdin);
1278 int length = strlen (input);
1279 char *breaks = malloc (length);
1282 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1284 for (i = 0; i < length; i++)
1288 case UC_BREAK_POSSIBLE:
1289 /* U+2027 in UTF-8 encoding */
1290 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1292 case UC_BREAK_MANDATORY:
1293 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1294 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1296 case UC_BREAK_PROHIBITED:
1301 putc (input[i], stdout);
1310 /* Insert line breaks for a given width. */
/* Test driver, second mode.  argv[1] is parsed with atoi as the line
   width.  u8_width_linebreaks is called with 0 for the start column, 0
   for the columns reserved at the end, and NULL for the override array.
   In the output loop the UC_BREAK_POSSIBLE case writes a newline, and
   putc (input[i], stdout) writes each input byte.
   NOTE(review): atoi reports no errors, so a non-numeric argv[1] gives
   width 0; presumably argc is tested on a line missing from this
   listing before argv[1] is read.  */
1311 int width = atoi (argv[1]);
1312 char *input = read_file (stdin);
1313 int length = strlen (input);
1314 char *breaks = malloc (length);
1317 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1319 for (i = 0; i < length; i++)
1323 case UC_BREAK_POSSIBLE:
1324 putc ('\n', stdout);
1326 case UC_BREAK_MANDATORY:
1328 case UC_BREAK_PROHIBITED:
1333 putc (input[i], stdout);
1347 /* Now the same thing with an arbitrary encoding.
1349 We convert the input string to Unicode.
1351 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1352 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1353 \U0000FFFF. UTF-16 and variants support only characters up to
1354 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1355 UCS-4 specification leaves doubts about endianness and byte order mark.
1356 glibc currently interprets it as big endian without byte order mark,
1357 but this is not backed by an RFC. So we use UTF-8. It supports
1358 characters up to \U7FFFFFFF and is unambiguously defined. */
1365 /* Luckily, the encoding's name is platform independent. */
1366 #define UTF8_NAME "UTF-8"
1368 /* Return the length of a string after conversion through an iconv_t. */
/* CD is an open conversion descriptor; S and N give the input bytes.  The
   converted text is not kept: every iconv call writes into the scratch
   array tmpbuf, and only the number of bytes produced is added to count.
   (size_t)(-1) is returned when iconv fails for a reason other than
   E2BIG, or when the flush call near the end fails.  The last visible
   statement resets CD to its initial state.
   NOTE(review): the declarations of count and insize, the loop around
   the first iconv call and the final return are missing from this
   listing; count is presumably the value returned on success.  */
1370 iconv_string_length (iconv_t cd, const char *s, size_t n)
1372 #define TMPBUFSIZE 4096
1374 char tmpbuf[TMPBUFSIZE];
1375 const char *inptr = s;
1379 char *outptr = tmpbuf;
1380 size_t outsize = TMPBUFSIZE;
1381 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* E2BIG only reports that tmpbuf has no room left.  Because the output
   is thrown away this is not treated as a failure: the bytes written so
   far are still counted below.  */
1382 if (res == (size_t)(-1) && errno != E2BIG)
1383 return (size_t)(-1);
1384 count += outptr - tmpbuf;
1386 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1387 #if defined _LIBICONV_VERSION \
1388 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* A NULL input pointer asks iconv to emit whatever is needed to get back
   to the initial shift state; those bytes are part of the converted
   string, so they are counted as well.  */
1390 char *outptr = tmpbuf;
1391 size_t outsize = TMPBUFSIZE;
1392 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1393 if (res == (size_t)(-1))
1394 return (size_t)(-1);
1395 count += outptr - tmpbuf;
1397 /* Return to the initial state. */
1398 iconv (cd, NULL, NULL, NULL, NULL);
/* iconv_string_keeping_offsets: convert the N bytes at S through CD into
   the buffer T and record in OFFTABLE where each conversion step began.
   At the top of every pass of the outer loop, offtable[inptr - s] receives
   the current output offset outptr - t; the entries for all other input
   offsets keep the (size_t)(-1) they are initialised with.  M is the
   expected size of the output: iconv is told it has m + extra bytes of
   room, and the last test compares the room left over against extra.
   NOTE(review): local declarations and the statements controlled by the
   consistency checks (inptr != saved_inptr, res == (size_t)(-1), a failed
   flush, outsize != extra) are missing from this listing; presumably
   they abort.  */
1405 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1406 size_t *offtable, char *t, size_t m)
1413 /* Avoid glibc-2.1 bug. */
1414 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1415 const size_t extra = 1;
1417 const size_t extra = 0;
/* Start with every input offset marked as having no output offset.  */
1420 for (i = 0; i < n; i++)
1421 offtable[i] = (size_t)(-1);
1426 outsize = m + extra;
1427 while (inptr < s_end)
1429 const char *saved_inptr;
1433 offtable[inptr - s] = outptr - t;
1435 saved_inptr = inptr;
/* Offer the converter 1, 2, 3, ... bytes starting at inptr.  EINVAL
   means the bytes offered so far form an incomplete multibyte sequence,
   so one more byte is tried; any other outcome presumably ends this
   loop (the controlled statement is missing from this listing).  */
1437 for (insize = 1; inptr + insize <= s_end; insize++)
1439 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1440 if (!(res == (size_t)(-1) && errno == EINVAL))
1442 /* We expect that no input bytes have been consumed so far. */
1443 if (inptr != saved_inptr)
1446 /* After we verified the convertibility and computed the translation's
1447 size m, there shouldn't be any conversion error here. */
1448 if (res == (size_t)(-1))
1451 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1452 #if defined _LIBICONV_VERSION \
1453 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1454 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1457 /* We should have produced exactly m output bytes. */
1458 if (outsize != extra)
1462 #endif /* HAVE_ICONV */
1466 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
1467 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
/* Exactly N bytes starting at S are examined, so S need not be NUL
   terminated.  A byte is accepted when c_isprint or c_isspace accepts
   it; the test inside the loop singles out the first byte that neither
   of them accepts.
   NOTE(review): c_isprint and c_isspace are presumably the
   locale-independent ASCII classifiers from c-ctype.h, and the
   statements returning 0 and 1 are missing from this listing.  */
1469 is_all_ascii (const char *s, size_t n)
1471 for (; n > 0; s++, n--)
/* Go through unsigned char so that bytes with the high bit set are not
   seen as negative values.  */
1473 unsigned char c = (unsigned char) *s;
1475 if (!(c_isprint (c) || c_isspace (c)))
1481 #endif /* C_CTYPE_ASCII */
/* mbs_possible_linebreaks: classify the line break opportunities in the N
   bytes at S, which are encoded in ENCODING, storing one UC_BREAK_* value
   per input byte through p.  The strategies visible below are:
     - ENCODING is UTF-8: hand S directly to u8_possible_linebreaks;
     - otherwise convert S to UTF-8 with iconv, analyse the converted
       text, and copy each result back to the input offset it belongs to;
     - if no converter is usable but S passes is_all_ascii: analyse S as
       if it were UTF-8;
     - otherwise mark only '\n' bytes as UC_BREAK_MANDATORY and all other
       bytes as UC_BREAK_PROHIBITED.
   The converter is deliberately not opened for EUC-KR on glibc 2.1 and
   for several CJK encodings on Solaris, unless libiconv is in use.
   NOTE(review): the declaration of p, the else and brace structure, the
   closing lines of the platform workarounds and any check of the malloc
   result are missing from this listing.  */
1484 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1489 if (is_utf8_encoding (encoding))
1490 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1495 /* Avoid glibc-2.1 bug with EUC-KR. */
1496 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1497 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1498 to_utf8 = (iconv_t)(-1);
1501 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1503 # if defined __sun && !defined _LIBICONV_VERSION
1504 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1505 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1506 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1507 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1508 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1509 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1510 to_utf8 = (iconv_t)(-1);
1513 to_utf8 = iconv_open (UTF8_NAME, encoding);
1514 if (to_utf8 != (iconv_t)(-1))
1516 /* Determine the length of the resulting UTF-8 string. */
1517 size_t m = iconv_string_length (to_utf8, s, n);
1518 if (m != (size_t)(-1))
1520 /* Convert the string to UTF-8 and build a translation table
1521 from offsets into s to offsets into the translated string. */
/* One allocation holds three arrays back to back: N size_t entries for
   offtable, M bytes for the UTF-8 text t, and M bytes for q, the break
   values computed for t.
   NOTE(review): n * sizeof (size_t) + m + m is not checked for
   overflow.  */
1522 char *memory = malloc (n * sizeof (size_t) + m + m);
1525 size_t *offtable = (size_t *) memory;
1526 char *t = (char *) (offtable + n);
1527 char *q = (char *) (t + m);
1530 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1532 /* Determine the possible line breaks of the UTF-8 string. */
1533 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1535 /* Translate the result back to the original string. */
/* Input bytes whose offtable entry is still (size_t)(-1) keep
   UC_BREAK_PROHIBITED; only offsets recorded by
   iconv_string_keeping_offsets receive the value computed for the
   corresponding UTF-8 offset.  */
1536 memset (p, UC_BREAK_PROHIBITED, n);
1537 for (i = 0; i < n; i++)
1538 if (offtable[i] != (size_t)(-1))
1539 p[i] = q[offtable[i]];
1542 iconv_close (to_utf8);
1546 iconv_close (to_utf8);
1549 /* Impossible to convert. */
1551 if (is_all_ascii (s, n))
1553 /* ASCII is a subset of UTF-8. */
1554 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1558 /* We have a non-ASCII string and cannot convert it.
1559 Don't produce line breaks except those already present in the
1560 input string. All we assume here is that the encoding is
1561 minimally ASCII compatible. */
1563 const char *s_end = s + n;
1566 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
/* mbs_width_linebreaks: choose line breaks in the N bytes at S (encoded
   in ENCODING) for the line width WIDTH, storing one UC_BREAK_* value per
   input byte through p.  O, when not NULL, holds one override value per
   input byte.  The strategies visible below are:
     - ENCODING is UTF-8: return the result of u8_width_linebreaks on S;
     - otherwise convert S to UTF-8 with iconv, move the overrides to the
       matching UTF-8 offsets, run u8_width_linebreaks on the converted
       text, and copy each result back to the input offset it belongs to;
     - if no converter is usable but S passes is_all_ascii: return the
       result of u8_width_linebreaks on S;
     - otherwise mark a byte as UC_BREAK_MANDATORY when its override says
       so or when it is '\n', mark all other bytes as UC_BREAK_PROHIBITED,
       and return START_COLUMN because widths cannot be computed.
   NOTE(review): the declaration of p, the condition guarding the first
   return of START_COLUMN, the else and brace structure, any check of the
   malloc result, and the return statement of the iconv path are missing
   from this listing.  */
1575 mbs_width_linebreaks (const char *s, size_t n,
1576 int width, int start_column, int at_end_columns,
1577 const char *o, const char *encoding,
1581 return start_column;
1582 if (is_utf8_encoding (encoding))
1583 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1588 /* Avoid glibc-2.1 bug with EUC-KR. */
1589 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1590 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1591 to_utf8 = (iconv_t)(-1);
1594 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1596 # if defined __sun && !defined _LIBICONV_VERSION
1597 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1598 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1599 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1600 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1601 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1602 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1603 to_utf8 = (iconv_t)(-1);
1606 to_utf8 = iconv_open (UTF8_NAME, encoding);
1607 if (to_utf8 != (iconv_t)(-1))
1609 /* Determine the length of the resulting UTF-8 string. */
1610 size_t m = iconv_string_length (to_utf8, s, n);
1611 if (m != (size_t)(-1))
1613 /* Convert the string to UTF-8 and build a translation table
1614 from offsets into s to offsets into the translated string. */
/* One allocation holds the arrays back to back: N size_t entries for
   offtable, M bytes for the UTF-8 text t, M bytes for q (the break
   values computed for t) and, only when O is not NULL, M more bytes for
   o8, the overrides re-indexed by UTF-8 offset.
   NOTE(review): the size expression is not checked for overflow.  */
1615 char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0));
1618 size_t *offtable = (size_t *) memory;
1619 char *t = (char *) (offtable + n);
1620 char *q = (char *) (t + m);
1621 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1625 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1627 /* Translate the overrides to the UTF-8 string. */
/* o8 starts out as UC_BREAK_UNDEFINED everywhere; each override in O is
   then copied to the UTF-8 offset recorded for its input offset.
   NOTE(review): o8 is NULL when O is NULL, so these statements are
   presumably guarded by a test on O that is missing from this
   listing.  */
1630 memset (o8, UC_BREAK_UNDEFINED, m);
1631 for (i = 0; i < n; i++)
1632 if (offtable[i] != (size_t)(-1))
1633 o8[offtable[i]] = o[i];
1636 /* Determine the line breaks of the UTF-8 string. */
1638 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1640 /* Translate the result back to the original string. */
/* Input bytes whose offtable entry is still (size_t)(-1) keep
   UC_BREAK_PROHIBITED; the others receive the value computed for the
   corresponding UTF-8 offset.  */
1641 memset (p, UC_BREAK_PROHIBITED, n);
1642 for (i = 0; i < n; i++)
1643 if (offtable[i] != (size_t)(-1))
1644 p[i] = q[offtable[i]];
1647 iconv_close (to_utf8);
1651 iconv_close (to_utf8);
1654 /* Impossible to convert. */
1656 if (is_all_ascii (s, n))
1658 /* ASCII is a subset of UTF-8. */
1659 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1662 /* We have a non-ASCII string and cannot convert it.
1663 Don't produce line breaks except those already present in the
1664 input string. All we assume here is that the encoding is
1665 minimally ASCII compatible. */
1667 const char *s_end = s + n;
1670 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1671 ? UC_BREAK_MANDATORY
1672 : UC_BREAK_PROHIBITED);
1678 /* We cannot compute widths in this case. */
1679 return start_column;
/* read_file: collect everything readable from STREAM in one heap buffer.
   While feof (stream) is false, fread is asked for up to BUFSIZE (4096)
   bytes, stored at offset size of buf.  Before a read, if size + BUFSIZE
   exceeds alloc, the capacity grows by half and is then raised to
   size + BUFSIZE should that still be too small; realloc applies the new
   capacity.  ferror (stream) is tested after every fread.  When the loop
   is done the buffer is realloc'ed to size + 1 bytes, which leaves room
   for one terminating byte.
   NOTE(review): this listing omits the statements around each
   "out of memory" message and the ferror test, as well as the final
   return.  Both realloc calls assign their result straight back to buf,
   so confirm that buf is never used after a failed realloc.  */
1690 /* Read the contents of an input stream, and return it, terminated with a NUL
1693 read_file (FILE *stream)
1695 #define BUFSIZE 4096
1701 while (! feof (stream))
1703 if (size + BUFSIZE > alloc)
1705 alloc = alloc + alloc / 2;
1706 if (alloc < size + BUFSIZE)
1707 alloc = size + BUFSIZE;
1708 buf = realloc (buf, alloc);
1711 fprintf (stderr, "out of memory\n");
1715 count = fread (buf + size, 1, BUFSIZE, stream);
1718 if (ferror (stream))
1727 buf = realloc (buf, size + 1);
1730 fprintf (stderr, "out of memory\n");
1739 main (int argc, char * argv[])
1741 setlocale (LC_CTYPE, "");
1744 /* Display all the break opportunities in the input string. */
/* Test driver, first mode.  All of stdin is loaded with read_file and
   handed to mbs_possible_linebreaks together with the encoding name
   returned by locale_charset (); breaks has room for one entry per input
   byte.  The loop then walks the input byte by byte and
   putc (input[i], stdout) writes each byte.
   NOTE(review): locale_charset presumably names the encoding of the
   locale selected by the setlocale call above.  The test that selects
   this mode, the switch header, the statements of the switch cases and
   the declaration of i are missing from this listing.  length is an int
   although strlen returns size_t, and no check of the malloc result is
   visible; confirm both are acceptable for a test program.  */
1745 char *input = read_file (stdin);
1746 int length = strlen (input);
1747 char *breaks = malloc (length);
1750 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1752 for (i = 0; i < length; i++)
1756 case UC_BREAK_POSSIBLE:
1759 case UC_BREAK_MANDATORY:
1761 case UC_BREAK_PROHIBITED:
1766 putc (input[i], stdout);
1775 /* Insert line breaks for a given width. */
/* Test driver, second mode.  argv[1] is parsed with atoi as the line
   width.  mbs_width_linebreaks is called with 0 for the start column, 0
   for the columns reserved at the end, and NULL for the override array.
   In the output loop the UC_BREAK_POSSIBLE case writes a newline, and
   putc (input[i], stdout) writes each input byte.
   NOTE(review): atoi reports no errors, so a non-numeric argv[1] gives
   width 0; presumably argc is tested on a line missing from this
   listing before argv[1] is read.  */
1776 int width = atoi (argv[1]);
1777 char *input = read_file (stdin);
1778 int length = strlen (input);
1779 char *breaks = malloc (length);
1782 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1784 for (i = 0; i < length; i++)
1788 case UC_BREAK_POSSIBLE:
1789 putc ('\n', stdout);
1791 case UC_BREAK_MANDATORY:
1793 case UC_BREAK_PROHIBITED:
1798 putc (input[i], stdout);