1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24 #include "linebreak.h"
30 #include "utf8-ucs4.h"
32 #include "utf16-ucs4.h"
36 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
/* Help GCC to generate good code for string comparisons with
   immediate (literal) strings. */
46 #if defined (__GNUC__) && defined (__OPTIMIZE__)
/* Return 1 if S1 and S2 are equal from offset 9 onwards, 0 otherwise.
   This is the last link of the streq0 .. streq8 chain behind STREQ.
   Both strings must be at least 9 characters long, because the
   comparison starts at s1 + 9 and s2 + 9. */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}
55 streq8 (const char *s1, const char *s2, char s28)
62 return streq9 (s1, s2);
69 streq7 (const char *s1, const char *s2, char s27, char s28)
76 return streq8 (s1, s2, s28);
83 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
90 return streq7 (s1, s2, s27, s28);
97 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
104 return streq6 (s1, s2, s26, s27, s28);
111 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
118 return streq5 (s1, s2, s25, s26, s27, s28);
125 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
132 return streq4 (s1, s2, s24, s25, s26, s27, s28);
139 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
146 return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
153 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
160 return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
167 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
174 return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
/* STREQ (s1, s2, c0, ..., c8): test whether S1 equals the constant string
   S2.  The first nine characters of S2 are passed individually as
   s20 .. s28 (padded with 0) and handed to streq0; each streqN forwards
   the remaining character arguments to the next function in the chain. */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
/* Plain strcmp variant of STREQ.  The character arguments s20 .. s28 are
   accepted so that call sites look the same as with the streq0-based
   variant, but they are ignored here.
   NOTE(review): presumably the #else branch of the
   __GNUC__ && __OPTIMIZE__ test above -- confirm. */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)
192 is_cjk_encoding (const char *encoding)
195 /* Legacy Japanese encodings */
196 || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
197 /* Legacy Chinese encodings */
198 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
199 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
200 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
201 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
202 /* Legacy Korean encodings */
203 || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
204 || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
205 || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
211 is_utf8_encoding (const char *encoding)
213 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
219 /* Determine number of column positions required for UC. */
220 int uc_width (unsigned int uc, const char *encoding);
/*
 * Non-spacing attribute table.
 * It covers:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 */
/* Bitmap of non-spacing / control code points.
   The table holds 16 pages of 64 bytes; each page covers 512 consecutive
   code points, one bit per code point.  Within a page, code point UC is
   found in byte (uc >> 3) & 63 at bit position uc & 7.  The page number
   for a given UC comes from nonspacing_table_ind[uc >> 9]. */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0c80-0x0cbf */
  0x40, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Page index for nonspacing_table_data.
   Entry [uc >> 9] gives, for the 512-code-point page containing UC, the
   number of the corresponding 64-byte page in nonspacing_table_data.
   Pages that have no data are marked with -1.  The table covers code
   points below 240 * 512 = 0x1e000. */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
411 /* Determine number of column positions required for UC. */
413 uc_width (unsigned int uc, const char *encoding)
415 /* Test for non-spacing or control character. */
418 int ind = nonspacing_table_ind[uc >> 9];
420 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
422 if (uc > 0 && uc < 0x100)
428 else if ((uc >> 9) == (0xe0000 >> 9))
430 if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
433 /* Test for double-width character.
434 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
435 * and "grep '^....;[^WF]' EastAsianWidth.txt"
438 && ((uc < 0x1160) /* Hangul Jamo */
439 || (uc >= 0x2e80 && uc < 0xa4d0 /* CJK ... Yi */
441 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
442 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
443 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
444 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
445 || (uc >= 0xffe0 && uc < 0xffe7)
446 || (uc >= 0x20000 && uc <= 0x2a6d6) /* CJK */
447 || (uc >= 0x2f800 && uc <= 0x2fa1d) /* CJK Compatibility Ideographs */
450 /* In ancient CJK encodings, Cyrillic and most other characters are
451 double-width as well. */
452 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
453 && is_cjk_encoding (encoding))
459 /* Determine number of column positions required for first N units
460 (or fewer if S ends before this) in S. */
463 u8_width (const unsigned char *s, size_t n, const char *encoding)
465 const unsigned char *s_end = s + n;
473 s += u8_mbtouc (&uc, s, s_end - s);
476 break; /* end of string reached */
478 w = uc_width (uc, encoding);
479 if (w >= 0) /* ignore control characters in the string */
487 u16_width (const unsigned short *s, size_t n, const char *encoding)
489 const unsigned short *s_end = s + n;
497 s += u16_mbtouc (&uc, s, s_end - s);
500 break; /* end of string reached */
502 w = uc_width (uc, encoding);
503 if (w >= 0) /* ignore control characters in the string */
511 u32_width (const unsigned int *s, size_t n, const char *encoding)
513 const unsigned int *s_end = s + n;
518 unsigned int uc = *s++;
522 break; /* end of string reached */
524 w = uc_width (uc, encoding);
525 if (w >= 0) /* ignore control characters in the string */
533 /* Determine the line break points in S, and store the result at p[0..n-1]. */
534 /* We don't support line breaking of complex-context dependent characters
535 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
/* Line breaking classification.
   The classes numbered 1 .. 19 are the ones that can be looked up in
   lbrk_table (as lbrk_table[class - 1]); LBP_BK and the classes numbered
   20 and above are handled before the table lookup. */
enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
573 #include "lbrkprop.h"
575 static inline unsigned char
576 lbrkprop_lookup (unsigned int uc)
578 unsigned int index1 = uc >> lbrkprop_header_0;
579 if (index1 < lbrkprop_header_1)
581 int lookup1 = lbrkprop.level1[index1];
584 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
585 int lookup2 = lbrkprop.level2[lookup1 + index2];
588 unsigned int index3 = uc & lbrkprop_header_4;
589 return lbrkprop.level3[lookup2 + index3];
/* Table indexed by two line breaking classifications.
   The first index is the class of the preceding (non-space) character
   minus 1, the second index is the class of the current character minus 1;
   both classes must lie in the range LBP_ZW .. LBP_PO (1 .. 19).  Each
   entry tells whether a break between the two characters is a direct
   opportunity (D), an indirect one (I) or prohibited (P). */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
             /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
  /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
  /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
  /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
  /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
  /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
  /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
  /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
  /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
  /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
  /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
  /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
  /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
  /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
  /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
};
625 /* Note: The (B2,B2) entry should probably be D instead of P. */
626 /* Note: The (PR,ID) entry should probably be D instead of I. */
629 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
631 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
632 const unsigned char *s_end = s + n;
633 int last_prop = LBP_BK; /* line break property of last non-space character */
634 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
635 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
637 /* Don't break inside multibyte characters. */
638 memset (p, UC_BREAK_PROHIBITED, n);
643 int count = u8_mbtouc (&uc, s, s_end - s);
644 int prop = lbrkprop_lookup (uc);
648 /* Mandatory break. */
649 *p = UC_BREAK_MANDATORY;
658 /* Resolve property values whose behaviour is not fixed. */
662 /* Resolve ambiguous. */
663 prop = LBP_AI_REPLACEMENT;
666 /* This is arbitrary. */
670 /* We don't handle complex scripts yet.
671 Treat LBP_SA like LBP_XX. */
673 /* This is arbitrary. */
678 /* Deal with combining characters. */
682 /* Don't break just before a combining character. */
683 *p = UC_BREAK_PROHIBITED;
684 /* A combining character turns a preceding space into LBP_AL. */
685 if (seen_space != NULL)
688 seen_space = seen_space2;
690 goto lookup_via_table;
693 else if (prop == LBP_SP)
695 /* Don't break just before a space. */
696 *p = UC_BREAK_PROHIBITED;
697 seen_space2 = seen_space;
703 /* prop must be usable as an index for table 7.3 of UTR #14. */
704 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
707 if (last_prop == LBP_BK)
709 /* Don't break at the beginning of a line. */
710 *q = UC_BREAK_PROHIBITED;
714 switch (lbrk_table [last_prop-1] [prop-1])
717 *q = UC_BREAK_POSSIBLE;
720 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
723 *q = UC_BREAK_PROHIBITED;
741 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
743 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
744 const unsigned short *s_end = s + n;
745 int last_prop = LBP_BK; /* line break property of last non-space character */
746 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
747 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
749 /* Don't break inside multibyte characters. */
750 memset (p, UC_BREAK_PROHIBITED, n);
755 int count = u16_mbtouc (&uc, s, s_end - s);
756 int prop = lbrkprop_lookup (uc);
760 /* Mandatory break. */
761 *p = UC_BREAK_MANDATORY;
770 /* Resolve property values whose behaviour is not fixed. */
774 /* Resolve ambiguous. */
775 prop = LBP_AI_REPLACEMENT;
778 /* This is arbitrary. */
782 /* We don't handle complex scripts yet.
783 Treat LBP_SA like LBP_XX. */
785 /* This is arbitrary. */
790 /* Deal with combining characters. */
794 /* Don't break just before a combining character. */
795 *p = UC_BREAK_PROHIBITED;
796 /* A combining character turns a preceding space into LBP_AL. */
797 if (seen_space != NULL)
800 seen_space = seen_space2;
802 goto lookup_via_table;
805 else if (prop == LBP_SP)
807 /* Don't break just before a space. */
808 *p = UC_BREAK_PROHIBITED;
809 seen_space2 = seen_space;
815 /* prop must be usable as an index for table 7.3 of UTR #14. */
816 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
819 if (last_prop == LBP_BK)
821 /* Don't break at the beginning of a line. */
822 *q = UC_BREAK_PROHIBITED;
826 switch (lbrk_table [last_prop-1] [prop-1])
829 *q = UC_BREAK_POSSIBLE;
832 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
835 *q = UC_BREAK_PROHIBITED;
853 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
855 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
856 const unsigned int *s_end = s + n;
857 int last_prop = LBP_BK; /* line break property of last non-space character */
858 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
859 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
863 unsigned int uc = *s;
864 int prop = lbrkprop_lookup (uc);
868 /* Mandatory break. */
869 *p = UC_BREAK_MANDATORY;
878 /* Resolve property values whose behaviour is not fixed. */
882 /* Resolve ambiguous. */
883 prop = LBP_AI_REPLACEMENT;
886 /* This is arbitrary. */
890 /* We don't handle complex scripts yet.
891 Treat LBP_SA like LBP_XX. */
893 /* This is arbitrary. */
898 /* Deal with combining characters. */
902 /* Don't break just before a combining character. */
903 *p = UC_BREAK_PROHIBITED;
904 /* A combining character turns a preceding space into LBP_AL. */
905 if (seen_space != NULL)
908 seen_space = seen_space2;
910 goto lookup_via_table;
913 else if (prop == LBP_SP)
915 /* Don't break just before a space. */
916 *p = UC_BREAK_PROHIBITED;
917 seen_space2 = seen_space;
923 /* prop must be usable as an index for table 7.3 of UTR #14. */
924 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
927 if (last_prop == LBP_BK)
929 /* Don't break at the beginning of a line. */
930 *q = UC_BREAK_PROHIBITED;
934 switch (lbrk_table [last_prop-1] [prop-1])
937 *q = UC_BREAK_POSSIBLE;
940 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
943 *q = UC_BREAK_PROHIBITED;
961 /* Choose the best line breaks, assuming the uc_width function.
962 Return the column after the end of the string. */
965 u8_width_linebreaks (const unsigned char *s, size_t n,
966 int width, int start_column, int at_end_columns,
967 const char *o, const char *encoding,
970 const unsigned char *s_end;
975 u8_possible_linebreaks (s, n, encoding, p);
979 last_column = start_column;
984 int count = u8_mbtouc (&uc, s, s_end - s);
986 /* Respect the override. */
987 if (o != NULL && *o != UC_BREAK_UNDEFINED)
990 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
992 /* An atomic piece of text ends here. */
993 if (last_p != NULL && last_column + piece_width > width)
995 /* Insert a line break. */
996 *last_p = UC_BREAK_POSSIBLE;
1001 if (*p == UC_BREAK_MANDATORY)
1003 /* uc is a line break character. */
1004 /* Start a new piece at column 0. */
1011 /* uc is not a line break character. */
1014 if (*p == UC_BREAK_POSSIBLE)
1016 /* Start a new piece. */
1018 last_column += piece_width;
1020 /* No line break for the moment, may be turned into
1021 UC_BREAK_POSSIBLE later, via last_p. */
1024 *p = UC_BREAK_PROHIBITED;
1026 w = uc_width (uc, encoding);
1027 if (w >= 0) /* ignore control characters in the string */
1037 /* The last atomic piece of text ends here. */
1038 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1040 /* Insert a line break. */
1041 *last_p = UC_BREAK_POSSIBLE;
1045 return last_column + piece_width;
1049 u16_width_linebreaks (const unsigned short *s, size_t n,
1050 int width, int start_column, int at_end_columns,
1051 const char *o, const char *encoding,
1054 const unsigned short *s_end;
1059 u16_possible_linebreaks (s, n, encoding, p);
1063 last_column = start_column;
1068 int count = u16_mbtouc (&uc, s, s_end - s);
1070 /* Respect the override. */
1071 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1074 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1076 /* An atomic piece of text ends here. */
1077 if (last_p != NULL && last_column + piece_width > width)
1079 /* Insert a line break. */
1080 *last_p = UC_BREAK_POSSIBLE;
1085 if (*p == UC_BREAK_MANDATORY)
1087 /* uc is a line break character. */
1088 /* Start a new piece at column 0. */
1095 /* uc is not a line break character. */
1098 if (*p == UC_BREAK_POSSIBLE)
1100 /* Start a new piece. */
1102 last_column += piece_width;
1104 /* No line break for the moment, may be turned into
1105 UC_BREAK_POSSIBLE later, via last_p. */
1108 *p = UC_BREAK_PROHIBITED;
1110 w = uc_width (uc, encoding);
1111 if (w >= 0) /* ignore control characters in the string */
1121 /* The last atomic piece of text ends here. */
1122 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1124 /* Insert a line break. */
1125 *last_p = UC_BREAK_POSSIBLE;
1129 return last_column + piece_width;
1133 u32_width_linebreaks (const unsigned int *s, size_t n,
1134 int width, int start_column, int at_end_columns,
1135 const char *o, const char *encoding,
1138 const unsigned int *s_end;
1143 u32_possible_linebreaks (s, n, encoding, p);
1147 last_column = start_column;
1151 unsigned int uc = *s;
1153 /* Respect the override. */
1154 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1157 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1159 /* An atomic piece of text ends here. */
1160 if (last_p != NULL && last_column + piece_width > width)
1162 /* Insert a line break. */
1163 *last_p = UC_BREAK_POSSIBLE;
1168 if (*p == UC_BREAK_MANDATORY)
1170 /* uc is a line break character. */
1171 /* Start a new piece at column 0. */
1178 /* uc is not a line break character. */
1181 if (*p == UC_BREAK_POSSIBLE)
1183 /* Start a new piece. */
1185 last_column += piece_width;
1187 /* No line break for the moment, may be turned into
1188 UC_BREAK_POSSIBLE later, via last_p. */
1191 *p = UC_BREAK_PROHIBITED;
1193 w = uc_width (uc, encoding);
1194 if (w >= 0) /* ignore control characters in the string */
1204 /* The last atomic piece of text ends here. */
1205 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1207 /* Insert a line break. */
1208 *last_p = UC_BREAK_POSSIBLE;
1212 return last_column + piece_width;
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte. */
1223 read_file (FILE *stream)
1225 #define BUFSIZE 4096
1231 while (! feof (stream))
1233 if (size + BUFSIZE > alloc)
1235 alloc = alloc + alloc / 2;
1236 if (alloc < size + BUFSIZE)
1237 alloc = size + BUFSIZE;
1238 buf = realloc (buf, alloc);
1241 fprintf (stderr, "out of memory\n");
1245 count = fread (buf + size, 1, BUFSIZE, stream);
1248 if (ferror (stream))
1257 buf = realloc (buf, size + 1);
1260 fprintf (stderr, "out of memory\n");
/* Test driver for the UTF-8 functions.  Two variants are visible: the first
   marks every break opportunity in the text read from stdin, the second
   wraps that text to the width given in argv[1].  The test that selects
   between them is in lines not shown here (presumably on argc -- confirm).
   Both variants pass "UTF-8" as the encoding.  */
1269 main (int argc, char * argv[])
1273 /* Display all the break opportunities in the input string. */
1274 char *input = read_file (stdin);
/* NOTE(review): strlen returns size_t; the value is narrowed to int here.  */
1275 int length = strlen (input);
/* Result buffer of LENGTH bytes handed to u8_possible_linebreaks.
   NOTE(review): no NULL check on this malloc is visible -- confirm.  */
1276 char *breaks = malloc (length);
1279 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
/* Walk the input byte by byte; the marker bytes below appear ahead of the
   putc that echoes input[i].  */
1281 for (i = 0; i < length; i++)
1285 case UC_BREAK_POSSIBLE:
1286 /* U+2027 in UTF-8 encoding */
1287 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1289 case UC_BREAK_MANDATORY:
1290 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1291 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1293 case UC_BREAK_PROHIBITED:
1298 putc (input[i], stdout);
1307 /* Insert line breaks for a given width. */
/* NOTE(review): atoi gives no indication of a malformed argv[1].  */
1308 int width = atoi (argv[1]);
1309 char *input = read_file (stdin);
/* NOTE(review): same size_t to int narrowing as in the first variant.  */
1310 int length = strlen (input);
/* NOTE(review): no NULL check on this malloc is visible -- confirm.  */
1311 char *breaks = malloc (length);
/* Start column 0, no extra columns reserved at the end, no overrides.  */
1314 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1316 for (i = 0; i < length; i++)
1320 case UC_BREAK_POSSIBLE:
/* A chosen break is rendered as a newline.  */
1321 putc ('\n', stdout);
1323 case UC_BREAK_MANDATORY:
1325 case UC_BREAK_PROHIBITED:
1330 putc (input[i], stdout);
1344 /* Now the same thing with an arbitrary encoding.
1346 We convert the input string to Unicode.
1348 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1349 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1350 \U0000FFFF. UTF-16 and variants support only characters up to
1351 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1352 UCS-4 specification leaves doubts about endianness and byte order mark.
1353 glibc currently interprets it as big endian without byte order mark,
1354 but this is not backed by an RFC. So we use UTF-8. It supports
1355 characters up to \U7FFFFFFF and is unambiguously defined. */
1362 /* Luckily, the encoding's name is platform independent. */
1363 #define UTF8_NAME "UTF-8"
1365 /* Return the length of a string after conversion through an iconv_t. */
/* CD is the conversion descriptor, S the input bytes and N their number.
   The input is converted into a scratch buffer of TMPBUFSIZE bytes; only the
   number of bytes produced is added to COUNT, the converted text itself is
   discarded.  (size_t)(-1) is returned as soon as an iconv call fails.  */
1367 iconv_string_length (iconv_t cd, const char *s, size_t n)
1369 #define TMPBUFSIZE 4096
1371 char tmpbuf[TMPBUFSIZE];
1372 const char *inptr = s;
/* Each conversion call starts again at the beginning of the scratch buffer.  */
1376 char *outptr = tmpbuf;
1377 size_t outsize = TMPBUFSIZE;
1378 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* NOTE(review): this test does not look at errno, so E2BIG (converted text
   larger than TMPBUFSIZE) is reported as a failure like any other error.
   Confirm whether long inputs are meant to continue with a fresh buffer.  */
1379 if (res == (size_t)(-1))
1380 return (size_t)(-1);
/* Number of bytes this call wrote into tmpbuf.  */
1381 count += outptr - tmpbuf;
1383 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
1384 #if defined _LIBICONV_VERSION \
1385 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* With a NULL input, iconv emits whatever is needed to bring the output back
   to its initial shift state; those bytes belong to the length as well.  */
1387 char *outptr = tmpbuf;
1388 size_t outsize = TMPBUFSIZE;
1389 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1390 if (res == (size_t)(-1))
1391 return (size_t)(-1);
1392 count += outptr - tmpbuf;
1394 /* Return to the initial state. */
1395 iconv (cd, NULL, NULL, NULL, NULL);
/* Convert the N bytes at S through CD into the buffer T, and record in
   OFFTABLE where each piece of input landed: offtable[i] is the offset in T
   of the output produced for the input that starts at s[i].  All other
   entries of OFFTABLE are left at (size_t)(-1).  M is the expected size of
   the output.  */
1402 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1403 size_t *offtable, char *t, size_t m)
1410 /* Avoid glibc-2.1 bug. */
1411 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1412 const size_t extra = 1;
1414 const size_t extra = 0;
/* Mark every input position as "no output starts here" to begin with.  */
1417 for (i = 0; i < n; i++)
1418 offtable[i] = (size_t)(-1);
/* The output space offered to iconv is M bytes plus the workaround byte.  */
1423 outsize = m + extra;
1424 while (inptr < s_end)
1426 const char *saved_inptr;
/* Remember where in T the output for this input position begins.  */
1430 offtable[inptr - s] = outptr - t;
1432 saved_inptr = inptr;
/* Offer iconv 1, 2, 3, ... input bytes in turn; EINVAL signals an incomplete
   multibyte sequence, i.e. that more bytes have to be offered.  */
1434 for (insize = 1; inptr + insize <= s_end; insize++)
1436 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1437 if (!(res == (size_t)(-1) && errno == EINVAL))
1439 /* We expect that no input bytes have been consumed so far. */
1440 if (inptr != saved_inptr)
1443 /* After we verified the convertibility and computed the translation's
1444 size m, there shouldn't be any conversion error here. */
1445 if (res == (size_t)(-1))
1448 /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
1449 #if defined _LIBICONV_VERSION \
1450 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* Flush call with a NULL input, writing behind the converted text.  */
1451 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1454 /* We should have produced exactly m output bytes. */
/* Only the EXTRA workaround bytes may remain unused in the output space.  */
1455 if (outsize != extra)
1459 #endif /* HAVE_ICONV */
1463 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
1464 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
/* S points to the N bytes to examine.  A byte is accepted when c_isprint or
   c_isspace accepts it; the first byte that neither accepts fails the test.  */
1466 is_all_ascii (const char *s, size_t n)
1468 for (; n > 0; s++, n--)
/* Classify the byte as an unsigned value, whatever the signedness of char.  */
1470 unsigned char c = (unsigned char) *s;
1472 if (!(c_isprint (c) || c_isspace (c)))
1478 #endif /* C_CTYPE_ASCII */
/* Determine the line break opportunities in the N bytes at S, which are in
   ENCODING, and store one break class per input byte in P (P is declared on
   a signature line not shown here).
   Visible strategy:
     - UTF-8 input goes straight to u8_possible_linebreaks.
     - Other encodings are converted to UTF-8 through iconv, analysed there,
       and the result is mapped back through a table of offsets.
     - If no conversion is available, all-ASCII input is analysed as UTF-8;
       anything else only gets UC_BREAK_MANDATORY at '\n'.  */
1481 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1486 if (is_utf8_encoding (encoding))
1487 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1492 /* Avoid glibc-2.1 bug with EUC-KR. */
1493 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1494 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
/* (iconv_t)(-1) is the "no converter" value tested further down.  */
1495 to_utf8 = (iconv_t)(-1);
1498 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1500 # if defined __sun && !defined _LIBICONV_VERSION
1501 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1502 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1503 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1504 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1505 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1506 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1507 to_utf8 = (iconv_t)(-1);
1510 to_utf8 = iconv_open (UTF8_NAME, encoding);
1511 if (to_utf8 != (iconv_t)(-1))
1513 /* Determine the length of the resulting UTF-8 string. */
1514 size_t m = iconv_string_length (to_utf8, s, n);
1515 if (m != (size_t)(-1))
1517 /* Convert the string to UTF-8 and build a translation table
1518 from offsets into s to offsets into the translated string. */
/* One allocation holds three areas: N offsets, M bytes of UTF-8 text, and
   M bytes of break classes.  NOTE(review): the size expression is not
   checked for overflow in the lines shown.  */
1519 char *memory = malloc (n * sizeof (size_t) + m + m);
1522 size_t *offtable = (size_t *) memory;
1523 char *t = (char *) (offtable + n);
1524 char *q = (char *) (t + m);
1527 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1529 /* Determine the possible line breaks of the UTF-8 string. */
1530 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1532 /* Translate the result back to the original string. */
/* Input bytes without an entry in OFFTABLE keep UC_BREAK_PROHIBITED.  */
1533 memset (p, UC_BREAK_PROHIBITED, n);
1534 for (i = 0; i < n; i++)
1535 if (offtable[i] != (size_t)(-1))
1536 p[i] = q[offtable[i]];
1539 iconv_close (to_utf8);
1543 iconv_close (to_utf8);
1546 /* Impossible to convert. */
1548 if (is_all_ascii (s, n))
1550 /* ASCII is a subset of UTF-8. */
1551 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1555 /* We have a non-ASCII string and cannot convert it.
1556 Don't produce line breaks except those already present in the
1557 input string. All we assume here is that the encoding is
1558 minimally ASCII compatible. */
1560 const char *s_end = s + n;
1563 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
/* Choose line breaks for the N bytes at S, which are in ENCODING, so that
   the text fits into WIDTH columns, and store one break class per input
   byte in P (P is declared on a signature line not shown here).
   START_COLUMN and AT_END_COLUMNS are passed through to u8_width_linebreaks;
   O, when not NULL, holds one override per input byte.
   Visible strategy:
     - UTF-8 input goes straight to u8_width_linebreaks.
     - Other encodings are converted to UTF-8 through iconv together with
       the overrides, analysed there, and the result is mapped back.
     - If no conversion is available, all-ASCII input is analysed as UTF-8;
       anything else only gets UC_BREAK_MANDATORY where O says so or at
       '\n', and START_COLUMN is returned unchanged.  */
1572 mbs_width_linebreaks (const char *s, size_t n,
1573 int width, int start_column, int at_end_columns,
1574 const char *o, const char *encoding,
/* Early exit; the condition guarding it is in a line not shown here.  */
1578 return start_column;
1579 if (is_utf8_encoding (encoding))
1580 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1585 /* Avoid glibc-2.1 bug with EUC-KR. */
1586 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1587 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
/* (iconv_t)(-1) is the "no converter" value tested further down.  */
1588 to_utf8 = (iconv_t)(-1);
1591 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1593 # if defined __sun && !defined _LIBICONV_VERSION
1594 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1595 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1596 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1597 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1598 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1599 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1600 to_utf8 = (iconv_t)(-1);
1603 to_utf8 = iconv_open (UTF8_NAME, encoding);
1604 if (to_utf8 != (iconv_t)(-1))
1606 /* Determine the length of the resulting UTF-8 string. */
1607 size_t m = iconv_string_length (to_utf8, s, n);
1608 if (m != (size_t)(-1))
1610 /* Convert the string to UTF-8 and build a translation table
1611 from offsets into s to offsets into the translated string. */
/* One allocation holds up to four areas: N offsets, M bytes of UTF-8 text,
   M bytes of break classes and, only when overrides were passed, M bytes of
   translated overrides.  NOTE(review): the size expression is not checked
   for overflow in the lines shown.  */
1612 char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0));
1615 size_t *offtable = (size_t *) memory;
1616 char *t = (char *) (offtable + n);
1617 char *q = (char *) (t + m);
1618 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1622 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1624 /* Translate the overrides to the UTF-8 string. */
/* UTF-8 positions that no input offset maps to stay UC_BREAK_UNDEFINED.  */
1627 memset (o8, UC_BREAK_UNDEFINED, m);
1628 for (i = 0; i < n; i++)
1629 if (offtable[i] != (size_t)(-1))
1630 o8[offtable[i]] = o[i];
1633 /* Determine the line breaks of the UTF-8 string. */
1635 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1637 /* Translate the result back to the original string. */
/* Input bytes without an entry in OFFTABLE keep UC_BREAK_PROHIBITED.  */
1638 memset (p, UC_BREAK_PROHIBITED, n);
1639 for (i = 0; i < n; i++)
1640 if (offtable[i] != (size_t)(-1))
1641 p[i] = q[offtable[i]];
1644 iconv_close (to_utf8);
1648 iconv_close (to_utf8);
1651 /* Impossible to convert. */
1653 if (is_all_ascii (s, n))
1655 /* ASCII is a subset of UTF-8. */
1656 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1659 /* We have a non-ASCII string and cannot convert it.
1660 Don't produce line breaks except those already present in the
1661 input string. All we assume here is that the encoding is
1662 minimally ASCII compatible. */
1664 const char *s_end = s + n;
/* A mandatory break requested through O is honoured as well as '\n'.  */
1667 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1668 ? UC_BREAK_MANDATORY
1669 : UC_BREAK_PROHIBITED);
1675 /* We cannot compute widths in this case. */
1676 return start_column;
1687 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* STREAM is drained until feof (stream) reports end of input.  The data is
   gathered in one buffer that is resized with realloc as it grows.  The
   "out of memory" messages are presumably printed when realloc fails; the
   tests guarding them, and the handling of a read error flagged by ferror,
   are in lines not shown here.  */
1690 read_file (FILE *stream)
/* Size of each fread request, and the headroom kept free in the buffer.  */
1692 #define BUFSIZE 4096
1698 while (! feof (stream))
/* Make sure another BUFSIZE bytes fit behind the SIZE bytes read so far.  */
1700 if (size + BUFSIZE > alloc)
/* Grow by half of the current allocation, but at least to size + BUFSIZE.  */
1702 alloc = alloc + alloc / 2;
1703 if (alloc < size + BUFSIZE)
1704 alloc = size + BUFSIZE;
1705 buf = realloc (buf, alloc);
1708 fprintf (stderr, "out of memory\n");
/* Append up to BUFSIZE bytes at the current end of the data.  */
1712 count = fread (buf + size, 1, BUFSIZE, stream);
1715 if (ferror (stream))
/* Resize to the data length plus one byte, presumably for the terminating
   NUL that the callers' strlen depends on (the store itself is not shown).  */
1724 buf = realloc (buf, size + 1);
1727 fprintf (stderr, "out of memory\n");
1736 main (int argc, char * argv[])
1738 setlocale (LC_CTYPE, "");
1741 /* Display all the break opportunities in the input string. */
1742 char *input = read_file (stdin);
1743 int length = strlen (input);
1744 char *breaks = malloc (length);
1747 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1749 for (i = 0; i < length; i++)
1753 case UC_BREAK_POSSIBLE:
1756 case UC_BREAK_MANDATORY:
1758 case UC_BREAK_PROHIBITED:
1763 putc (input[i], stdout);
1772 /* Insert line breaks for a given width. */
1773 int width = atoi (argv[1]);
1774 char *input = read_file (stdin);
1775 int length = strlen (input);
1776 char *breaks = malloc (length);
1779 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1781 for (i = 0; i < length; i++)
1785 case UC_BREAK_POSSIBLE:
1786 putc ('\n', stdout);
1788 case UC_BREAK_MANDATORY:
1790 case UC_BREAK_PROHIBITED:
1795 putc (input[i], stdout);