X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Flinebreak.c;fp=lib%2Flinebreak.c;h=e1d47ea46ced1b0c20b21f8e88e786db5a4cb9ad;hb=2fd230b5dc34baf4246d24eba3d4879aa934e730;hp=0000000000000000000000000000000000000000;hpb=4830a05bb8db7ce7805715cf705aa48b808bcd8f;p=gnulib.git diff --git a/lib/linebreak.c b/lib/linebreak.c new file mode 100644 index 000000000..e1d47ea46 --- /dev/null +++ b/lib/linebreak.c @@ -0,0 +1,1806 @@ +/* linebreak.c - line breaking of Unicode strings + Copyright (C) 2001-2003 Free Software Foundation, Inc. + Written by Bruno Haible , 2001. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +/* Specification. */ +#include "linebreak.h" + +#include +#include +#include "c-ctype.h" + +#include "utf8-ucs4.h" + +#include "utf16-ucs4.h" + +#ifdef unused +static inline int +u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n) +{ + *puc = *s; + return 1; +} +#endif + + +/* Help GCC to generate good code for string comparisons with + immediate strings. */ +#if defined (__GNUC__) && defined (__OPTIMIZE__) + +static inline int +streq9 (const char *s1, const char *s2) +{ + return strcmp (s1 + 9, s2 + 9) == 0; +} + +static inline int +streq8 (const char *s1, const char *s2, char s28) +{ + if (s1[8] == s28) + { + if (s28 == 0) + return 1; + else + return streq9 (s1, s2); + } + else + return 0; +} + +static inline int +streq7 (const char *s1, const char *s2, char s27, char s28) +{ + if (s1[7] == s27) + { + if (s27 == 0) + return 1; + else + return streq8 (s1, s2, s28); + } + else + return 0; +} + +static inline int +streq6 (const char *s1, const char *s2, char s26, char s27, char s28) +{ + if (s1[6] == s26) + { + if (s26 == 0) + return 1; + else + return streq7 (s1, s2, s27, s28); + } + else + return 0; +} + +static inline int +streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28) +{ + if (s1[5] == s25) + { + if (s25 == 0) + return 1; + else + return streq6 (s1, s2, s26, s27, s28); + } + else + return 0; +} + +static inline int +streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28) +{ + if (s1[4] == s24) + { + if (s24 == 0) + return 1; + else + return streq5 (s1, s2, s25, s26, s27, s28); + } + else + return 0; +} + +static inline int +streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28) +{ + if (s1[3] == s23) + { + if (s23 == 0) + return 1; + else + return streq4 (s1, s2, s24, s25, s26, s27, s28); + } + else + return 0; +} + +static inline int +streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28) +{ + if (s1[2] == s22) + { + if (s22 == 0) + return 1; + else + return streq3 (s1, s2, s23, s24, s25, s26, s27, s28); + } + else + return 0; +} + +static inline int +streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28) +{ + if (s1[1] == s21) + { + if (s21 == 0) + return 1; + else + return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28); + } + else + return 0; +} + +static inline int +streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28) +{ + if (s1[0] == s20) + { + if (s20 == 0) + return 1; + else + return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28); + } + else + return 0; +} + +#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \ + streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28) + +#else + +#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \ + (strcmp (s1, s2) == 0) + +#endif + + +static int +is_cjk_encoding (const char *encoding) +{ + if (0 + /* Legacy Japanese encodings */ + || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0) + /* Legacy Chinese encodings */ + || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) + || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) + || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) + || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) + /* Legacy Korean encodings */ + || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) + || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0) + || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0)) + return 1; + return 0; +} + +static int +is_utf8_encoding (const char *encoding) +{ + if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0)) + return 1; + return 0; +} + + +/* Determine number of column positions required for UC. */ +int uc_width (unsigned int uc, const char *encoding); + +/* + * Non-spacing attribute table. + * Consists of: + * - Non-spacing characters; generated from PropList.txt or + * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" + * - Format control characters; generated from + * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" + * - Zero width characters; generated from + * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" + */ +static const unsigned char nonspacing_table_data[16*64] = { + /* 0x0000-0x01ff */ + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */ + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0080-0x00bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */ + /* 0x0200-0x03ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */ + 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */ + /* 0x0400-0x05ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */ + 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */ + 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */ + 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */ + /* 0x0600-0x07ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */ + 0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */ + 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */ + 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */ + 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */ + 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */ + /* 0x0800-0x09ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */ + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */ + 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */ + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */ + 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */ + /* 0x0a00-0x0bff */ + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */ + 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */ + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */ + 0xbe, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */ + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */ + 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */ + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */ + 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */ + /* 0x0c00-0x0dff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */ + 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0c80-0x0cbf */ + 0x40, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */ + 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */ + 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */ + /* 0x0e00-0x0fff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */ + 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */ + 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */ + 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */ + 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */ + 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */ + /* 0x1000-0x11ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */ + 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */ + /* 0x1600-0x17ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */ + 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */ + 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, /* 0x1780-0x17bf */ + 0x40, 0xfe, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */ + /* 0x1800-0x19ff */ + 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1900-0x193f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */ + /* 0x2000-0x21ff */ + 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */ + 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */ + 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */ + /* 0x3000-0x31ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */ + 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */ + /* 0xfa00-0xfbff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */ + 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */ + /* 0xfe00-0xffff */ + 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */ + /* 0x1d000-0x1d1ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */ + 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */ + 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */ +}; +static const signed char nonspacing_table_ind[240] = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */ + 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */ + 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */ + 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */ + -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */ + 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */ +}; + +/* Determine number of column positions required for UC. */ +int +uc_width (unsigned int uc, const char *encoding) +{ + /* Test for non-spacing or control character. */ + if ((uc >> 9) < 240) + { + int ind = nonspacing_table_ind[uc >> 9]; + if (ind >= 0) + if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1) + { + if (uc > 0 && uc < 0x100) + return -1; + else + return 0; + } + } + else if ((uc >> 9) == (0xe0000 >> 9)) + { + if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001) + return 0; + } + /* Test for double-width character. + * Generated from "grep '^....;[WF]' EastAsianWidth.txt" + * and "grep '^....;[^WF]' EastAsianWidth.txt" + */ + if (uc >= 0x1100 + && ((uc < 0x1160) /* Hangul Jamo */ + || (uc >= 0x2e80 && uc < 0xa4d0 /* CJK ... Yi */ + && !(uc == 0x303f)) + || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */ + || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */ + || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */ + || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */ + || (uc >= 0xffe0 && uc < 0xffe7) + || (uc >= 0x20000 && uc <= 0x2a6d6) /* CJK */ + || (uc >= 0x2f800 && uc <= 0x2fa1d) /* CJK Compatibility Ideographs */ + ) ) + return 2; + /* In ancient CJK encodings, Cyrillic and most other characters are + double-width as well. */ + if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9 + && is_cjk_encoding (encoding)) + return 2; + return 1; +} + + +/* Determine number of column positions required for first N units + (or fewer if S ends before this) in S. */ + +int +u8_width (const unsigned char *s, size_t n, const char *encoding) +{ + const unsigned char *s_end = s + n; + int width = 0; + + while (s < s_end) + { + unsigned int uc; + int w; + + s += u8_mbtouc (&uc, s, s_end - s); + + if (uc == 0) + break; /* end of string reached */ + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + width += w; + } + + return width; +} + +int +u16_width (const unsigned short *s, size_t n, const char *encoding) +{ + const unsigned short *s_end = s + n; + int width = 0; + + while (s < s_end) + { + unsigned int uc; + int w; + + s += u16_mbtouc (&uc, s, s_end - s); + + if (uc == 0) + break; /* end of string reached */ + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + width += w; + } + + return width; +} + +int +u32_width (const unsigned int *s, size_t n, const char *encoding) +{ + const unsigned int *s_end = s + n; + int width = 0; + + while (s < s_end) + { + unsigned int uc = *s++; + int w; + + if (uc == 0) + break; /* end of string reached */ + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + width += w; + } + + return width; +} + + +/* Determine the line break points in S, and store the result at p[0..n-1]. */ +/* We don't support line breaking of complex-context dependent characters + (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ + +/* Line breaking classification. */ + +enum +{ + /* Values >= 20 are resolved at run time. */ + LBP_BK = 0, /* mandatory break */ +/*LBP_CR, carriage return - not used here because it's a DOSism */ +/*LBP_LF, line feed - not used here because it's a DOSism */ + LBP_CM = 20, /* attached characters and combining marks */ +/*LBP_SG, surrogates - not used here because they are not characters */ + LBP_ZW = 1, /* zero width space */ + LBP_IN = 2, /* inseparable */ + LBP_GL = 3, /* non-breaking (glue) */ + LBP_CB = 22, /* contingent break opportunity */ + LBP_SP = 21, /* space */ + LBP_BA = 4, /* break opportunity after */ + LBP_BB = 5, /* break opportunity before */ + LBP_B2 = 6, /* break opportunity before and after */ + LBP_HY = 7, /* hyphen */ + LBP_NS = 8, /* non starter */ + LBP_OP = 9, /* opening punctuation */ + LBP_CL = 10, /* closing punctuation */ + LBP_QU = 11, /* ambiguous quotation */ + LBP_EX = 12, /* exclamation/interrogation */ + LBP_ID = 13, /* ideographic */ + LBP_NU = 14, /* numeric */ + LBP_IS = 15, /* infix separator (numeric) */ + LBP_SY = 16, /* symbols allowing breaks */ + LBP_AL = 17, /* ordinary alphabetic and symbol characters */ + LBP_PR = 18, /* prefix (numeric) */ + LBP_PO = 19, /* postfix (numeric) */ + LBP_SA = 23, /* complex context (South East Asian) */ + LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ + LBP_XX = 25 /* unknown */ +}; + +#include "lbrkprop.h" + +static inline unsigned char +lbrkprop_lookup (unsigned int uc) +{ + unsigned int index1 = uc >> lbrkprop_header_0; + if (index1 < lbrkprop_header_1) + { + int lookup1 = lbrkprop.level1[index1]; + if (lookup1 >= 0) + { + unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; + int lookup2 = lbrkprop.level2[lookup1 + index2]; + if (lookup2 >= 0) + { + unsigned int index3 = uc & lbrkprop_header_4; + return lbrkprop.level3[lookup2 + index3]; + } + } + } + return LBP_XX; +} + +/* Table indexed by two line breaking classifications. */ +#define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */ +#define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */ +#define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */ +static const unsigned char lbrk_table[19][19] = { + /* after */ + /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */ +/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, }, +/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, +/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, +/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, +/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, }, +/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, }, +/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, }, +/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, }, +/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, +/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, +/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, }, +/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, }, +/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, +/* "" */ +/* before */ +}; +/* Note: The (B2,B2) entry should probably be D instead of P. */ +/* Note: The (PR,ID) entry should probably be D instead of I. */ + +void +u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p) +{ + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const unsigned char *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ + + /* Don't break inside multibyte characters. */ + memset (p, UC_BREAK_PROHIBITED, n); + + while (s < s_end) + { + unsigned int uc; + int count = u8_mbtouc (&uc, s, s_end - s); + int prop = lbrkprop_lookup (uc); + + if (prop == LBP_BK) + { + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; + seen_space = NULL; + seen_space2 = NULL; + } + else + { + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) + { + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; + } + + /* Deal with combining characters. */ + q = p; + if (prop == LBP_CM) + { + /* Don't break just before a combining character. */ + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_AL. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_AL; + goto lookup_via_table; + } + } + else if (prop == LBP_SP) + { + /* Don't break just before a space. */ + *p = UC_BREAK_PROHIBITED; + seen_space2 = seen_space; + seen_space = p; + } + else + { + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) + { + /* Don't break at the beginning of a line. */ + *q = UC_BREAK_PROHIBITED; + } + else + { + switch (lbrk_table [last_prop-1] [prop-1]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; + } + } + + s += count; + p += count; + } +} + +void +u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p) +{ + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const unsigned short *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ + + /* Don't break inside multibyte characters. */ + memset (p, UC_BREAK_PROHIBITED, n); + + while (s < s_end) + { + unsigned int uc; + int count = u16_mbtouc (&uc, s, s_end - s); + int prop = lbrkprop_lookup (uc); + + if (prop == LBP_BK) + { + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; + seen_space = NULL; + seen_space2 = NULL; + } + else + { + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) + { + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; + } + + /* Deal with combining characters. */ + q = p; + if (prop == LBP_CM) + { + /* Don't break just before a combining character. */ + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_AL. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_AL; + goto lookup_via_table; + } + } + else if (prop == LBP_SP) + { + /* Don't break just before a space. */ + *p = UC_BREAK_PROHIBITED; + seen_space2 = seen_space; + seen_space = p; + } + else + { + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) + { + /* Don't break at the beginning of a line. */ + *q = UC_BREAK_PROHIBITED; + } + else + { + switch (lbrk_table [last_prop-1] [prop-1]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; + } + } + + s += count; + p += count; + } +} + +void +u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p) +{ + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const unsigned int *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ + + while (s < s_end) + { + unsigned int uc = *s; + int prop = lbrkprop_lookup (uc); + + if (prop == LBP_BK) + { + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; + seen_space = NULL; + seen_space2 = NULL; + } + else + { + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) + { + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; + } + + /* Deal with combining characters. */ + q = p; + if (prop == LBP_CM) + { + /* Don't break just before a combining character. */ + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_AL. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_AL; + goto lookup_via_table; + } + } + else if (prop == LBP_SP) + { + /* Don't break just before a space. */ + *p = UC_BREAK_PROHIBITED; + seen_space2 = seen_space; + seen_space = p; + } + else + { + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) + { + /* Don't break at the beginning of a line. */ + *q = UC_BREAK_PROHIBITED; + } + else + { + switch (lbrk_table [last_prop-1] [prop-1]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; + } + } + + s++; + p++; + } +} + + +/* Choose the best line breaks, assuming the uc_width function. + Return the column after the end of the string. */ + +int +u8_width_linebreaks (const unsigned char *s, size_t n, + int width, int start_column, int at_end_columns, + const char *o, const char *encoding, + char *p) +{ + const unsigned char *s_end; + char *last_p; + int last_column; + int piece_width; + + u8_possible_linebreaks (s, n, encoding, p); + + s_end = s + n; + last_p = NULL; + last_column = start_column; + piece_width = 0; + while (s < s_end) + { + unsigned int uc; + int count = u8_mbtouc (&uc, s, s_end - s); + + /* Respect the override. */ + if (o != NULL && *o != UC_BREAK_UNDEFINED) + *p = *o; + + if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) + { + /* An atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + } + + if (*p == UC_BREAK_MANDATORY) + { + /* uc is a line break character. */ + /* Start a new piece at column 0. */ + last_p = NULL; + last_column = 0; + piece_width = 0; + } + else + { + /* uc is not a line break character. */ + int w; + + if (*p == UC_BREAK_POSSIBLE) + { + /* Start a new piece. */ + last_p = p; + last_column += piece_width; + piece_width = 0; + /* No line break for the moment, may be turned into + UC_BREAK_POSSIBLE later, via last_p. */ + } + + *p = UC_BREAK_PROHIBITED; + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + piece_width += w; + } + + s += count; + p += count; + if (o != NULL) + o += count; + } + + /* The last atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width + at_end_columns > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + + return last_column + piece_width; +} + +int +u16_width_linebreaks (const unsigned short *s, size_t n, + int width, int start_column, int at_end_columns, + const char *o, const char *encoding, + char *p) +{ + const unsigned short *s_end; + char *last_p; + int last_column; + int piece_width; + + u16_possible_linebreaks (s, n, encoding, p); + + s_end = s + n; + last_p = NULL; + last_column = start_column; + piece_width = 0; + while (s < s_end) + { + unsigned int uc; + int count = u16_mbtouc (&uc, s, s_end - s); + + /* Respect the override. */ + if (o != NULL && *o != UC_BREAK_UNDEFINED) + *p = *o; + + if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) + { + /* An atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + } + + if (*p == UC_BREAK_MANDATORY) + { + /* uc is a line break character. */ + /* Start a new piece at column 0. */ + last_p = NULL; + last_column = 0; + piece_width = 0; + } + else + { + /* uc is not a line break character. */ + int w; + + if (*p == UC_BREAK_POSSIBLE) + { + /* Start a new piece. */ + last_p = p; + last_column += piece_width; + piece_width = 0; + /* No line break for the moment, may be turned into + UC_BREAK_POSSIBLE later, via last_p. */ + } + + *p = UC_BREAK_PROHIBITED; + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + piece_width += w; + } + + s += count; + p += count; + if (o != NULL) + o += count; + } + + /* The last atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width + at_end_columns > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + + return last_column + piece_width; +} + +int +u32_width_linebreaks (const unsigned int *s, size_t n, + int width, int start_column, int at_end_columns, + const char *o, const char *encoding, + char *p) +{ + const unsigned int *s_end; + char *last_p; + int last_column; + int piece_width; + + u32_possible_linebreaks (s, n, encoding, p); + + s_end = s + n; + last_p = NULL; + last_column = start_column; + piece_width = 0; + while (s < s_end) + { + unsigned int uc = *s; + + /* Respect the override. */ + if (o != NULL && *o != UC_BREAK_UNDEFINED) + *p = *o; + + if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) + { + /* An atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + } + + if (*p == UC_BREAK_MANDATORY) + { + /* uc is a line break character. */ + /* Start a new piece at column 0. */ + last_p = NULL; + last_column = 0; + piece_width = 0; + } + else + { + /* uc is not a line break character. */ + int w; + + if (*p == UC_BREAK_POSSIBLE) + { + /* Start a new piece. */ + last_p = p; + last_column += piece_width; + piece_width = 0; + /* No line break for the moment, may be turned into + UC_BREAK_POSSIBLE later, via last_p. */ + } + + *p = UC_BREAK_PROHIBITED; + + w = uc_width (uc, encoding); + if (w >= 0) /* ignore control characters in the string */ + piece_width += w; + } + + s++; + p++; + if (o != NULL) + o++; + } + + /* The last atomic piece of text ends here. */ + if (last_p != NULL && last_column + piece_width + at_end_columns > width) + { + /* Insert a line break. */ + *last_p = UC_BREAK_POSSIBLE; + last_column = 0; + } + + return last_column + piece_width; +} + + +#ifdef TEST1 + +#include + +/* Read the contents of an input stream, and return it, terminated with a NUL + byte. */ +char * +read_file (FILE *stream) +{ +#define BUFSIZE 4096 + char *buf = NULL; + int alloc = 0; + int size = 0; + int count; + + while (! feof (stream)) + { + if (size + BUFSIZE > alloc) + { + alloc = alloc + alloc / 2; + if (alloc < size + BUFSIZE) + alloc = size + BUFSIZE; + buf = realloc (buf, alloc); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + } + count = fread (buf + size, 1, BUFSIZE, stream); + if (count == 0) + { + if (ferror (stream)) + { + perror ("fread"); + exit (1); + } + } + else + size += count; + } + buf = realloc (buf, size + 1); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + buf[size] = '\0'; + return buf; +#undef BUFSIZE +} + +int +main (int argc, char * argv[]) +{ + if (argc == 1) + { + /* Display all the break opportunities in the input string. */ + char *input = read_file (stdin); + int length = strlen (input); + char *breaks = malloc (length); + int i; + + u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks); + + for (i = 0; i < length; i++) + { + switch (breaks[i]) + { + case UC_BREAK_POSSIBLE: + /* U+2027 in UTF-8 encoding */ + putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); + break; + case UC_BREAK_MANDATORY: + /* U+21B2 (or U+21B5) in UTF-8 encoding */ + putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); + break; + case UC_BREAK_PROHIBITED: + break; + default: + abort (); + } + putc (input[i], stdout); + } + + free (breaks); + + return 0; + } + else if (argc == 2) + { + /* Insert line breaks for a given width. */ + int width = atoi (argv[1]); + char *input = read_file (stdin); + int length = strlen (input); + char *breaks = malloc (length); + int i; + + u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks); + + for (i = 0; i < length; i++) + { + switch (breaks[i]) + { + case UC_BREAK_POSSIBLE: + putc ('\n', stdout); + break; + case UC_BREAK_MANDATORY: + break; + case UC_BREAK_PROHIBITED: + break; + default: + abort (); + } + putc (input[i], stdout); + } + + free (breaks); + + return 0; + } + else + return 1; +} + +#endif /* TEST1 */ + + +/* Now the same thing with an arbitrary encoding. + + We convert the input string to Unicode. + + The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, + UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to + \U0000FFFF. UTF-16 and variants support only characters up to + \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. + UCS-4 specification leaves doubts about endianness and byte order mark. + glibc currently interprets it as big endian without byte order mark, + but this is not backed by an RFC. So we use UTF-8. It supports + characters up to \U7FFFFFFF and is unambiguously defined. */ + +#if HAVE_ICONV + +#include +#include + +/* Luckily, the encoding's name is platform independent. */ +#define UTF8_NAME "UTF-8" + +/* Return the length of a string after conversion through an iconv_t. */ +static size_t +iconv_string_length (iconv_t cd, const char *s, size_t n) +{ +#define TMPBUFSIZE 4096 + size_t count = 0; + char tmpbuf[TMPBUFSIZE]; + const char *inptr = s; + size_t insize = n; + while (insize > 0) + { + char *outptr = tmpbuf; + size_t outsize = TMPBUFSIZE; + size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); + if (res == (size_t)(-1)) + return (size_t)(-1); + count += outptr - tmpbuf; + } + /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ +#if defined _LIBICONV_VERSION \ + || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) + { + char *outptr = tmpbuf; + size_t outsize = TMPBUFSIZE; + size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); + if (res == (size_t)(-1)) + return (size_t)(-1); + count += outptr - tmpbuf; + } + /* Return to the initial state. */ + iconv (cd, NULL, NULL, NULL, NULL); +#endif + return count; +#undef TMPBUFSIZE +} + +static void +iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, + size_t *offtable, char *t, size_t m) +{ + size_t i; + const char *s_end; + const char *inptr; + char *outptr; + size_t outsize; + /* Avoid glibc-2.1 bug. */ +#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) + const size_t extra = 1; +#else + const size_t extra = 0; +#endif + + for (i = 0; i < n; i++) + offtable[i] = (size_t)(-1); + + s_end = s + n; + inptr = s; + outptr = t; + outsize = m + extra; + while (inptr < s_end) + { + const char *saved_inptr; + size_t insize; + size_t res; + + offtable[inptr - s] = outptr - t; + + saved_inptr = inptr; + res = (size_t)(-1); + for (insize = 1; inptr + insize <= s_end; insize++) + { + res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); + if (!(res == (size_t)(-1) && errno == EINVAL)) + break; + /* We expect that no input bytes have been consumed so far. */ + if (inptr != saved_inptr) + abort (); + } + /* After we verified the convertibility and computed the translation's + size m, there shouldn't be any conversion error here. */ + if (res == (size_t)(-1)) + abort (); + } + /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */ +#if defined _LIBICONV_VERSION \ + || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) + if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1)) + abort (); +#endif + /* We should have produced exactly m output bytes. */ + if (outsize != extra) + abort (); +} + +#endif /* HAVE_ICONV */ + +#if C_CTYPE_ASCII + +/* Tests whether a string is entirely ASCII. Returns 1 if yes. + Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */ +static int +is_all_ascii (const char *s, size_t n) +{ + for (; n > 0; s++, n--) + { + unsigned char c = (unsigned char) *s; + + if (!(c_isprint (c) || c_isspace (c))) + return 0; + } + return 1; +} + +#endif /* C_CTYPE_ASCII */ + +void +mbs_possible_linebreaks (const char *s, size_t n, const char *encoding, + char *p) +{ + if (n == 0) + return; + if (is_utf8_encoding (encoding)) + u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); + else + { +#if HAVE_ICONV + iconv_t to_utf8; + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) + to_utf8 = (iconv_t)(-1); + else +# endif + /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, + GB18030. */ +# if defined __sun && !defined _LIBICONV_VERSION + if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) + || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) + || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) + || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') + || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) + || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) + to_utf8 = (iconv_t)(-1); + else +# endif + to_utf8 = iconv_open (UTF8_NAME, encoding); + if (to_utf8 != (iconv_t)(-1)) + { + /* Determine the length of the resulting UTF-8 string. */ + size_t m = iconv_string_length (to_utf8, s, n); + if (m != (size_t)(-1)) + { + /* Convert the string to UTF-8 and build a translation table + from offsets into s to offsets into the translated string. */ + char *memory = malloc (n * sizeof (size_t) + m + m); + if (memory != NULL) + { + size_t *offtable = (size_t *) memory; + char *t = (char *) (offtable + n); + char *q = (char *) (t + m); + size_t i; + + iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); + + /* Determine the possible line breaks of the UTF-8 string. */ + u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q); + + /* Translate the result back to the original string. */ + memset (p, UC_BREAK_PROHIBITED, n); + for (i = 0; i < n; i++) + if (offtable[i] != (size_t)(-1)) + p[i] = q[offtable[i]]; + + free (memory); + iconv_close (to_utf8); + return; + } + } + iconv_close (to_utf8); + } +#endif + /* Impossible to convert. */ +#if C_CTYPE_ASCII + if (is_all_ascii (s, n)) + { + /* ASCII is a subset of UTF-8. */ + u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); + return; + } +#endif + /* We have a non-ASCII string and cannot convert it. + Don't produce line breaks except those already present in the + input string. All we assume here is that the encoding is + minimally ASCII compatible. */ + { + const char *s_end = s + n; + while (s < s_end) + { + *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); + s++; + p++; + } + } + } +} + +int +mbs_width_linebreaks (const char *s, size_t n, + int width, int start_column, int at_end_columns, + const char *o, const char *encoding, + char *p) +{ + if (n == 0) + return start_column; + if (is_utf8_encoding (encoding)) + return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); + else + { +#if HAVE_ICONV + iconv_t to_utf8; + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) + to_utf8 = (iconv_t)(-1); + else +# endif + /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, + GB18030. */ +# if defined __sun && !defined _LIBICONV_VERSION + if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) + || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) + || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) + || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') + || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) + || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) + to_utf8 = (iconv_t)(-1); + else +# endif + to_utf8 = iconv_open (UTF8_NAME, encoding); + if (to_utf8 != (iconv_t)(-1)) + { + /* Determine the length of the resulting UTF-8 string. */ + size_t m = iconv_string_length (to_utf8, s, n); + if (m != (size_t)(-1)) + { + /* Convert the string to UTF-8 and build a translation table + from offsets into s to offsets into the translated string. */ + char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0)); + if (memory != NULL) + { + size_t *offtable = (size_t *) memory; + char *t = (char *) (offtable + n); + char *q = (char *) (t + m); + char *o8 = (o != NULL ? (char *) (q + m) : NULL); + int res_column; + size_t i; + + iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); + + /* Translate the overrides to the UTF-8 string. */ + if (o != NULL) + { + memset (o8, UC_BREAK_UNDEFINED, m); + for (i = 0; i < n; i++) + if (offtable[i] != (size_t)(-1)) + o8[offtable[i]] = o[i]; + } + + /* Determine the line breaks of the UTF-8 string. */ + res_column = + u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q); + + /* Translate the result back to the original string. */ + memset (p, UC_BREAK_PROHIBITED, n); + for (i = 0; i < n; i++) + if (offtable[i] != (size_t)(-1)) + p[i] = q[offtable[i]]; + + free (memory); + iconv_close (to_utf8); + return res_column; + } + } + iconv_close (to_utf8); + } +#endif + /* Impossible to convert. */ +#if C_CTYPE_ASCII + if (is_all_ascii (s, n)) + { + /* ASCII is a subset of UTF-8. */ + return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); + } +#endif + /* We have a non-ASCII string and cannot convert it. + Don't produce line breaks except those already present in the + input string. All we assume here is that the encoding is + minimally ASCII compatible. */ + { + const char *s_end = s + n; + while (s < s_end) + { + *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' + ? UC_BREAK_MANDATORY + : UC_BREAK_PROHIBITED); + s++; + p++; + if (o != NULL) + o++; + } + /* We cannot compute widths in this case. */ + return start_column; + } + } +} + + +#ifdef TEST2 + +#include +#include + +/* Read the contents of an input stream, and return it, terminated with a NUL + byte. */ +char * +read_file (FILE *stream) +{ +#define BUFSIZE 4096 + char *buf = NULL; + int alloc = 0; + int size = 0; + int count; + + while (! feof (stream)) + { + if (size + BUFSIZE > alloc) + { + alloc = alloc + alloc / 2; + if (alloc < size + BUFSIZE) + alloc = size + BUFSIZE; + buf = realloc (buf, alloc); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + } + count = fread (buf + size, 1, BUFSIZE, stream); + if (count == 0) + { + if (ferror (stream)) + { + perror ("fread"); + exit (1); + } + } + else + size += count; + } + buf = realloc (buf, size + 1); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + buf[size] = '\0'; + return buf; +#undef BUFSIZE +} + +int +main (int argc, char * argv[]) +{ + setlocale (LC_CTYPE, ""); + if (argc == 1) + { + /* Display all the break opportunities in the input string. */ + char *input = read_file (stdin); + int length = strlen (input); + char *breaks = malloc (length); + int i; + + mbs_possible_linebreaks (input, length, locale_charset (), breaks); + + for (i = 0; i < length; i++) + { + switch (breaks[i]) + { + case UC_BREAK_POSSIBLE: + putc ('|', stdout); + break; + case UC_BREAK_MANDATORY: + break; + case UC_BREAK_PROHIBITED: + break; + default: + abort (); + } + putc (input[i], stdout); + } + + free (breaks); + + return 0; + } + else if (argc == 2) + { + /* Insert line breaks for a given width. */ + int width = atoi (argv[1]); + char *input = read_file (stdin); + int length = strlen (input); + char *breaks = malloc (length); + int i; + + mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); + + for (i = 0; i < length; i++) + { + switch (breaks[i]) + { + case UC_BREAK_POSSIBLE: + putc ('\n', stdout); + break; + case UC_BREAK_MANDATORY: + break; + case UC_BREAK_PROHIBITED: + break; + default: + abort (); + } + putc (input[i], stdout); + } + + free (breaks); + + return 0; + } + else + return 1; +} + +#endif /* TEST2 */