lib/linebreak.c

   1 /* linebreak.c - line breaking of Unicode strings
   2    Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5 This program is free software; you can redistribute it and/or modify
   6 it under the terms of the GNU General Public License as published by
   7 the Free Software Foundation; either version 2, or (at your option)
   8 any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; if not, write to the Free Software Foundation,
  17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  18
  19 #include <config.h>
  20
  21 /* Specification.  */
  22 #include "linebreak.h"
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include "c-ctype.h"
  27 #include "xsize.h"
  28
  29 #include "utf8-ucs4.h"
  30
  31 #include "utf16-ucs4.h"
  32
  33 #ifdef unused
  34 static inline int
  35 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
  36 {
  37   *puc = *s;
  38   return 1;
  39 }
  40 #endif
  41
  42
  43 /* Help GCC to generate good code for string comparisons with
  44    immediate strings. */
  45 #if defined (__GNUC__) && defined (__OPTIMIZE__)
  46
  47 static inline int
  48 streq9 (const char *s1, const char *s2)
  49 {
  50   return strcmp (s1 + 9, s2 + 9) == 0;
  51 }
  52
  53 static inline int
  54 streq8 (const char *s1, const char *s2, char s28)
  55 {
  56   if (s1[8] == s28)
  57     {
  58       if (s28 == 0)
  59         return 1;
  60       else
  61         return streq9 (s1, s2);
  62     }
  63   else
  64     return 0;
  65 }
  66
  67 static inline int
  68 streq7 (const char *s1, const char *s2, char s27, char s28)
  69 {
  70   if (s1[7] == s27)
  71     {
  72       if (s27 == 0)
  73         return 1;
  74       else
  75         return streq8 (s1, s2, s28);
  76     }
  77   else
  78     return 0;
  79 }
  80
  81 static inline int
  82 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
  83 {
  84   if (s1[6] == s26)
  85     {
  86       if (s26 == 0)
  87         return 1;
  88       else
  89         return streq7 (s1, s2, s27, s28);
  90     }
  91   else
  92     return 0;
  93 }
  94
  95 static inline int
  96 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
  97 {
  98   if (s1[5] == s25)
  99     {
 100       if (s25 == 0)
 101         return 1;
 102       else
 103         return streq6 (s1, s2, s26, s27, s28);
 104     }
 105   else
 106     return 0;
 107 }
 108
 109 static inline int
 110 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
 111 {
 112   if (s1[4] == s24)
 113     {
 114       if (s24 == 0)
 115         return 1;
 116       else
 117         return streq5 (s1, s2, s25, s26, s27, s28);
 118     }
 119   else
 120     return 0;
 121 }
 122
 123 static inline int
 124 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
 125 {
 126   if (s1[3] == s23)
 127     {
 128       if (s23 == 0)
 129         return 1;
 130       else
 131         return streq4 (s1, s2, s24, s25, s26, s27, s28);
 132     }
 133   else
 134     return 0;
 135 }
 136
 137 static inline int
 138 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 139 {
 140   if (s1[2] == s22)
 141     {
 142       if (s22 == 0)
 143         return 1;
 144       else
 145         return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
 146     }
 147   else
 148     return 0;
 149 }
 150
 151 static inline int
 152 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 153 {
 154   if (s1[1] == s21)
 155     {
 156       if (s21 == 0)
 157         return 1;
 158       else
 159         return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
 160     }
 161   else
 162     return 0;
 163 }
 164
 165 static inline int
 166 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 167 {
 168   if (s1[0] == s20)
 169     {
 170       if (s20 == 0)
 171         return 1;
 172       else
 173         return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
 174     }
 175   else
 176     return 0;
 177 }
 178
 179 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 180   streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
 181
 182 #else
 183
 184 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 185   (strcmp (s1, s2) == 0)
 186
 187 #endif
 188
 189
 190 static int
 191 is_cjk_encoding (const char *encoding)
 192 {
 193   if (0
 194       /* Legacy Japanese encodings */
 195       || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
 196       /* Legacy Chinese encodings */
 197       || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 198       || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
 199       || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
 200       || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
 201       /* Legacy Korean encodings */
 202       || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 203       || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
 204       || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
 205     return 1;
 206   return 0;
 207 }
 208
 209 static int
 210 is_utf8_encoding (const char *encoding)
 211 {
 212   if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
 213     return 1;
 214   return 0;
 215 }
 216
 217
 218 /* Determine number of column positions required for UC. */
 219 int uc_width (unsigned int uc, const char *encoding);
 220
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap covering 512
 * consecutive code points (8 code points per byte); which block belongs to
 * which code point range is given by nonspacing_table_ind.  Within a block,
 * the character UC is described by bit (UC & 7) of byte ((UC >> 3) & 63);
 * a set bit makes uc_width treat UC as a control or zero-width character.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data, one entry per 512 code points.
   Entry [UC >> 9] is the number of the 64-byte bitmap block that describes
   UC, or -1 if no bitmap is stored for that range.  The 240 entries cover
   the code points below 0x1e000; each row below holds 8 entries, i.e.
   0x1000 code points.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
 409
 410 /* Determine number of column positions required for UC. */
 411 int
 412 uc_width (unsigned int uc, const char *encoding)
 413 {
 414   /* Test for non-spacing or control character.  */
 415   if ((uc >> 9) < 240)
 416     {
 417       int ind = nonspacing_table_ind[uc >> 9];
 418       if (ind >= 0)
 419         if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
 420           {
 421             if (uc > 0 && uc < 0xa0)
 422               return -1;
 423             else
 424               return 0;
 425           }
 426     }
 427   else if ((uc >> 9) == (0xe0000 >> 9))
 428     {
 429       if (uc < 0xe0100
 430           ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
 431           : (uc <= 0xe01ef))
 432         return 0;
 433     }
 434   /* Test for double-width character.
 435    * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
 436    * and            "grep '^....;[^WF]' EastAsianWidth.txt"
 437    */
 438   if (uc >= 0x1100
 439       && ((uc < 0x1160) /* Hangul Jamo */
 440           || (uc >= 0x2e80 && uc < 0x4dc0  /* CJK */
 441               && !(uc == 0x303f))
 442           || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
 443           || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
 444           || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
 445           || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
 446           || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
 447           || (uc >= 0xffe0 && uc < 0xffe7)
 448           || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
 449           || (uc >= 0x30000 && uc <= 0x3fffd)
 450      )   )
 451     return 2;
 452   /* In ancient CJK encodings, Cyrillic and most other characters are
 453      double-width as well.  */
 454   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
 455       && is_cjk_encoding (encoding))
 456     return 2;
 457   return 1;
 458 }
 459
 460
 461 /* Determine number of column positions required for first N units
 462    (or fewer if S ends before this) in S.  */
 463
 464 int
 465 u8_width (const unsigned char *s, size_t n, const char *encoding)
 466 {
 467   const unsigned char *s_end = s + n;
 468   int width = 0;
 469
 470   while (s < s_end)
 471     {
 472       unsigned int uc;
 473       int w;
 474
 475       s += u8_mbtouc (&uc, s, s_end - s);
 476
 477       if (uc == 0)
 478         break; /* end of string reached */
 479
 480       w = uc_width (uc, encoding);
 481       if (w >= 0) /* ignore control characters in the string */
 482         width += w;
 483     }
 484
 485   return width;
 486 }
 487
 488 int
 489 u16_width (const unsigned short *s, size_t n, const char *encoding)
 490 {
 491   const unsigned short *s_end = s + n;
 492   int width = 0;
 493
 494   while (s < s_end)
 495     {
 496       unsigned int uc;
 497       int w;
 498
 499       s += u16_mbtouc (&uc, s, s_end - s);
 500
 501       if (uc == 0)
 502         break; /* end of string reached */
 503
 504       w = uc_width (uc, encoding);
 505       if (w >= 0) /* ignore control characters in the string */
 506         width += w;
 507     }
 508
 509   return width;
 510 }
 511
 512 int
 513 u32_width (const unsigned int *s, size_t n, const char *encoding)
 514 {
 515   const unsigned int *s_end = s + n;
 516   int width = 0;
 517
 518   while (s < s_end)
 519     {
 520       unsigned int uc = *s++;
 521       int w;
 522
 523       if (uc == 0)
 524         break; /* end of string reached */
 525
 526       w = uc_width (uc, encoding);
 527       if (w >= 0) /* ignore control characters in the string */
 528         width += w;
 529     }
 530
 531   return width;
 532 }
 533
 534
 535 /* Determine the line break points in S, and store the result at p[0..n-1].  */
 536 /* We don't support line breaking of complex-context dependent characters
 537    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
 538
 539 /* Line breaking classification.  */
 540
 541 enum
 542 {
 543   /* Values >= 20 are resolved at run time. */
 544   LBP_BK =  0, /* mandatory break */
 545 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 546 /*LBP_LF,         line feed - not used here because it's a DOSism */
 547   LBP_CM = 20, /* attached characters and combining marks */
 548 /*LBP_SG,         surrogates - not used here because they are not characters */
 549   LBP_ZW =  1, /* zero width space */
 550   LBP_IN =  2, /* inseparable */
 551   LBP_GL =  3, /* non-breaking (glue) */
 552   LBP_CB = 22, /* contingent break opportunity */
 553   LBP_SP = 21, /* space */
 554   LBP_BA =  4, /* break opportunity after */
 555   LBP_BB =  5, /* break opportunity before */
 556   LBP_B2 =  6, /* break opportunity before and after */
 557   LBP_HY =  7, /* hyphen */
 558   LBP_NS =  8, /* non starter */
 559   LBP_OP =  9, /* opening punctuation */
 560   LBP_CL = 10, /* closing punctuation */
 561   LBP_QU = 11, /* ambiguous quotation */
 562   LBP_EX = 12, /* exclamation/interrogation */
 563   LBP_ID = 13, /* ideographic */
 564   LBP_NU = 14, /* numeric */
 565   LBP_IS = 15, /* infix separator (numeric) */
 566   LBP_SY = 16, /* symbols allowing breaks */
 567   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
 568   LBP_PR = 18, /* prefix (numeric) */
 569   LBP_PO = 19, /* postfix (numeric) */
 570   LBP_SA = 23, /* complex context (South East Asian) */
 571   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
 572   LBP_XX = 25  /* unknown */
 573 };
 574
 575 #include "lbrkprop.h"
 576
 577 static inline unsigned char
 578 lbrkprop_lookup (unsigned int uc)
 579 {
 580   unsigned int index1 = uc >> lbrkprop_header_0;
 581   if (index1 < lbrkprop_header_1)
 582     {
 583       int lookup1 = lbrkprop.level1[index1];
 584       if (lookup1 >= 0)
 585         {
 586           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
 587           int lookup2 = lbrkprop.level2[lookup1 + index2];
 588           if (lookup2 >= 0)
 589             {
 590               unsigned int index3 = uc & lbrkprop_header_4;
 591               return lbrkprop.level3[lookup2 + index3];
 592             }
 593         }
 594     }
 595   return LBP_XX;
 596 }
 597
/* Table indexed by two line breaking classifications.
   Usage: lbrk_table[before - 1][after - 1], where 'before' is the class of
   the preceding non-space character and 'after' the class of the current
   character, both in the range LBP_ZW (1) .. LBP_PO (19).  The entry tells
   whether a line break is allowed between the two characters.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14;
                the break is possible only if spaces intervene */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
 627 /* Note: The (B2,B2) entry should probably be D instead of P.  */
 628 /* Note: The (PR,ID) entry should probably be D instead of I.  */
 629
 630 void
 631 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
 632 {
 633   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 634   const unsigned char *s_end = s + n;
 635   int last_prop = LBP_BK; /* line break property of last non-space character */
 636   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 637   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 638
 639   /* Don't break inside multibyte characters.  */
 640   memset (p, UC_BREAK_PROHIBITED, n);
 641
 642   while (s < s_end)
 643     {
 644       unsigned int uc;
 645       int count = u8_mbtouc (&uc, s, s_end - s);
 646       int prop = lbrkprop_lookup (uc);
 647
 648       if (prop == LBP_BK)
 649         {
 650           /* Mandatory break.  */
 651           *p = UC_BREAK_MANDATORY;
 652           last_prop = LBP_BK;
 653           seen_space = NULL;
 654           seen_space2 = NULL;
 655         }
 656       else
 657         {
 658           char *q;
 659
 660           /* Resolve property values whose behaviour is not fixed.  */
 661           switch (prop)
 662             {
 663               case LBP_AI:
 664                 /* Resolve ambiguous.  */
 665                 prop = LBP_AI_REPLACEMENT;
 666                 break;
 667               case LBP_CB:
 668                 /* This is arbitrary.  */
 669                 prop = LBP_ID;
 670                 break;
 671               case LBP_SA:
 672                 /* We don't handle complex scripts yet.
 673                    Treat LBP_SA like LBP_XX.  */
 674               case LBP_XX:
 675                 /* This is arbitrary.  */
 676                 prop = LBP_AL;
 677                 break;
 678             }
 679
 680           /* Deal with combining characters.  */
 681           q = p;
 682           if (prop == LBP_CM)
 683             {
 684               /* Don't break just before a combining character.  */
 685               *p = UC_BREAK_PROHIBITED;
 686               /* A combining character turns a preceding space into LBP_AL.  */
 687               if (seen_space != NULL)
 688                 {
 689                   q = seen_space;
 690                   seen_space = seen_space2;
 691                   prop = LBP_AL;
 692                   goto lookup_via_table;
 693                 }
 694             }
 695           else if (prop == LBP_SP)
 696             {
 697               /* Don't break just before a space.  */
 698               *p = UC_BREAK_PROHIBITED;
 699               seen_space2 = seen_space;
 700               seen_space = p;
 701             }
 702           else
 703             {
 704              lookup_via_table:
 705               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 706               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 707                 abort ();
 708
 709               if (last_prop == LBP_BK)
 710                 {
 711                   /* Don't break at the beginning of a line.  */
 712                   *q = UC_BREAK_PROHIBITED;
 713                 }
 714               else
 715                 {
 716                   switch (lbrk_table [last_prop-1] [prop-1])
 717                     {
 718                       case D:
 719                         *q = UC_BREAK_POSSIBLE;
 720                         break;
 721                       case I:
 722                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 723                         break;
 724                       case P:
 725                         *q = UC_BREAK_PROHIBITED;
 726                         break;
 727                       default:
 728                         abort ();
 729                     }
 730                 }
 731               last_prop = prop;
 732               seen_space = NULL;
 733               seen_space2 = NULL;
 734             }
 735         }
 736
 737       s += count;
 738       p += count;
 739     }
 740 }
 741
 742 void
 743 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
 744 {
 745   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 746   const unsigned short *s_end = s + n;
 747   int last_prop = LBP_BK; /* line break property of last non-space character */
 748   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 749   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 750
 751   /* Don't break inside multibyte characters.  */
 752   memset (p, UC_BREAK_PROHIBITED, n);
 753
 754   while (s < s_end)
 755     {
 756       unsigned int uc;
 757       int count = u16_mbtouc (&uc, s, s_end - s);
 758       int prop = lbrkprop_lookup (uc);
 759
 760       if (prop == LBP_BK)
 761         {
 762           /* Mandatory break.  */
 763           *p = UC_BREAK_MANDATORY;
 764           last_prop = LBP_BK;
 765           seen_space = NULL;
 766           seen_space2 = NULL;
 767         }
 768       else
 769         {
 770           char *q;
 771
 772           /* Resolve property values whose behaviour is not fixed.  */
 773           switch (prop)
 774             {
 775               case LBP_AI:
 776                 /* Resolve ambiguous.  */
 777                 prop = LBP_AI_REPLACEMENT;
 778                 break;
 779               case LBP_CB:
 780                 /* This is arbitrary.  */
 781                 prop = LBP_ID;
 782                 break;
 783               case LBP_SA:
 784                 /* We don't handle complex scripts yet.
 785                    Treat LBP_SA like LBP_XX.  */
 786               case LBP_XX:
 787                 /* This is arbitrary.  */
 788                 prop = LBP_AL;
 789                 break;
 790             }
 791
 792           /* Deal with combining characters.  */
 793           q = p;
 794           if (prop == LBP_CM)
 795             {
 796               /* Don't break just before a combining character.  */
 797               *p = UC_BREAK_PROHIBITED;
 798               /* A combining character turns a preceding space into LBP_AL.  */
 799               if (seen_space != NULL)
 800                 {
 801                   q = seen_space;
 802                   seen_space = seen_space2;
 803                   prop = LBP_AL;
 804                   goto lookup_via_table;
 805                 }
 806             }
 807           else if (prop == LBP_SP)
 808             {
 809               /* Don't break just before a space.  */
 810               *p = UC_BREAK_PROHIBITED;
 811               seen_space2 = seen_space;
 812               seen_space = p;
 813             }
 814           else
 815             {
 816              lookup_via_table:
 817               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 818               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 819                 abort ();
 820
 821               if (last_prop == LBP_BK)
 822                 {
 823                   /* Don't break at the beginning of a line.  */
 824                   *q = UC_BREAK_PROHIBITED;
 825                 }
 826               else
 827                 {
 828                   switch (lbrk_table [last_prop-1] [prop-1])
 829                     {
 830                       case D:
 831                         *q = UC_BREAK_POSSIBLE;
 832                         break;
 833                       case I:
 834                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 835                         break;
 836                       case P:
 837                         *q = UC_BREAK_PROHIBITED;
 838                         break;
 839                       default:
 840                         abort ();
 841                     }
 842                 }
 843               last_prop = prop;
 844               seen_space = NULL;
 845               seen_space2 = NULL;
 846             }
 847         }
 848
 849       s += count;
 850       p += count;
 851     }
 852 }
 853
 854 void
 855 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
 856 {
 857   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 858   const unsigned int *s_end = s + n;
 859   int last_prop = LBP_BK; /* line break property of last non-space character */
 860   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 861   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 862
 863   while (s < s_end)
 864     {
 865       unsigned int uc = *s;
 866       int prop = lbrkprop_lookup (uc);
 867
 868       if (prop == LBP_BK)
 869         {
 870           /* Mandatory break.  */
 871           *p = UC_BREAK_MANDATORY;
 872           last_prop = LBP_BK;
 873           seen_space = NULL;
 874           seen_space2 = NULL;
 875         }
 876       else
 877         {
 878           char *q;
 879
 880           /* Resolve property values whose behaviour is not fixed.  */
 881           switch (prop)
 882             {
 883               case LBP_AI:
 884                 /* Resolve ambiguous.  */
 885                 prop = LBP_AI_REPLACEMENT;
 886                 break;
 887               case LBP_CB:
 888                 /* This is arbitrary.  */
 889                 prop = LBP_ID;
 890                 break;
 891               case LBP_SA:
 892                 /* We don't handle complex scripts yet.
 893                    Treat LBP_SA like LBP_XX.  */
 894               case LBP_XX:
 895                 /* This is arbitrary.  */
 896                 prop = LBP_AL;
 897                 break;
 898             }
 899
 900           /* Deal with combining characters.  */
 901           q = p;
 902           if (prop == LBP_CM)
 903             {
 904               /* Don't break just before a combining character.  */
 905               *p = UC_BREAK_PROHIBITED;
 906               /* A combining character turns a preceding space into LBP_AL.  */
 907               if (seen_space != NULL)
 908                 {
 909                   q = seen_space;
 910                   seen_space = seen_space2;
 911                   prop = LBP_AL;
 912                   goto lookup_via_table;
 913                 }
 914             }
 915           else if (prop == LBP_SP)
 916             {
 917               /* Don't break just before a space.  */
 918               *p = UC_BREAK_PROHIBITED;
 919               seen_space2 = seen_space;
 920               seen_space = p;
 921             }
 922           else
 923             {
 924              lookup_via_table:
 925               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 926               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 927                 abort ();
 928
 929               if (last_prop == LBP_BK)
 930                 {
 931                   /* Don't break at the beginning of a line.  */
 932                   *q = UC_BREAK_PROHIBITED;
 933                 }
 934               else
 935                 {
 936                   switch (lbrk_table [last_prop-1] [prop-1])
 937                     {
 938                       case D:
 939                         *q = UC_BREAK_POSSIBLE;
 940                         break;
 941                       case I:
 942                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 943                         break;
 944                       case P:
 945                         *q = UC_BREAK_PROHIBITED;
 946                         break;
 947                       default:
 948                         abort ();
 949                     }
 950                 }
 951               last_prop = prop;
 952               seen_space = NULL;
 953               seen_space2 = NULL;
 954             }
 955         }
 956
 957       s++;
 958       p++;
 959     }
 960 }
 961
 962
 963 /* Choose the best line breaks, assuming the uc_width function.
 964    Return the column after the end of the string.  */
 965
 966 int
 967 u8_width_linebreaks (const unsigned char *s, size_t n,
 968                      int width, int start_column, int at_end_columns,
 969                      const char *o, const char *encoding,
 970                      char *p)
 971 {
 972   const unsigned char *s_end;
 973   char *last_p;
 974   int last_column;
 975   int piece_width;
 976
 977   u8_possible_linebreaks (s, n, encoding, p);
 978
 979   s_end = s + n;
 980   last_p = NULL;
 981   last_column = start_column;
 982   piece_width = 0;
 983   while (s < s_end)
 984     {
 985       unsigned int uc;
 986       int count = u8_mbtouc (&uc, s, s_end - s);
 987
 988       /* Respect the override.  */
 989       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 990         *p = *o;
 991
 992       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 993         {
 994           /* An atomic piece of text ends here.  */
 995           if (last_p != NULL && last_column + piece_width > width)
 996             {
 997               /* Insert a line break.  */
 998               *last_p = UC_BREAK_POSSIBLE;
 999               last_column = 0;
1000             }
1001         }
1002
1003       if (*p == UC_BREAK_MANDATORY)
1004         {
1005           /* uc is a line break character.  */
1006           /* Start a new piece at column 0.  */
1007           last_p = NULL;
1008           last_column = 0;
1009           piece_width = 0;
1010         }
1011       else
1012         {
1013           /* uc is not a line break character.  */
1014           int w;
1015
1016           if (*p == UC_BREAK_POSSIBLE)
1017             {
1018               /* Start a new piece.  */
1019               last_p = p;
1020               last_column += piece_width;
1021               piece_width = 0;
1022               /* No line break for the moment, may be turned into
1023                  UC_BREAK_POSSIBLE later, via last_p. */
1024             }
1025
1026           *p = UC_BREAK_PROHIBITED;
1027
1028           w = uc_width (uc, encoding);
1029           if (w >= 0) /* ignore control characters in the string */
1030             piece_width += w;
1031          }
1032
1033       s += count;
1034       p += count;
1035       if (o != NULL)
1036         o += count;
1037     }
1038
1039   /* The last atomic piece of text ends here.  */
1040   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1041     {
1042       /* Insert a line break.  */
1043       *last_p = UC_BREAK_POSSIBLE;
1044       last_column = 0;
1045     }
1046
1047   return last_column + piece_width;
1048 }
1049
1050 int
1051 u16_width_linebreaks (const unsigned short *s, size_t n,
1052                       int width, int start_column, int at_end_columns,
1053                       const char *o, const char *encoding,
1054                       char *p)
1055 {
1056   const unsigned short *s_end;
1057   char *last_p;
1058   int last_column;
1059   int piece_width;
1060
1061   u16_possible_linebreaks (s, n, encoding, p);
1062
1063   s_end = s + n;
1064   last_p = NULL;
1065   last_column = start_column;
1066   piece_width = 0;
1067   while (s < s_end)
1068     {
1069       unsigned int uc;
1070       int count = u16_mbtouc (&uc, s, s_end - s);
1071
1072       /* Respect the override.  */
1073       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1074         *p = *o;
1075
1076       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1077         {
1078           /* An atomic piece of text ends here.  */
1079           if (last_p != NULL && last_column + piece_width > width)
1080             {
1081               /* Insert a line break.  */
1082               *last_p = UC_BREAK_POSSIBLE;
1083               last_column = 0;
1084             }
1085         }
1086
1087       if (*p == UC_BREAK_MANDATORY)
1088         {
1089           /* uc is a line break character.  */
1090           /* Start a new piece at column 0.  */
1091           last_p = NULL;
1092           last_column = 0;
1093           piece_width = 0;
1094         }
1095       else
1096         {
1097           /* uc is not a line break character.  */
1098           int w;
1099
1100           if (*p == UC_BREAK_POSSIBLE)
1101             {
1102               /* Start a new piece.  */
1103               last_p = p;
1104               last_column += piece_width;
1105               piece_width = 0;
1106               /* No line break for the moment, may be turned into
1107                  UC_BREAK_POSSIBLE later, via last_p. */
1108             }
1109
1110           *p = UC_BREAK_PROHIBITED;
1111
1112           w = uc_width (uc, encoding);
1113           if (w >= 0) /* ignore control characters in the string */
1114             piece_width += w;
1115          }
1116
1117       s += count;
1118       p += count;
1119       if (o != NULL)
1120         o += count;
1121     }
1122
1123   /* The last atomic piece of text ends here.  */
1124   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1125     {
1126       /* Insert a line break.  */
1127       *last_p = UC_BREAK_POSSIBLE;
1128       last_column = 0;
1129     }
1130
1131   return last_column + piece_width;
1132 }
1133
1134 int
1135 u32_width_linebreaks (const unsigned int *s, size_t n,
1136                       int width, int start_column, int at_end_columns,
1137                       const char *o, const char *encoding,
1138                       char *p)
1139 {
1140   const unsigned int *s_end;
1141   char *last_p;
1142   int last_column;
1143   int piece_width;
1144
1145   u32_possible_linebreaks (s, n, encoding, p);
1146
1147   s_end = s + n;
1148   last_p = NULL;
1149   last_column = start_column;
1150   piece_width = 0;
1151   while (s < s_end)
1152     {
1153       unsigned int uc = *s;
1154
1155       /* Respect the override.  */
1156       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1157         *p = *o;
1158
1159       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1160         {
1161           /* An atomic piece of text ends here.  */
1162           if (last_p != NULL && last_column + piece_width > width)
1163             {
1164               /* Insert a line break.  */
1165               *last_p = UC_BREAK_POSSIBLE;
1166               last_column = 0;
1167             }
1168         }
1169
1170       if (*p == UC_BREAK_MANDATORY)
1171         {
1172           /* uc is a line break character.  */
1173           /* Start a new piece at column 0.  */
1174           last_p = NULL;
1175           last_column = 0;
1176           piece_width = 0;
1177         }
1178       else
1179         {
1180           /* uc is not a line break character.  */
1181           int w;
1182
1183           if (*p == UC_BREAK_POSSIBLE)
1184             {
1185               /* Start a new piece.  */
1186               last_p = p;
1187               last_column += piece_width;
1188               piece_width = 0;
1189               /* No line break for the moment, may be turned into
1190                  UC_BREAK_POSSIBLE later, via last_p. */
1191             }
1192
1193           *p = UC_BREAK_PROHIBITED;
1194
1195           w = uc_width (uc, encoding);
1196           if (w >= 0) /* ignore control characters in the string */
1197             piece_width += w;
1198          }
1199
1200       s++;
1201       p++;
1202       if (o != NULL)
1203         o++;
1204     }
1205
1206   /* The last atomic piece of text ends here.  */
1207   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1208     {
1209       /* Insert a line break.  */
1210       *last_p = UC_BREAK_POSSIBLE;
1211       last_column = 0;
1212     }
1213
1214   return last_column + piece_width;
1215 }
1216
1217
1218 #ifdef TEST1
1219
1220 #include <stdio.h>
1221
/* Read the entire contents of STREAM into a freshly allocated buffer and
   return it, terminated with a NUL byte.  The caller releases the buffer
   with free().  On a read error or when memory cannot be obtained, a
   message is written to stderr and the process exits with status 1.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK = 4096 };        /* bytes requested from each fread call */
  char *buf = NULL;
  size_t alloc = 0;             /* bytes currently allocated for buf */
  size_t size = 0;              /* bytes of buf filled so far */
  char *fitted;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits behind the data read so far.
         size <= alloc always holds, so the subtraction cannot wrap.  */
      if (alloc - size < CHUNK)
        {
          size_t wanted;
          char *grown;

          if (size > (size_t)(-1) - CHUNK)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by half; a sum that is too small, or that wrapped around,
             is replaced by the minimum that is needed.  */
          wanted = alloc + alloc / 2;
          if (wanted < size + CHUNK)
            wanted = size + CHUNK;
          grown = realloc (buf, wanted);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
          alloc = wanted;
        }

      count = fread (buf + size, 1, CHUNK, stream);
      size += count;
      /* Report a read error right away, even after a partial read.  */
      if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim the buffer to the data plus the terminating NUL.  */
  fitted = realloc (buf, size + 1);
  if (fitted == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  fitted[size] = '\0';
  return fitted;
}
1269
1270 int
1271 main (int argc, char * argv[])
1272 {
1273   if (argc == 1)
1274     {
1275       /* Display all the break opportunities in the input string.  */
1276       char *input = read_file (stdin);
1277       int length = strlen (input);
1278       char *breaks = malloc (length);
1279       int i;
1280
1281       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1282
1283       for (i = 0; i < length; i++)
1284         {
1285           switch (breaks[i])
1286             {
1287               case UC_BREAK_POSSIBLE:
1288                 /* U+2027 in UTF-8 encoding */
1289                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1290                 break;
1291               case UC_BREAK_MANDATORY:
1292                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1293                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1294                 break;
1295               case UC_BREAK_PROHIBITED:
1296                 break;
1297               default:
1298                 abort ();
1299             }
1300           putc (input[i], stdout);
1301         }
1302
1303       free (breaks);
1304
1305       return 0;
1306     }
1307   else if (argc == 2)
1308     {
1309       /* Insert line breaks for a given width.  */
1310       int width = atoi (argv[1]);
1311       char *input = read_file (stdin);
1312       int length = strlen (input);
1313       char *breaks = malloc (length);
1314       int i;
1315
1316       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1317
1318       for (i = 0; i < length; i++)
1319         {
1320           switch (breaks[i])
1321             {
1322               case UC_BREAK_POSSIBLE:
1323                 putc ('\n', stdout);
1324                 break;
1325               case UC_BREAK_MANDATORY:
1326                 break;
1327               case UC_BREAK_PROHIBITED:
1328                 break;
1329               default:
1330                 abort ();
1331             }
1332           putc (input[i], stdout);
1333         }
1334
1335       free (breaks);
1336
1337       return 0;
1338     }
1339   else
1340     return 1;
1341 }
1342
1343 #endif /* TEST1 */
1344
1345
1346 /* Now the same thing with an arbitrary encoding.
1347
1348    We convert the input string to Unicode.
1349
1350    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1351    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
1352    \U0000FFFF.  UTF-16 and variants support only characters up to
1353    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
1354    UCS-4 specification leaves doubts about endianness and byte order mark.
1355    glibc currently interprets it as big endian without byte order mark,
1356    but this is not backed by an RFC.  So we use UTF-8. It supports
1357    characters up to \U7FFFFFFF and is unambiguously defined.  */
1358
1359 #if HAVE_ICONV
1360
1361 #include <iconv.h>
1362 #include <errno.h>
1363
1364 /* Luckily, the encoding's name is platform independent.  */
1365 #define UTF8_NAME "UTF-8"
1366
1367 /* Return the length of a string after conversion through an iconv_t.  */
1368 static size_t
1369 iconv_string_length (iconv_t cd, const char *s, size_t n)
1370 {
1371 #define TMPBUFSIZE 4096
1372   size_t count = 0;
1373   char tmpbuf[TMPBUFSIZE];
1374   const char *inptr = s;
1375   size_t insize = n;
1376   while (insize > 0)
1377     {
1378       char *outptr = tmpbuf;
1379       size_t outsize = TMPBUFSIZE;
1380       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1381       if (res == (size_t)(-1) && errno != E2BIG)
1382         return (size_t)(-1);
1383       count += outptr - tmpbuf;
1384     }
1385   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
1386 #if defined _LIBICONV_VERSION \
1387     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1388   {
1389     char *outptr = tmpbuf;
1390     size_t outsize = TMPBUFSIZE;
1391     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1392     if (res == (size_t)(-1))
1393       return (size_t)(-1);
1394     count += outptr - tmpbuf;
1395   }
1396   /* Return to the initial state.  */
1397   iconv (cd, NULL, NULL, NULL, NULL);
1398 #endif
1399   return count;
1400 #undef TMPBUFSIZE
1401 }
1402
1403 static void
1404 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1405                               size_t *offtable, char *t, size_t m)
1406 {
1407   size_t i;
1408   const char *s_end;
1409   const char *inptr;
1410   char *outptr;
1411   size_t outsize;
1412   /* Avoid glibc-2.1 bug.  */
1413 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1414   const size_t extra = 1;
1415 #else
1416   const size_t extra = 0;
1417 #endif
1418
1419   for (i = 0; i < n; i++)
1420     offtable[i] = (size_t)(-1);
1421
1422   s_end = s + n;
1423   inptr = s;
1424   outptr = t;
1425   outsize = m + extra;
1426   while (inptr < s_end)
1427     {
1428       const char *saved_inptr;
1429       size_t insize;
1430       size_t res;
1431
1432       offtable[inptr - s] = outptr - t;
1433
1434       saved_inptr = inptr;
1435       res = (size_t)(-1);
1436       for (insize = 1; inptr + insize <= s_end; insize++)
1437         {
1438           res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1439           if (!(res == (size_t)(-1) && errno == EINVAL))
1440             break;
1441           /* We expect that no input bytes have been consumed so far.  */
1442           if (inptr != saved_inptr)
1443             abort ();
1444         }
1445       /* After we verified the convertibility and computed the translation's
1446          size m, there shouldn't be any conversion error here. */
1447       if (res == (size_t)(-1))
1448         abort ();
1449     }
1450   /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
1451 #if defined _LIBICONV_VERSION \
1452     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1453   if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1454     abort ();
1455 #endif
1456   /* We should have produced exactly m output bytes.  */
1457   if (outsize != extra)
1458     abort ();
1459 }
1460
1461 #endif /* HAVE_ICONV */
1462
1463 #if C_CTYPE_ASCII
1464
/* Checks that each of the N bytes at S satisfies c_isprint or c_isspace.
   Returns 1 if they all do, 0 as soon as one does not.  A string in an
   8-bit encoding or an ISO-2022 encoding therefore yields 0.  */
static int
is_all_ascii (const char *s, size_t n)
{
  const char *const end = s + n;
  const char *cursor;

  for (cursor = s; cursor != end; cursor++)
    {
      unsigned char c = (unsigned char) *cursor;

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
1479
1480 #endif /* C_CTYPE_ASCII */
1481
1482 void
1483 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1484                          char *p)
1485 {
1486   if (n == 0)
1487     return;
1488   if (is_utf8_encoding (encoding))
1489     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1490   else
1491     {
1492 #if HAVE_ICONV
1493       iconv_t to_utf8;
1494       /* Avoid glibc-2.1 bug with EUC-KR.  */
1495 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1496       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1497         to_utf8 = (iconv_t)(-1);
1498       else
1499 # endif
1500       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1501          GB18030.  */
1502 # if defined __sun && !defined _LIBICONV_VERSION
1503       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1504           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1505           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1506           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1507           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1508           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1509         to_utf8 = (iconv_t)(-1);
1510       else
1511 # endif
1512       to_utf8 = iconv_open (UTF8_NAME, encoding);
1513       if (to_utf8 != (iconv_t)(-1))
1514         {
1515           /* Determine the length of the resulting UTF-8 string.  */
1516           size_t m = iconv_string_length (to_utf8, s, n);
1517           if (m != (size_t)(-1))
1518             {
1519               /* Convert the string to UTF-8 and build a translation table
1520                  from offsets into s to offsets into the translated string.  */
1521               size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1522               char *memory =
1523                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1524               if (memory != NULL)
1525                 {
1526                   size_t *offtable = (size_t *) memory;
1527                   char *t = (char *) (offtable + n);
1528                   char *q = (char *) (t + m);
1529                   size_t i;
1530
1531                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1532
1533                   /* Determine the possible line breaks of the UTF-8 string.  */
1534                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1535
1536                   /* Translate the result back to the original string.  */
1537                   memset (p, UC_BREAK_PROHIBITED, n);
1538                   for (i = 0; i < n; i++)
1539                     if (offtable[i] != (size_t)(-1))
1540                       p[i] = q[offtable[i]];
1541
1542                   free (memory);
1543                   iconv_close (to_utf8);
1544                   return;
1545                 }
1546             }
1547           iconv_close (to_utf8);
1548         }
1549 #endif
1550       /* Impossible to convert.  */
1551 #if C_CTYPE_ASCII
1552       if (is_all_ascii (s, n))
1553         {
1554           /* ASCII is a subset of UTF-8.  */
1555           u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1556           return;
1557         }
1558 #endif
1559       /* We have a non-ASCII string and cannot convert it.
1560          Don't produce line breaks except those already present in the
1561          input string.  All we assume here is that the encoding is
1562          minimally ASCII compatible.  */
1563       {
1564         const char *s_end = s + n;
1565         while (s < s_end)
1566           {
1567             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1568             s++;
1569             p++;
1570           }
1571       }
1572     }
1573 }
1574
1575 int
1576 mbs_width_linebreaks (const char *s, size_t n,
1577                       int width, int start_column, int at_end_columns,
1578                       const char *o, const char *encoding,
1579                       char *p)
1580 {
1581   if (n == 0)
1582     return start_column;
1583   if (is_utf8_encoding (encoding))
1584     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1585   else
1586     {
1587 #if HAVE_ICONV
1588       iconv_t to_utf8;
1589       /* Avoid glibc-2.1 bug with EUC-KR.  */
1590 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1591       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1592         to_utf8 = (iconv_t)(-1);
1593       else
1594 # endif
1595       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1596          GB18030.  */
1597 # if defined __sun && !defined _LIBICONV_VERSION
1598       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1599           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1600           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1601           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1602           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1603           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1604         to_utf8 = (iconv_t)(-1);
1605       else
1606 # endif
1607       to_utf8 = iconv_open (UTF8_NAME, encoding);
1608       if (to_utf8 != (iconv_t)(-1))
1609         {
1610           /* Determine the length of the resulting UTF-8 string.  */
1611           size_t m = iconv_string_length (to_utf8, s, n);
1612           if (m != (size_t)(-1))
1613             {
1614               /* Convert the string to UTF-8 and build a translation table
1615                  from offsets into s to offsets into the translated string.  */
1616               size_t memory_size =
1617                 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1618                        (o != NULL ? m : 0));
1619               char *memory =
1620                 (char *)
1621                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1622               if (memory != NULL)
1623                 {
1624                   size_t *offtable = (size_t *) memory;
1625                   char *t = (char *) (offtable + n);
1626                   char *q = (char *) (t + m);
1627                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1628                   int res_column;
1629                   size_t i;
1630
1631                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1632
1633                   /* Translate the overrides to the UTF-8 string.  */
1634                   if (o != NULL)
1635                     {
1636                       memset (o8, UC_BREAK_UNDEFINED, m);
1637                       for (i = 0; i < n; i++)
1638                         if (offtable[i] != (size_t)(-1))
1639                           o8[offtable[i]] = o[i];
1640                     }
1641
1642                   /* Determine the line breaks of the UTF-8 string.  */
1643                   res_column =
1644                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1645
1646                   /* Translate the result back to the original string.  */
1647                   memset (p, UC_BREAK_PROHIBITED, n);
1648                   for (i = 0; i < n; i++)
1649                     if (offtable[i] != (size_t)(-1))
1650                       p[i] = q[offtable[i]];
1651
1652                   free (memory);
1653                   iconv_close (to_utf8);
1654                   return res_column;
1655                 }
1656             }
1657           iconv_close (to_utf8);
1658         }
1659 #endif
1660       /* Impossible to convert.  */
1661 #if C_CTYPE_ASCII
1662       if (is_all_ascii (s, n))
1663         {
1664           /* ASCII is a subset of UTF-8.  */
1665           return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1666         }
1667 #endif
1668       /* We have a non-ASCII string and cannot convert it.
1669          Don't produce line breaks except those already present in the
1670          input string.  All we assume here is that the encoding is
1671          minimally ASCII compatible.  */
1672       {
1673         const char *s_end = s + n;
1674         while (s < s_end)
1675           {
1676             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1677                   ? UC_BREAK_MANDATORY
1678                   : UC_BREAK_PROHIBITED);
1679             s++;
1680             p++;
1681             if (o != NULL)
1682               o++;
1683           }
1684         /* We cannot compute widths in this case.  */
1685         return start_column;
1686       }
1687     }
1688 }
1689
1690
1691 #ifdef TEST2
1692
1693 #include <stdio.h>
1694 #include <locale.h>
1695
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc(); the caller may release it with
   free().  If reading fails or memory cannot be allocated, a message is
   written to stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
  /* Number of bytes requested from the stream per fread() call.  */
  const size_t chunk_size = 4096;
  char *buf = NULL;
  char *result;
  size_t alloc = 0;     /* Number of bytes allocated for buf.  */
  size_t size = 0;      /* Number of bytes of buf filled so far.  */

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk.  Since size <= alloc
         always holds, the subtraction cannot wrap around.  */
      if (alloc - size < chunk_size)
        {
          size_t new_alloc;
          char *new_buf;

          /* A byte count that does not fit in size_t can never be
             allocated; treat it like an allocation failure.  */
          if (size > (size_t)(-1) - chunk_size)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by 50%, but at least by one chunk.  If the 50% growth
             wrapped around, new_alloc is smaller than alloc and therefore
             also smaller than size + chunk_size, so the minimum applies.  */
          new_alloc = alloc + alloc / 2;
          if (new_alloc < size + chunk_size)
            new_alloc = size + chunk_size;

          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, chunk_size, stream);
      size += count;
      /* A short read means either end of file or an error; report an
         error as soon as it is seen.  */
      if (count < chunk_size && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Resize to the exact length, including the terminating NUL byte.  When
     nothing was read, buf is still NULL and realloc() acts like malloc().  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
1743
1744 int
1745 main (int argc, char * argv[])
1746 {
1747   setlocale (LC_CTYPE, "");
1748   if (argc == 1)
1749     {
1750       /* Display all the break opportunities in the input string.  */
1751       char *input = read_file (stdin);
1752       int length = strlen (input);
1753       char *breaks = malloc (length);
1754       int i;
1755
1756       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1757
1758       for (i = 0; i < length; i++)
1759         {
1760           switch (breaks[i])
1761             {
1762               case UC_BREAK_POSSIBLE:
1763                 putc ('|', stdout);
1764                 break;
1765               case UC_BREAK_MANDATORY:
1766                 break;
1767               case UC_BREAK_PROHIBITED:
1768                 break;
1769               default:
1770                 abort ();
1771             }
1772           putc (input[i], stdout);
1773         }
1774
1775       free (breaks);
1776
1777       return 0;
1778     }
1779   else if (argc == 2)
1780     {
1781       /* Insert line breaks for a given width.  */
1782       int width = atoi (argv[1]);
1783       char *input = read_file (stdin);
1784       int length = strlen (input);
1785       char *breaks = malloc (length);
1786       int i;
1787
1788       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1789
1790       for (i = 0; i < length; i++)
1791         {
1792           switch (breaks[i])
1793             {
1794               case UC_BREAK_POSSIBLE:
1795                 putc ('\n', stdout);
1796                 break;
1797               case UC_BREAK_MANDATORY:
1798                 break;
1799               case UC_BREAK_PROHIBITED:
1800                 break;
1801               default:
1802                 abort ();
1803             }
1804           putc (input[i], stdout);
1805         }
1806
1807       free (breaks);
1808
1809       return 0;
1810     }
1811   else
1812     return 1;
1813 }
1814
1815 #endif /* TEST2 */