lib/linebreak.c

   1 /* linebreak.c - line breaking of Unicode strings
   2    Copyright (C) 2001-2003 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5 This program is free software; you can redistribute it and/or modify
   6 it under the terms of the GNU General Public License as published by
   7 the Free Software Foundation; either version 2, or (at your option)
   8 any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; if not, write to the Free Software
  17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include <config.h>
  21 #endif
  22
  23 /* Specification.  */
  24 #include "linebreak.h"
  25
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include "c-ctype.h"
  29
  30 #include "utf8-ucs4.h"
  31
  32 #include "utf16-ucs4.h"
  33
  34 #ifdef unused
  35 static inline int
  36 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
  37 {
  38   *puc = *s;
  39   return 1;
  40 }
  41 #endif
  42
  43
  44 /* Help GCC to generate good code for string comparisons with
  45    immediate strings. */
  46 #if defined (__GNUC__) && defined (__OPTIMIZE__)
  47
  48 static inline int
  49 streq9 (const char *s1, const char *s2)
  50 {
  51   return strcmp (s1 + 9, s2 + 9) == 0;
  52 }
  53
  54 static inline int
  55 streq8 (const char *s1, const char *s2, char s28)
  56 {
  57   if (s1[8] == s28)
  58     {
  59       if (s28 == 0)
  60         return 1;
  61       else
  62         return streq9 (s1, s2);
  63     }
  64   else
  65     return 0;
  66 }
  67
  68 static inline int
  69 streq7 (const char *s1, const char *s2, char s27, char s28)
  70 {
  71   if (s1[7] == s27)
  72     {
  73       if (s27 == 0)
  74         return 1;
  75       else
  76         return streq8 (s1, s2, s28);
  77     }
  78   else
  79     return 0;
  80 }
  81
  82 static inline int
  83 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
  84 {
  85   if (s1[6] == s26)
  86     {
  87       if (s26 == 0)
  88         return 1;
  89       else
  90         return streq7 (s1, s2, s27, s28);
  91     }
  92   else
  93     return 0;
  94 }
  95
  96 static inline int
  97 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
  98 {
  99   if (s1[5] == s25)
 100     {
 101       if (s25 == 0)
 102         return 1;
 103       else
 104         return streq6 (s1, s2, s26, s27, s28);
 105     }
 106   else
 107     return 0;
 108 }
 109
 110 static inline int
 111 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
 112 {
 113   if (s1[4] == s24)
 114     {
 115       if (s24 == 0)
 116         return 1;
 117       else
 118         return streq5 (s1, s2, s25, s26, s27, s28);
 119     }
 120   else
 121     return 0;
 122 }
 123
 124 static inline int
 125 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
 126 {
 127   if (s1[3] == s23)
 128     {
 129       if (s23 == 0)
 130         return 1;
 131       else
 132         return streq4 (s1, s2, s24, s25, s26, s27, s28);
 133     }
 134   else
 135     return 0;
 136 }
 137
 138 static inline int
 139 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 140 {
 141   if (s1[2] == s22)
 142     {
 143       if (s22 == 0)
 144         return 1;
 145       else
 146         return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
 147     }
 148   else
 149     return 0;
 150 }
 151
 152 static inline int
 153 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 154 {
 155   if (s1[1] == s21)
 156     {
 157       if (s21 == 0)
 158         return 1;
 159       else
 160         return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
 161     }
 162   else
 163     return 0;
 164 }
 165
 166 static inline int
 167 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 168 {
 169   if (s1[0] == s20)
 170     {
 171       if (s20 == 0)
 172         return 1;
 173       else
 174         return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
 175     }
 176   else
 177     return 0;
 178 }
 179
 180 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 181   streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
 182
 183 #else
 184
 185 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 186   (strcmp (s1, s2) == 0)
 187
 188 #endif
 189
 190
 191 static int
 192 is_cjk_encoding (const char *encoding)
 193 {
 194   if (0
 195       /* Legacy Japanese encodings */
 196       || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
 197       /* Legacy Chinese encodings */
 198       || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 199       || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
 200       || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
 201       || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
 202       /* Legacy Korean encodings */
 203       || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 204       || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
 205       || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
 206     return 1;
 207   return 0;
 208 }
 209
 210 static int
 211 is_utf8_encoding (const char *encoding)
 212 {
 213   if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
 214     return 1;
 215   return 0;
 216 }
 217
 218
 219 /* Determine number of column positions required for UC. */
 220 int uc_width (unsigned int uc, const char *encoding);
 221
 222 /*
 223  * Non-spacing attribute table.
 224  * Consists of:
 225  * - Non-spacing characters; generated from PropList.txt or
 226  *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 227  * - Format control characters; generated from
 228  *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 229  * - Zero width characters; generated from
 230  *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 231  */
 232 static const unsigned char nonspacing_table_data[16*64] = {
       /* Bitmap with one bit per code point, stored as 16 pages of 64 bytes;
          each page covers 512 consecutive code points.  uc_width picks the
          page through nonspacing_table_ind[uc >> 9], the byte within the
          page through (uc >> 3) & 63, and the bit through uc & 7.  A set
          bit makes uc_width report the character as a control character
          (-1) or as zero-width (0).  The range comments below give the
          code points covered by each page and by each row of 8 bytes.  */
 233   /* 0x0000-0x01ff */
 234   0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
 235   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
 236   0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
 237   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
 238   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
 239   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
 240   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
 241   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
 242   /* 0x0200-0x03ff */
 243   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
 244   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
 245   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
 246   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
 247   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
 248   0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
 249   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
 250   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
 251   /* 0x0400-0x05ff */
 252   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
 253   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
 254   0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
 255   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
 256   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
 257   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
 258   0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
 259   0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
 260   /* 0x0600-0x07ff */
 261   0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
 262   0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
 263   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
 264   0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
 265   0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
 266   0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
 267   0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
 268   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
 269   /* 0x0800-0x09ff */
 270   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
 271   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
 272   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
 273   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
 274   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
 275   0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
 276   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
 277   0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
 278   /* 0x0a00-0x0bff */
 279   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
 280   0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
 281   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
 282   0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
 283   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
 284   0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
 285   0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
 286   0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
 287   /* 0x0c00-0x0dff */
 288   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
 289   0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
 290   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
 291   0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
 292   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
 293   0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
 294   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
 295   0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
 296   /* 0x0e00-0x0fff */
 297   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
 298   0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
 299   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
 300   0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
 301   0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
 302   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
 303   0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
 304   0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
 305   /* 0x1000-0x11ff */
 306   0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
 307   0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
 308   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
 309   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
 310   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
 311   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
 312   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
 313   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
 314   /* 0x1600-0x17ff */
 315   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
 316   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
 317   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
 318   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
 319   0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
 320   0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
 321   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
 322   0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
 323   /* 0x1800-0x19ff */
 324   0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
 325   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
 326   0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
 327   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
 328   0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
 329   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
 330   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
 331   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
 332   /* 0x2000-0x21ff */
 333   0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
 334   0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
 335   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
 336   0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
 337   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
 338   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
 339   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
 340   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
 341   /* 0x3000-0x31ff */
 342   0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
 343   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
 344   0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
 345   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
 346   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
 347   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
 348   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
 349   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
 350   /* 0xfa00-0xfbff */
 351   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
 352   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
 353   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
 354   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
 355   0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
 356   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
 357   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
 358   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
 359   /* 0xfe00-0xffff */
 360   0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
 361   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
 362   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
 363   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
 364   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
 365   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
 366   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
 367   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
 368   /* 0x1d000-0x1d1ff */
 369   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
 370   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
 371   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
 372   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
 373   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
 374   0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
 375   0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
 376   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
 377 };
 378 static const signed char nonspacing_table_ind[240] = {
       /* Page directory for nonspacing_table_data, indexed by uc >> 9 (one
          entry per 512 code points, 8 entries per row below).  An entry is
          the number of the 64-byte page holding the bitmap for that range,
          or -1 when the range has no page; uc_width then skips the
          non-spacing test.  The 240 entries cover code points up to
          0x1dfff.  */
 379    0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
 380    8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
 381   11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
 382   12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
 383   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
 384   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
 385   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
 386   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
 387   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
 388   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
 389   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
 390   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
 391   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
 392   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
 393   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
 394   -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
 395   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
 396   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
 397   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
 398   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
 399   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
 400   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
 401   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
 402   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
 403   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
 404   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
 405   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
 406   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
 407   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
 408   15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
 409 };
 410
 411 /* Determine number of column positions required for UC. */
 412 int
 413 uc_width (unsigned int uc, const char *encoding)
 414 {
 415   /* Test for non-spacing or control character.  */
 416   if ((uc >> 9) < 240)
 417     {
 418       int ind = nonspacing_table_ind[uc >> 9];
 419       if (ind >= 0)
 420         if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
 421           {
 422             if (uc > 0 && uc < 0xa0)
 423               return -1;
 424             else
 425               return 0;
 426           }
 427     }
 428   else if ((uc >> 9) == (0xe0000 >> 9))
 429     {
 430       if (uc < 0xe0100
 431           ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
 432           : (uc <= 0xe01ef))
 433         return 0;
 434     }
 435   /* Test for double-width character.
 436    * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
 437    * and            "grep '^....;[^WF]' EastAsianWidth.txt"
 438    */
 439   if (uc >= 0x1100
 440       && ((uc < 0x1160) /* Hangul Jamo */
 441           || (uc >= 0x2e80 && uc < 0x4dc0  /* CJK */
 442               && !(uc == 0x303f))
 443           || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
 444           || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
 445           || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
 446           || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
 447           || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
 448           || (uc >= 0xffe0 && uc < 0xffe7)
 449           || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
 450           || (uc >= 0x30000 && uc <= 0x3fffd)
 451      )   )
 452     return 2;
 453   /* In ancient CJK encodings, Cyrillic and most other characters are
 454      double-width as well.  */
 455   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
 456       && is_cjk_encoding (encoding))
 457     return 2;
 458   return 1;
 459 }
 460
 461
 462 /* Determine number of column positions required for first N units
 463    (or fewer if S ends before this) in S.  */
 464
 465 int
 466 u8_width (const unsigned char *s, size_t n, const char *encoding)
 467 {
 468   const unsigned char *s_end = s + n;
 469   int width = 0;
 470
 471   while (s < s_end)
 472     {
 473       unsigned int uc;
 474       int w;
 475
 476       s += u8_mbtouc (&uc, s, s_end - s);
 477
 478       if (uc == 0)
 479         break; /* end of string reached */
 480
 481       w = uc_width (uc, encoding);
 482       if (w >= 0) /* ignore control characters in the string */
 483         width += w;
 484     }
 485
 486   return width;
 487 }
 488
 489 int
 490 u16_width (const unsigned short *s, size_t n, const char *encoding)
 491 {
 492   const unsigned short *s_end = s + n;
 493   int width = 0;
 494
 495   while (s < s_end)
 496     {
 497       unsigned int uc;
 498       int w;
 499
 500       s += u16_mbtouc (&uc, s, s_end - s);
 501
 502       if (uc == 0)
 503         break; /* end of string reached */
 504
 505       w = uc_width (uc, encoding);
 506       if (w >= 0) /* ignore control characters in the string */
 507         width += w;
 508     }
 509
 510   return width;
 511 }
 512
 513 int
 514 u32_width (const unsigned int *s, size_t n, const char *encoding)
 515 {
 516   const unsigned int *s_end = s + n;
 517   int width = 0;
 518
 519   while (s < s_end)
 520     {
 521       unsigned int uc = *s++;
 522       int w;
 523
 524       if (uc == 0)
 525         break; /* end of string reached */
 526
 527       w = uc_width (uc, encoding);
 528       if (w >= 0) /* ignore control characters in the string */
 529         width += w;
 530     }
 531
 532   return width;
 533 }
 534
 535
 536 /* Determine the line break points in S, and store the result at p[0..n-1].  */
 537 /* We don't support line breaking of complex-context dependent characters
 538    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
 539
 540 /* Line breaking classification.  */
 541
 542 enum
 543 {
 544   /* Values >= 20 are resolved at run time. */
 545   LBP_BK =  0, /* mandatory break */
 546 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 547 /*LBP_LF,         line feed - not used here because it's a DOSism */
 548   LBP_CM = 20, /* attached characters and combining marks */
 549 /*LBP_SG,         surrogates - not used here because they are not characters */
 550   LBP_ZW =  1, /* zero width space */
 551   LBP_IN =  2, /* inseparable */
 552   LBP_GL =  3, /* non-breaking (glue) */
 553   LBP_CB = 22, /* contingent break opportunity */
 554   LBP_SP = 21, /* space */
 555   LBP_BA =  4, /* break opportunity after */
 556   LBP_BB =  5, /* break opportunity before */
 557   LBP_B2 =  6, /* break opportunity before and after */
 558   LBP_HY =  7, /* hyphen */
 559   LBP_NS =  8, /* non starter */
 560   LBP_OP =  9, /* opening punctuation */
 561   LBP_CL = 10, /* closing punctuation */
 562   LBP_QU = 11, /* ambiguous quotation */
 563   LBP_EX = 12, /* exclamation/interrogation */
 564   LBP_ID = 13, /* ideographic */
 565   LBP_NU = 14, /* numeric */
 566   LBP_IS = 15, /* infix separator (numeric) */
 567   LBP_SY = 16, /* symbols allowing breaks */
 568   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
 569   LBP_PR = 18, /* prefix (numeric) */
 570   LBP_PO = 19, /* postfix (numeric) */
 571   LBP_SA = 23, /* complex context (South East Asian) */
 572   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
 573   LBP_XX = 25  /* unknown */
 574 };
 575
 576 #include "lbrkprop.h"
 577
 578 static inline unsigned char
 579 lbrkprop_lookup (unsigned int uc)
 580 {
 581   unsigned int index1 = uc >> lbrkprop_header_0;
 582   if (index1 < lbrkprop_header_1)
 583     {
 584       int lookup1 = lbrkprop.level1[index1];
 585       if (lookup1 >= 0)
 586         {
 587           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
 588           int lookup2 = lbrkprop.level2[lookup1 + index2];
 589           if (lookup2 >= 0)
 590             {
 591               unsigned int index3 = uc & lbrkprop_header_4;
 592               return lbrkprop.level3[lookup2 + index3];
 593             }
 594         }
 595     }
 596   return LBP_XX;
 597 }
 598
 599 /* Table indexed by two line breaking classifications.  */
 600 #define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
 601 #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
 602 #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
     /* lbrk_table[before - 1][after - 1] is the action for the position
        between a character of class BEFORE and the next non-space character
        of class AFTER, for the classes LBP_ZW (1) through LBP_PO (19).
        u8_possible_linebreaks turns D into a possible break, P into a
        prohibited break, and I into a possible break only when a space was
        seen between the two characters.  */
 603 static const unsigned char lbrk_table[19][19] = {
 604                                 /* after */
 605         /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
 606 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
 607 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 608 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 609 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 610 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 611 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 612 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 613 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 614 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
 615 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
 616 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
 617 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 618 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
 619 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
 620 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 621 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 622 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
 623 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
 624 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 625 /* "" */
 626 /* before */
 627 };
 628 /* Note: The (B2,B2) entry should probably be D instead of P.  */
 629 /* Note: The (PR,ID) entry should probably be D instead of I.  */
 630
 631 void
 632 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
 633 {
 634   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 635   const unsigned char *s_end = s + n;
 636   int last_prop = LBP_BK; /* line break property of last non-space character */
 637   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 638   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 639
 640   /* Don't break inside multibyte characters.  */
 641   memset (p, UC_BREAK_PROHIBITED, n);
 642
 643   while (s < s_end)
 644     {
 645       unsigned int uc;
 646       int count = u8_mbtouc (&uc, s, s_end - s);
 647       int prop = lbrkprop_lookup (uc);
 648
 649       if (prop == LBP_BK)
 650         {
 651           /* Mandatory break.  */
 652           *p = UC_BREAK_MANDATORY;
 653           last_prop = LBP_BK;
 654           seen_space = NULL;
 655           seen_space2 = NULL;
 656         }
 657       else
 658         {
 659           char *q;
 660
 661           /* Resolve property values whose behaviour is not fixed.  */
 662           switch (prop)
 663             {
 664               case LBP_AI:
 665                 /* Resolve ambiguous.  */
 666                 prop = LBP_AI_REPLACEMENT;
 667                 break;
 668               case LBP_CB:
 669                 /* This is arbitrary.  */
 670                 prop = LBP_ID;
 671                 break;
 672               case LBP_SA:
 673                 /* We don't handle complex scripts yet.
 674                    Treat LBP_SA like LBP_XX.  */
 675               case LBP_XX:
 676                 /* This is arbitrary.  */
 677                 prop = LBP_AL;
 678                 break;
 679             }
 680
 681           /* Deal with combining characters.  */
 682           q = p;
 683           if (prop == LBP_CM)
 684             {
 685               /* Don't break just before a combining character.  */
 686               *p = UC_BREAK_PROHIBITED;
 687               /* A combining character turns a preceding space into LBP_AL.  */
 688               if (seen_space != NULL)
 689                 {
 690                   q = seen_space;
 691                   seen_space = seen_space2;
 692                   prop = LBP_AL;
 693                   goto lookup_via_table;
 694                 }
 695             }
 696           else if (prop == LBP_SP)
 697             {
 698               /* Don't break just before a space.  */
 699               *p = UC_BREAK_PROHIBITED;
 700               seen_space2 = seen_space;
 701               seen_space = p;
 702             }
 703           else
 704             {
 705              lookup_via_table:
 706               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 707               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 708                 abort ();
 709
 710               if (last_prop == LBP_BK)
 711                 {
 712                   /* Don't break at the beginning of a line.  */
 713                   *q = UC_BREAK_PROHIBITED;
 714                 }
 715               else
 716                 {
 717                   switch (lbrk_table [last_prop-1] [prop-1])
 718                     {
 719                       case D:
 720                         *q = UC_BREAK_POSSIBLE;
 721                         break;
 722                       case I:
 723                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 724                         break;
 725                       case P:
 726                         *q = UC_BREAK_PROHIBITED;
 727                         break;
 728                       default:
 729                         abort ();
 730                     }
 731                 }
 732               last_prop = prop;
 733               seen_space = NULL;
 734               seen_space2 = NULL;
 735             }
 736         }
 737
 738       s += count;
 739       p += count;
 740     }
 741 }
 742
 743 void
 744 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
 745 {
 746   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 747   const unsigned short *s_end = s + n;
 748   int last_prop = LBP_BK; /* line break property of last non-space character */
 749   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 750   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 751
 752   /* Don't break inside multibyte characters.  */
 753   memset (p, UC_BREAK_PROHIBITED, n);
 754
 755   while (s < s_end)
 756     {
 757       unsigned int uc;
 758       int count = u16_mbtouc (&uc, s, s_end - s);
 759       int prop = lbrkprop_lookup (uc);
 760
 761       if (prop == LBP_BK)
 762         {
 763           /* Mandatory break.  */
 764           *p = UC_BREAK_MANDATORY;
 765           last_prop = LBP_BK;
 766           seen_space = NULL;
 767           seen_space2 = NULL;
 768         }
 769       else
 770         {
 771           char *q;
 772
 773           /* Resolve property values whose behaviour is not fixed.  */
 774           switch (prop)
 775             {
 776               case LBP_AI:
 777                 /* Resolve ambiguous.  */
 778                 prop = LBP_AI_REPLACEMENT;
 779                 break;
 780               case LBP_CB:
 781                 /* This is arbitrary.  */
 782                 prop = LBP_ID;
 783                 break;
 784               case LBP_SA:
 785                 /* We don't handle complex scripts yet.
 786                    Treat LBP_SA like LBP_XX.  */
 787               case LBP_XX:
 788                 /* This is arbitrary.  */
 789                 prop = LBP_AL;
 790                 break;
 791             }
 792
 793           /* Deal with combining characters.  */
 794           q = p;
 795           if (prop == LBP_CM)
 796             {
 797               /* Don't break just before a combining character.  */
 798               *p = UC_BREAK_PROHIBITED;
 799               /* A combining character turns a preceding space into LBP_AL.  */
 800               if (seen_space != NULL)
 801                 {
 802                   q = seen_space;
 803                   seen_space = seen_space2;
 804                   prop = LBP_AL;
 805                   goto lookup_via_table;
 806                 }
 807             }
 808           else if (prop == LBP_SP)
 809             {
 810               /* Don't break just before a space.  */
 811               *p = UC_BREAK_PROHIBITED;
 812               seen_space2 = seen_space;
 813               seen_space = p;
 814             }
 815           else
 816             {
 817              lookup_via_table:
 818               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 819               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 820                 abort ();
 821
 822               if (last_prop == LBP_BK)
 823                 {
 824                   /* Don't break at the beginning of a line.  */
 825                   *q = UC_BREAK_PROHIBITED;
 826                 }
 827               else
 828                 {
 829                   switch (lbrk_table [last_prop-1] [prop-1])
 830                     {
 831                       case D:
 832                         *q = UC_BREAK_POSSIBLE;
 833                         break;
 834                       case I:
 835                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 836                         break;
 837                       case P:
 838                         *q = UC_BREAK_PROHIBITED;
 839                         break;
 840                       default:
 841                         abort ();
 842                     }
 843                 }
 844               last_prop = prop;
 845               seen_space = NULL;
 846               seen_space2 = NULL;
 847             }
 848         }
 849
 850       s += count;
 851       p += count;
 852     }
 853 }
 854
 855 void
 856 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
 857 {
 858   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 859   const unsigned int *s_end = s + n;
 860   int last_prop = LBP_BK; /* line break property of last non-space character */
 861   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 862   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 863
 864   while (s < s_end)
 865     {
 866       unsigned int uc = *s;
 867       int prop = lbrkprop_lookup (uc);
 868
 869       if (prop == LBP_BK)
 870         {
 871           /* Mandatory break.  */
 872           *p = UC_BREAK_MANDATORY;
 873           last_prop = LBP_BK;
 874           seen_space = NULL;
 875           seen_space2 = NULL;
 876         }
 877       else
 878         {
 879           char *q;
 880
 881           /* Resolve property values whose behaviour is not fixed.  */
 882           switch (prop)
 883             {
 884               case LBP_AI:
 885                 /* Resolve ambiguous.  */
 886                 prop = LBP_AI_REPLACEMENT;
 887                 break;
 888               case LBP_CB:
 889                 /* This is arbitrary.  */
 890                 prop = LBP_ID;
 891                 break;
 892               case LBP_SA:
 893                 /* We don't handle complex scripts yet.
 894                    Treat LBP_SA like LBP_XX.  */
 895               case LBP_XX:
 896                 /* This is arbitrary.  */
 897                 prop = LBP_AL;
 898                 break;
 899             }
 900
 901           /* Deal with combining characters.  */
 902           q = p;
 903           if (prop == LBP_CM)
 904             {
 905               /* Don't break just before a combining character.  */
 906               *p = UC_BREAK_PROHIBITED;
 907               /* A combining character turns a preceding space into LBP_AL.  */
 908               if (seen_space != NULL)
 909                 {
 910                   q = seen_space;
 911                   seen_space = seen_space2;
 912                   prop = LBP_AL;
 913                   goto lookup_via_table;
 914                 }
 915             }
 916           else if (prop == LBP_SP)
 917             {
 918               /* Don't break just before a space.  */
 919               *p = UC_BREAK_PROHIBITED;
 920               seen_space2 = seen_space;
 921               seen_space = p;
 922             }
 923           else
 924             {
 925              lookup_via_table:
 926               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 927               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 928                 abort ();
 929
 930               if (last_prop == LBP_BK)
 931                 {
 932                   /* Don't break at the beginning of a line.  */
 933                   *q = UC_BREAK_PROHIBITED;
 934                 }
 935               else
 936                 {
 937                   switch (lbrk_table [last_prop-1] [prop-1])
 938                     {
 939                       case D:
 940                         *q = UC_BREAK_POSSIBLE;
 941                         break;
 942                       case I:
 943                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 944                         break;
 945                       case P:
 946                         *q = UC_BREAK_PROHIBITED;
 947                         break;
 948                       default:
 949                         abort ();
 950                     }
 951                 }
 952               last_prop = prop;
 953               seen_space = NULL;
 954               seen_space2 = NULL;
 955             }
 956         }
 957
 958       s++;
 959       p++;
 960     }
 961 }
 962
 963
 964 /* Choose the best line breaks, assuming the uc_width function.
 965    Return the column after the end of the string.  */
 966
 967 int
 968 u8_width_linebreaks (const unsigned char *s, size_t n,
 969                      int width, int start_column, int at_end_columns,
 970                      const char *o, const char *encoding,
 971                      char *p)
 972 {
 973   const unsigned char *s_end;
 974   char *last_p;
 975   int last_column;
 976   int piece_width;
 977
 978   u8_possible_linebreaks (s, n, encoding, p);
 979
 980   s_end = s + n;
 981   last_p = NULL;
 982   last_column = start_column;
 983   piece_width = 0;
 984   while (s < s_end)
 985     {
 986       unsigned int uc;
 987       int count = u8_mbtouc (&uc, s, s_end - s);
 988
 989       /* Respect the override.  */
 990       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 991         *p = *o;
 992
 993       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 994         {
 995           /* An atomic piece of text ends here.  */
 996           if (last_p != NULL && last_column + piece_width > width)
 997             {
 998               /* Insert a line break.  */
 999               *last_p = UC_BREAK_POSSIBLE;
1000               last_column = 0;
1001             }
1002         }
1003
1004       if (*p == UC_BREAK_MANDATORY)
1005         {
1006           /* uc is a line break character.  */
1007           /* Start a new piece at column 0.  */
1008           last_p = NULL;
1009           last_column = 0;
1010           piece_width = 0;
1011         }
1012       else
1013         {
1014           /* uc is not a line break character.  */
1015           int w;
1016
1017           if (*p == UC_BREAK_POSSIBLE)
1018             {
1019               /* Start a new piece.  */
1020               last_p = p;
1021               last_column += piece_width;
1022               piece_width = 0;
1023               /* No line break for the moment, may be turned into
1024                  UC_BREAK_POSSIBLE later, via last_p. */
1025             }
1026
1027           *p = UC_BREAK_PROHIBITED;
1028
1029           w = uc_width (uc, encoding);
1030           if (w >= 0) /* ignore control characters in the string */
1031             piece_width += w;
1032          }
1033
1034       s += count;
1035       p += count;
1036       if (o != NULL)
1037         o += count;
1038     }
1039
1040   /* The last atomic piece of text ends here.  */
1041   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1042     {
1043       /* Insert a line break.  */
1044       *last_p = UC_BREAK_POSSIBLE;
1045       last_column = 0;
1046     }
1047
1048   return last_column + piece_width;
1049 }
1050
1051 int
1052 u16_width_linebreaks (const unsigned short *s, size_t n,
1053                       int width, int start_column, int at_end_columns,
1054                       const char *o, const char *encoding,
1055                       char *p)
1056 {
1057   const unsigned short *s_end;
1058   char *last_p;
1059   int last_column;
1060   int piece_width;
1061
1062   u16_possible_linebreaks (s, n, encoding, p);
1063
1064   s_end = s + n;
1065   last_p = NULL;
1066   last_column = start_column;
1067   piece_width = 0;
1068   while (s < s_end)
1069     {
1070       unsigned int uc;
1071       int count = u16_mbtouc (&uc, s, s_end - s);
1072
1073       /* Respect the override.  */
1074       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1075         *p = *o;
1076
1077       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1078         {
1079           /* An atomic piece of text ends here.  */
1080           if (last_p != NULL && last_column + piece_width > width)
1081             {
1082               /* Insert a line break.  */
1083               *last_p = UC_BREAK_POSSIBLE;
1084               last_column = 0;
1085             }
1086         }
1087
1088       if (*p == UC_BREAK_MANDATORY)
1089         {
1090           /* uc is a line break character.  */
1091           /* Start a new piece at column 0.  */
1092           last_p = NULL;
1093           last_column = 0;
1094           piece_width = 0;
1095         }
1096       else
1097         {
1098           /* uc is not a line break character.  */
1099           int w;
1100
1101           if (*p == UC_BREAK_POSSIBLE)
1102             {
1103               /* Start a new piece.  */
1104               last_p = p;
1105               last_column += piece_width;
1106               piece_width = 0;
1107               /* No line break for the moment, may be turned into
1108                  UC_BREAK_POSSIBLE later, via last_p. */
1109             }
1110
1111           *p = UC_BREAK_PROHIBITED;
1112
1113           w = uc_width (uc, encoding);
1114           if (w >= 0) /* ignore control characters in the string */
1115             piece_width += w;
1116          }
1117
1118       s += count;
1119       p += count;
1120       if (o != NULL)
1121         o += count;
1122     }
1123
1124   /* The last atomic piece of text ends here.  */
1125   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1126     {
1127       /* Insert a line break.  */
1128       *last_p = UC_BREAK_POSSIBLE;
1129       last_column = 0;
1130     }
1131
1132   return last_column + piece_width;
1133 }
1134
1135 int
1136 u32_width_linebreaks (const unsigned int *s, size_t n,
1137                       int width, int start_column, int at_end_columns,
1138                       const char *o, const char *encoding,
1139                       char *p)
1140 {
1141   const unsigned int *s_end;
1142   char *last_p;
1143   int last_column;
1144   int piece_width;
1145
1146   u32_possible_linebreaks (s, n, encoding, p);
1147
1148   s_end = s + n;
1149   last_p = NULL;
1150   last_column = start_column;
1151   piece_width = 0;
1152   while (s < s_end)
1153     {
1154       unsigned int uc = *s;
1155
1156       /* Respect the override.  */
1157       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1158         *p = *o;
1159
1160       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1161         {
1162           /* An atomic piece of text ends here.  */
1163           if (last_p != NULL && last_column + piece_width > width)
1164             {
1165               /* Insert a line break.  */
1166               *last_p = UC_BREAK_POSSIBLE;
1167               last_column = 0;
1168             }
1169         }
1170
1171       if (*p == UC_BREAK_MANDATORY)
1172         {
1173           /* uc is a line break character.  */
1174           /* Start a new piece at column 0.  */
1175           last_p = NULL;
1176           last_column = 0;
1177           piece_width = 0;
1178         }
1179       else
1180         {
1181           /* uc is not a line break character.  */
1182           int w;
1183
1184           if (*p == UC_BREAK_POSSIBLE)
1185             {
1186               /* Start a new piece.  */
1187               last_p = p;
1188               last_column += piece_width;
1189               piece_width = 0;
1190               /* No line break for the moment, may be turned into
1191                  UC_BREAK_POSSIBLE later, via last_p. */
1192             }
1193
1194           *p = UC_BREAK_PROHIBITED;
1195
1196           w = uc_width (uc, encoding);
1197           if (w >= 0) /* ignore control characters in the string */
1198             piece_width += w;
1199          }
1200
1201       s++;
1202       p++;
1203       if (o != NULL)
1204         o++;
1205     }
1206
1207   /* The last atomic piece of text ends here.  */
1208   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1209     {
1210       /* Insert a line break.  */
1211       *last_p = UC_BREAK_POSSIBLE;
1212       last_column = 0;
1213     }
1214
1215   return last_column + piece_width;
1216 }
1217
1218
1219 #ifdef TEST1
1220
1221 #include <stdio.h>
1222
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with malloc/realloc and owned by the
   caller.  On a read error or when memory runs out, a message is printed
   to stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;     /* bytes allocated for buf */
  size_t size = 0;      /* bytes of buf filled so far */
  char *result;

  for (;;)
    {
      size_t count;

      /* Make room for one more chunk, growing by at least 50%.  */
      if (size + BUFSIZE > alloc)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < size + BUFSIZE)
            new_alloc = size + BUFSIZE;
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* A short count means end of file or an error; tell them apart
         right away, while errno still belongs to the failed read.  */
      if (count < BUFSIZE)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          if (feof (stream))
            break;
        }
    }

  /* Trim to the exact size, plus room for the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
#undef BUFSIZE
}
1270
1271 int
1272 main (int argc, char * argv[])
1273 {
1274   if (argc == 1)
1275     {
1276       /* Display all the break opportunities in the input string.  */
1277       char *input = read_file (stdin);
1278       int length = strlen (input);
1279       char *breaks = malloc (length);
1280       int i;
1281
1282       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1283
1284       for (i = 0; i < length; i++)
1285         {
1286           switch (breaks[i])
1287             {
1288               case UC_BREAK_POSSIBLE:
1289                 /* U+2027 in UTF-8 encoding */
1290                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1291                 break;
1292               case UC_BREAK_MANDATORY:
1293                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1294                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1295                 break;
1296               case UC_BREAK_PROHIBITED:
1297                 break;
1298               default:
1299                 abort ();
1300             }
1301           putc (input[i], stdout);
1302         }
1303
1304       free (breaks);
1305
1306       return 0;
1307     }
1308   else if (argc == 2)
1309     {
1310       /* Insert line breaks for a given width.  */
1311       int width = atoi (argv[1]);
1312       char *input = read_file (stdin);
1313       int length = strlen (input);
1314       char *breaks = malloc (length);
1315       int i;
1316
1317       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1318
1319       for (i = 0; i < length; i++)
1320         {
1321           switch (breaks[i])
1322             {
1323               case UC_BREAK_POSSIBLE:
1324                 putc ('\n', stdout);
1325                 break;
1326               case UC_BREAK_MANDATORY:
1327                 break;
1328               case UC_BREAK_PROHIBITED:
1329                 break;
1330               default:
1331                 abort ();
1332             }
1333           putc (input[i], stdout);
1334         }
1335
1336       free (breaks);
1337
1338       return 0;
1339     }
1340   else
1341     return 1;
1342 }
1343
1344 #endif /* TEST1 */
1345
1346
1347 /* Now the same thing with an arbitrary encoding.
1348
1349    We convert the input string to Unicode.
1350
1351    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1352    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
1353    \U0000FFFF.  UTF-16 and variants support only characters up to
1354    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
1355    UCS-4 specification leaves doubts about endianness and byte order mark.
1356    glibc currently interprets it as big endian without byte order mark,
1357    but this is not backed by an RFC.  So we use UTF-8. It supports
1358    characters up to \U7FFFFFFF and is unambiguously defined.  */
1359
1360 #if HAVE_ICONV
1361
1362 #include <iconv.h>
1363 #include <errno.h>
1364
1365 /* Luckily, the encoding's name is platform independent.  */
1366 #define UTF8_NAME "UTF-8"
1367
1368 /* Return the length of a string after conversion through an iconv_t.  */
1369 static size_t
1370 iconv_string_length (iconv_t cd, const char *s, size_t n)
1371 {
1372 #define TMPBUFSIZE 4096
1373   size_t count = 0;
1374   char tmpbuf[TMPBUFSIZE];
1375   const char *inptr = s;
1376   size_t insize = n;
1377   while (insize > 0)
1378     {
1379       char *outptr = tmpbuf;
1380       size_t outsize = TMPBUFSIZE;
1381       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1382       if (res == (size_t)(-1) && errno != E2BIG)
1383         return (size_t)(-1);
1384       count += outptr - tmpbuf;
1385     }
1386   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
1387 #if defined _LIBICONV_VERSION \
1388     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1389   {
1390     char *outptr = tmpbuf;
1391     size_t outsize = TMPBUFSIZE;
1392     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1393     if (res == (size_t)(-1))
1394       return (size_t)(-1);
1395     count += outptr - tmpbuf;
1396   }
1397   /* Return to the initial state.  */
1398   iconv (cd, NULL, NULL, NULL, NULL);
1399 #endif
1400   return count;
1401 #undef TMPBUFSIZE
1402 }
1403
/* Convert the N bytes at S through CD into the buffer T, recording for each
   conversion step where its output starts.
   OFFTABLE (N elements) receives, for every input offset at which a
   conversion step begins, the offset in T that the output had reached at
   that moment; all other elements are set to (size_t)(-1).
   M is the expected total output size.  Any conversion error, and any
   output size different from M, leads to abort ().  */
static void
iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
                              size_t *offtable, char *t, size_t m)
{
  size_t i;
  const char *s_end;
  const char *inptr;
  char *outptr;
  size_t outsize;
  /* Avoid glibc-2.1 bug.  */
  /* On that platform the output buffer is announced one byte larger than M.
     NOTE(review): this presumes T has room for M + 1 bytes there - confirm
     against the callers.  */
#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
  const size_t extra = 1;
#else
  const size_t extra = 0;
#endif

  /* Mark every input offset as "no conversion step starts here".  */
  for (i = 0; i < n; i++)
    offtable[i] = (size_t)(-1);

  s_end = s + n;
  inptr = s;
  outptr = t;
  outsize = m + extra;
  while (inptr < s_end)
    {
      const char *saved_inptr;
      size_t insize;
      size_t res;

      /* Output produced from here on belongs to the input at INPTR.  */
      offtable[inptr - s] = outptr - t;

      saved_inptr = inptr;
      res = (size_t)(-1);
      /* Offer 1, 2, 3, ... input bytes until iconv no longer reports an
         incomplete sequence (EINVAL).  iconv receives INSIZE by address;
         the loop relies on it being left untouched when nothing was
         consumed, so that insize++ widens the window by one byte.  */
      for (insize = 1; inptr + insize <= s_end; insize++)
        {
          res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
          if (!(res == (size_t)(-1) && errno == EINVAL))
            break;
          /* We expect that no input bytes have been consumed so far.  */
          if (inptr != saved_inptr)
            abort ();
        }
      /* After we verified the convertibility and computed the translation's
         size m, there shouldn't be any conversion error here. */
      /* RES is also still (size_t)(-1) when the input ended in the middle
         of a sequence, i.e. when the loop ran out of bytes to offer.  */
      if (res == (size_t)(-1))
        abort ();
    }
  /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
  /* Flush the conversion state; its output counts towards M as well.  */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
    abort ();
#endif
  /* We should have produced exactly m output bytes.  */
  if (outsize != extra)
    abort ();
}
1461
1462 #endif /* HAVE_ICONV */
1463
1464 #if C_CTYPE_ASCII
1465
/* Tests whether the N bytes at S consist only of bytes that c_isprint or
   c_isspace accept.  Returns 1 if yes, 0 as soon as another byte is found;
   the latter happens for strings in an 8-bit encoding or an ISO-2022
   encoding.  */
static int
is_all_ascii (const char *s, size_t n)
{
  const char *const end = s + n;

  while (s < end)
    {
      const unsigned char c = (unsigned char) *s++;

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
1480
1481 #endif /* C_CTYPE_ASCII */
1482
1483 void
1484 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1485                          char *p)
1486 {
1487   if (n == 0)
1488     return;
1489   if (is_utf8_encoding (encoding))
1490     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1491   else
1492     {
1493 #if HAVE_ICONV
1494       iconv_t to_utf8;
1495       /* Avoid glibc-2.1 bug with EUC-KR.  */
1496 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1497       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1498         to_utf8 = (iconv_t)(-1);
1499       else
1500 # endif
1501       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1502          GB18030.  */
1503 # if defined __sun && !defined _LIBICONV_VERSION
1504       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1505           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1506           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1507           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1508           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1509           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1510         to_utf8 = (iconv_t)(-1);
1511       else
1512 # endif
1513       to_utf8 = iconv_open (UTF8_NAME, encoding);
1514       if (to_utf8 != (iconv_t)(-1))
1515         {
1516           /* Determine the length of the resulting UTF-8 string.  */
1517           size_t m = iconv_string_length (to_utf8, s, n);
1518           if (m != (size_t)(-1))
1519             {
1520               /* Convert the string to UTF-8 and build a translation table
1521                  from offsets into s to offsets into the translated string.  */
1522               char *memory = malloc (n * sizeof (size_t) + m + m);
1523               if (memory != NULL)
1524                 {
1525                   size_t *offtable = (size_t *) memory;
1526                   char *t = (char *) (offtable + n);
1527                   char *q = (char *) (t + m);
1528                   size_t i;
1529
1530                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1531
1532                   /* Determine the possible line breaks of the UTF-8 string.  */
1533                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1534
1535                   /* Translate the result back to the original string.  */
1536                   memset (p, UC_BREAK_PROHIBITED, n);
1537                   for (i = 0; i < n; i++)
1538                     if (offtable[i] != (size_t)(-1))
1539                       p[i] = q[offtable[i]];
1540
1541                   free (memory);
1542                   iconv_close (to_utf8);
1543                   return;
1544                 }
1545             }
1546           iconv_close (to_utf8);
1547         }
1548 #endif
1549       /* Impossible to convert.  */
1550 #if C_CTYPE_ASCII
1551       if (is_all_ascii (s, n))
1552         {
1553           /* ASCII is a subset of UTF-8.  */
1554           u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1555           return;
1556         }
1557 #endif
1558       /* We have a non-ASCII string and cannot convert it.
1559          Don't produce line breaks except those already present in the
1560          input string.  All we assume here is that the encoding is
1561          minimally ASCII compatible.  */
1562       {
1563         const char *s_end = s + n;
1564         while (s < s_end)
1565           {
1566             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1567             s++;
1568             p++;
1569           }
1570       }
1571     }
1572 }
1573
1574 int
1575 mbs_width_linebreaks (const char *s, size_t n,
1576                       int width, int start_column, int at_end_columns,
1577                       const char *o, const char *encoding,
1578                       char *p)
1579 {
1580   if (n == 0)
1581     return start_column;
1582   if (is_utf8_encoding (encoding))
1583     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1584   else
1585     {
1586 #if HAVE_ICONV
1587       iconv_t to_utf8;
1588       /* Avoid glibc-2.1 bug with EUC-KR.  */
1589 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1590       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1591         to_utf8 = (iconv_t)(-1);
1592       else
1593 # endif
1594       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1595          GB18030.  */
1596 # if defined __sun && !defined _LIBICONV_VERSION
1597       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1598           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1599           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1600           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1601           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1602           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1603         to_utf8 = (iconv_t)(-1);
1604       else
1605 # endif
1606       to_utf8 = iconv_open (UTF8_NAME, encoding);
1607       if (to_utf8 != (iconv_t)(-1))
1608         {
1609           /* Determine the length of the resulting UTF-8 string.  */
1610           size_t m = iconv_string_length (to_utf8, s, n);
1611           if (m != (size_t)(-1))
1612             {
1613               /* Convert the string to UTF-8 and build a translation table
1614                  from offsets into s to offsets into the translated string.  */
1615               char *memory = malloc (n * sizeof (size_t) + m + m + (o != NULL ? m : 0));
1616               if (memory != NULL)
1617                 {
1618                   size_t *offtable = (size_t *) memory;
1619                   char *t = (char *) (offtable + n);
1620                   char *q = (char *) (t + m);
1621                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1622                   int res_column;
1623                   size_t i;
1624
1625                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1626
1627                   /* Translate the overrides to the UTF-8 string.  */
1628                   if (o != NULL)
1629                     {
1630                       memset (o8, UC_BREAK_UNDEFINED, m);
1631                       for (i = 0; i < n; i++)
1632                         if (offtable[i] != (size_t)(-1))
1633                           o8[offtable[i]] = o[i];
1634                     }
1635
1636                   /* Determine the line breaks of the UTF-8 string.  */
1637                   res_column =
1638                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1639
1640                   /* Translate the result back to the original string.  */
1641                   memset (p, UC_BREAK_PROHIBITED, n);
1642                   for (i = 0; i < n; i++)
1643                     if (offtable[i] != (size_t)(-1))
1644                       p[i] = q[offtable[i]];
1645
1646                   free (memory);
1647                   iconv_close (to_utf8);
1648                   return res_column;
1649                 }
1650             }
1651           iconv_close (to_utf8);
1652         }
1653 #endif
1654       /* Impossible to convert.  */
1655 #if C_CTYPE_ASCII
1656       if (is_all_ascii (s, n))
1657         {
1658           /* ASCII is a subset of UTF-8.  */
1659           return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1660         }
1661 #endif
1662       /* We have a non-ASCII string and cannot convert it.
1663          Don't produce line breaks except those already present in the
1664          input string.  All we assume here is that the encoding is
1665          minimally ASCII compatible.  */
1666       {
1667         const char *s_end = s + n;
1668         while (s < s_end)
1669           {
1670             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1671                   ? UC_BREAK_MANDATORY
1672                   : UC_BREAK_PROHIBITED);
1673             s++;
1674             p++;
1675             if (o != NULL)
1676               o++;
1677           }
1678         /* We cannot compute widths in this case.  */
1679         return start_column;
1680       }
1681     }
1682 }
1683
1684
1685 #ifdef TEST2
1686
1687 #include <stdio.h>
1688 #include <locale.h>
1689
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with malloc/realloc and owned by the
   caller.  On a read error or when memory runs out, a message is printed
   to stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;     /* bytes allocated for buf */
  size_t size = 0;      /* bytes of buf filled so far */
  char *result;

  for (;;)
    {
      size_t count;

      /* Make room for one more chunk, growing by at least 50%.  */
      if (size + BUFSIZE > alloc)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < size + BUFSIZE)
            new_alloc = size + BUFSIZE;
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* A short count means end of file or an error; tell them apart
         right away, while errno still belongs to the failed read.  */
      if (count < BUFSIZE)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          if (feof (stream))
            break;
        }
    }

  /* Trim to the exact size, plus room for the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
#undef BUFSIZE
}
1737
1738 int
1739 main (int argc, char * argv[])
1740 {
1741   setlocale (LC_CTYPE, "");
1742   if (argc == 1)
1743     {
1744       /* Display all the break opportunities in the input string.  */
1745       char *input = read_file (stdin);
1746       int length = strlen (input);
1747       char *breaks = malloc (length);
1748       int i;
1749
1750       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1751
1752       for (i = 0; i < length; i++)
1753         {
1754           switch (breaks[i])
1755             {
1756               case UC_BREAK_POSSIBLE:
1757                 putc ('|', stdout);
1758                 break;
1759               case UC_BREAK_MANDATORY:
1760                 break;
1761               case UC_BREAK_PROHIBITED:
1762                 break;
1763               default:
1764                 abort ();
1765             }
1766           putc (input[i], stdout);
1767         }
1768
1769       free (breaks);
1770
1771       return 0;
1772     }
1773   else if (argc == 2)
1774     {
1775       /* Insert line breaks for a given width.  */
1776       int width = atoi (argv[1]);
1777       char *input = read_file (stdin);
1778       int length = strlen (input);
1779       char *breaks = malloc (length);
1780       int i;
1781
1782       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1783
1784       for (i = 0; i < length; i++)
1785         {
1786           switch (breaks[i])
1787             {
1788               case UC_BREAK_POSSIBLE:
1789                 putc ('\n', stdout);
1790                 break;
1791               case UC_BREAK_MANDATORY:
1792                 break;
1793               case UC_BREAK_PROHIBITED:
1794                 break;
1795               default:
1796                 abort ();
1797             }
1798           putc (input[i], stdout);
1799         }
1800
1801       free (breaks);
1802
1803       return 0;
1804     }
1805   else
1806     return 1;
1807 }
1808
1809 #endif /* TEST2 */