lib/linebreak.c

   1 /* linebreak.c - line breaking of Unicode strings
   2    Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5 This program is free software; you can redistribute it and/or modify
   6 it under the terms of the GNU General Public License as published by
   7 the Free Software Foundation; either version 2, or (at your option)
   8 any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; if not, write to the Free Software Foundation,
  17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  18
  19 #include <config.h>
  20
  21 /* Specification.  */
  22 #include "linebreak.h"
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include "c-ctype.h"
  27 #include "xsize.h"
  28 #include "unistr.h"
  29 #include "uniwidth.h"
  30 #include "uniwidth/cjk.h"
  31 #include "streq.h"
  32
  33
  34 static int
  35 is_utf8_encoding (const char *encoding)
  36 {
  37   if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
  38     return 1;
  39   return 0;
  40 }
  41
  42
  43 /* Determine the line break points in S, and store the result at p[0..n-1].  */
  44 /* We don't support line breaking of complex-context dependent characters
  45    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
  46
  47 /* Line breaking classification.  */
  48
  49 enum
  50 {
  51   /* Values >= 20 are resolved at run time. */
  52   LBP_BK =  0, /* mandatory break */
  53 /*LBP_CR,         carriage return - not used here because it's a DOSism */
  54 /*LBP_LF,         line feed - not used here because it's a DOSism */
  55   LBP_CM = 20, /* attached characters and combining marks */
  56 /*LBP_SG,         surrogates - not used here because they are not characters */
  57   LBP_ZW =  1, /* zero width space */
  58   LBP_IN =  2, /* inseparable */
  59   LBP_GL =  3, /* non-breaking (glue) */
  60   LBP_CB = 22, /* contingent break opportunity */
  61   LBP_SP = 21, /* space */
  62   LBP_BA =  4, /* break opportunity after */
  63   LBP_BB =  5, /* break opportunity before */
  64   LBP_B2 =  6, /* break opportunity before and after */
  65   LBP_HY =  7, /* hyphen */
  66   LBP_NS =  8, /* non starter */
  67   LBP_OP =  9, /* opening punctuation */
  68   LBP_CL = 10, /* closing punctuation */
  69   LBP_QU = 11, /* ambiguous quotation */
  70   LBP_EX = 12, /* exclamation/interrogation */
  71   LBP_ID = 13, /* ideographic */
  72   LBP_NU = 14, /* numeric */
  73   LBP_IS = 15, /* infix separator (numeric) */
  74   LBP_SY = 16, /* symbols allowing breaks */
  75   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  76   LBP_PR = 18, /* prefix (numeric) */
  77   LBP_PO = 19, /* postfix (numeric) */
  78   LBP_SA = 23, /* complex context (South East Asian) */
  79   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  80   LBP_XX = 25  /* unknown */
  81 };
  82
  83 #include "lbrkprop.h"
  84
  85 static inline unsigned char
  86 lbrkprop_lookup (unsigned int uc)
  87 {
  88   unsigned int index1 = uc >> lbrkprop_header_0;
  89   if (index1 < lbrkprop_header_1)
  90     {
  91       int lookup1 = lbrkprop.level1[index1];
  92       if (lookup1 >= 0)
  93         {
  94           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
  95           int lookup2 = lbrkprop.level2[lookup1 + index2];
  96           if (lookup2 >= 0)
  97             {
  98               unsigned int index3 = uc & lbrkprop_header_4;
  99               return lbrkprop.level3[lookup2 + index3];
 100             }
 101         }
 102     }
 103   return LBP_XX;
 104 }
 105
 106 /* Table indexed by two line breaking classifications.  */
 107 #define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
 108 #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
 109 #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
 110 static const unsigned char lbrk_table[19][19] = {
 111                                 /* after */
 112         /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
 113 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
 114 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 115 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 116 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 117 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 118 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 119 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 120 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 121 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
 122 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
 123 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
 124 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 125 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
 126 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
 127 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 128 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 129 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
 130 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
 131 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 132 /* "" */
 133 /* before */
 134 };
 135 /* Note: The (B2,B2) entry should probably be D instead of P.  */
 136 /* Note: The (PR,ID) entry should probably be D instead of I.  */
 137
 138 void
 139 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
 140 {
 141   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 142   const unsigned char *s_end = s + n;
 143   int last_prop = LBP_BK; /* line break property of last non-space character */
 144   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 145   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 146
 147   /* Don't break inside multibyte characters.  */
 148   memset (p, UC_BREAK_PROHIBITED, n);
 149
 150   while (s < s_end)
 151     {
 152       unsigned int uc;
 153       int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
 154       int prop = lbrkprop_lookup (uc);
 155
 156       if (prop == LBP_BK)
 157         {
 158           /* Mandatory break.  */
 159           *p = UC_BREAK_MANDATORY;
 160           last_prop = LBP_BK;
 161           seen_space = NULL;
 162           seen_space2 = NULL;
 163         }
 164       else
 165         {
 166           char *q;
 167
 168           /* Resolve property values whose behaviour is not fixed.  */
 169           switch (prop)
 170             {
 171               case LBP_AI:
 172                 /* Resolve ambiguous.  */
 173                 prop = LBP_AI_REPLACEMENT;
 174                 break;
 175               case LBP_CB:
 176                 /* This is arbitrary.  */
 177                 prop = LBP_ID;
 178                 break;
 179               case LBP_SA:
 180                 /* We don't handle complex scripts yet.
 181                    Treat LBP_SA like LBP_XX.  */
 182               case LBP_XX:
 183                 /* This is arbitrary.  */
 184                 prop = LBP_AL;
 185                 break;
 186             }
 187
 188           /* Deal with combining characters.  */
 189           q = p;
 190           if (prop == LBP_CM)
 191             {
 192               /* Don't break just before a combining character.  */
 193               *p = UC_BREAK_PROHIBITED;
 194               /* A combining character turns a preceding space into LBP_AL.  */
 195               if (seen_space != NULL)
 196                 {
 197                   q = seen_space;
 198                   seen_space = seen_space2;
 199                   prop = LBP_AL;
 200                   goto lookup_via_table;
 201                 }
 202             }
 203           else if (prop == LBP_SP)
 204             {
 205               /* Don't break just before a space.  */
 206               *p = UC_BREAK_PROHIBITED;
 207               seen_space2 = seen_space;
 208               seen_space = p;
 209             }
 210           else
 211             {
 212              lookup_via_table:
 213               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 214               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 215                 abort ();
 216
 217               if (last_prop == LBP_BK)
 218                 {
 219                   /* Don't break at the beginning of a line.  */
 220                   *q = UC_BREAK_PROHIBITED;
 221                 }
 222               else
 223                 {
 224                   switch (lbrk_table [last_prop-1] [prop-1])
 225                     {
 226                       case D:
 227                         *q = UC_BREAK_POSSIBLE;
 228                         break;
 229                       case I:
 230                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 231                         break;
 232                       case P:
 233                         *q = UC_BREAK_PROHIBITED;
 234                         break;
 235                       default:
 236                         abort ();
 237                     }
 238                 }
 239               last_prop = prop;
 240               seen_space = NULL;
 241               seen_space2 = NULL;
 242             }
 243         }
 244
 245       s += count;
 246       p += count;
 247     }
 248 }
 249
 250 void
 251 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
 252 {
 253   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 254   const unsigned short *s_end = s + n;
 255   int last_prop = LBP_BK; /* line break property of last non-space character */
 256   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 257   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 258
 259   /* Don't break inside multibyte characters.  */
 260   memset (p, UC_BREAK_PROHIBITED, n);
 261
 262   while (s < s_end)
 263     {
 264       unsigned int uc;
 265       int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
 266       int prop = lbrkprop_lookup (uc);
 267
 268       if (prop == LBP_BK)
 269         {
 270           /* Mandatory break.  */
 271           *p = UC_BREAK_MANDATORY;
 272           last_prop = LBP_BK;
 273           seen_space = NULL;
 274           seen_space2 = NULL;
 275         }
 276       else
 277         {
 278           char *q;
 279
 280           /* Resolve property values whose behaviour is not fixed.  */
 281           switch (prop)
 282             {
 283               case LBP_AI:
 284                 /* Resolve ambiguous.  */
 285                 prop = LBP_AI_REPLACEMENT;
 286                 break;
 287               case LBP_CB:
 288                 /* This is arbitrary.  */
 289                 prop = LBP_ID;
 290                 break;
 291               case LBP_SA:
 292                 /* We don't handle complex scripts yet.
 293                    Treat LBP_SA like LBP_XX.  */
 294               case LBP_XX:
 295                 /* This is arbitrary.  */
 296                 prop = LBP_AL;
 297                 break;
 298             }
 299
 300           /* Deal with combining characters.  */
 301           q = p;
 302           if (prop == LBP_CM)
 303             {
 304               /* Don't break just before a combining character.  */
 305               *p = UC_BREAK_PROHIBITED;
 306               /* A combining character turns a preceding space into LBP_AL.  */
 307               if (seen_space != NULL)
 308                 {
 309                   q = seen_space;
 310                   seen_space = seen_space2;
 311                   prop = LBP_AL;
 312                   goto lookup_via_table;
 313                 }
 314             }
 315           else if (prop == LBP_SP)
 316             {
 317               /* Don't break just before a space.  */
 318               *p = UC_BREAK_PROHIBITED;
 319               seen_space2 = seen_space;
 320               seen_space = p;
 321             }
 322           else
 323             {
 324              lookup_via_table:
 325               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 326               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 327                 abort ();
 328
 329               if (last_prop == LBP_BK)
 330                 {
 331                   /* Don't break at the beginning of a line.  */
 332                   *q = UC_BREAK_PROHIBITED;
 333                 }
 334               else
 335                 {
 336                   switch (lbrk_table [last_prop-1] [prop-1])
 337                     {
 338                       case D:
 339                         *q = UC_BREAK_POSSIBLE;
 340                         break;
 341                       case I:
 342                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 343                         break;
 344                       case P:
 345                         *q = UC_BREAK_PROHIBITED;
 346                         break;
 347                       default:
 348                         abort ();
 349                     }
 350                 }
 351               last_prop = prop;
 352               seen_space = NULL;
 353               seen_space2 = NULL;
 354             }
 355         }
 356
 357       s += count;
 358       p += count;
 359     }
 360 }
 361
 362 void
 363 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
 364 {
 365   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 366   const unsigned int *s_end = s + n;
 367   int last_prop = LBP_BK; /* line break property of last non-space character */
 368   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 369   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 370
 371   while (s < s_end)
 372     {
 373       unsigned int uc = *s;
 374       int prop = lbrkprop_lookup (uc);
 375
 376       if (prop == LBP_BK)
 377         {
 378           /* Mandatory break.  */
 379           *p = UC_BREAK_MANDATORY;
 380           last_prop = LBP_BK;
 381           seen_space = NULL;
 382           seen_space2 = NULL;
 383         }
 384       else
 385         {
 386           char *q;
 387
 388           /* Resolve property values whose behaviour is not fixed.  */
 389           switch (prop)
 390             {
 391               case LBP_AI:
 392                 /* Resolve ambiguous.  */
 393                 prop = LBP_AI_REPLACEMENT;
 394                 break;
 395               case LBP_CB:
 396                 /* This is arbitrary.  */
 397                 prop = LBP_ID;
 398                 break;
 399               case LBP_SA:
 400                 /* We don't handle complex scripts yet.
 401                    Treat LBP_SA like LBP_XX.  */
 402               case LBP_XX:
 403                 /* This is arbitrary.  */
 404                 prop = LBP_AL;
 405                 break;
 406             }
 407
 408           /* Deal with combining characters.  */
 409           q = p;
 410           if (prop == LBP_CM)
 411             {
 412               /* Don't break just before a combining character.  */
 413               *p = UC_BREAK_PROHIBITED;
 414               /* A combining character turns a preceding space into LBP_AL.  */
 415               if (seen_space != NULL)
 416                 {
 417                   q = seen_space;
 418                   seen_space = seen_space2;
 419                   prop = LBP_AL;
 420                   goto lookup_via_table;
 421                 }
 422             }
 423           else if (prop == LBP_SP)
 424             {
 425               /* Don't break just before a space.  */
 426               *p = UC_BREAK_PROHIBITED;
 427               seen_space2 = seen_space;
 428               seen_space = p;
 429             }
 430           else
 431             {
 432              lookup_via_table:
 433               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 434               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 435                 abort ();
 436
 437               if (last_prop == LBP_BK)
 438                 {
 439                   /* Don't break at the beginning of a line.  */
 440                   *q = UC_BREAK_PROHIBITED;
 441                 }
 442               else
 443                 {
 444                   switch (lbrk_table [last_prop-1] [prop-1])
 445                     {
 446                       case D:
 447                         *q = UC_BREAK_POSSIBLE;
 448                         break;
 449                       case I:
 450                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 451                         break;
 452                       case P:
 453                         *q = UC_BREAK_PROHIBITED;
 454                         break;
 455                       default:
 456                         abort ();
 457                     }
 458                 }
 459               last_prop = prop;
 460               seen_space = NULL;
 461               seen_space2 = NULL;
 462             }
 463         }
 464
 465       s++;
 466       p++;
 467     }
 468 }
 469
 470
 471 /* Choose the best line breaks, assuming the uc_width function.
 472    Return the column after the end of the string.  */
 473
 474 int
 475 u8_width_linebreaks (const unsigned char *s, size_t n,
 476                      int width, int start_column, int at_end_columns,
 477                      const char *o, const char *encoding,
 478                      char *p)
 479 {
 480   const unsigned char *s_end;
 481   char *last_p;
 482   int last_column;
 483   int piece_width;
 484
 485   u8_possible_linebreaks (s, n, encoding, p);
 486
 487   s_end = s + n;
 488   last_p = NULL;
 489   last_column = start_column;
 490   piece_width = 0;
 491   while (s < s_end)
 492     {
 493       unsigned int uc;
 494       int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
 495
 496       /* Respect the override.  */
 497       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 498         *p = *o;
 499
 500       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 501         {
 502           /* An atomic piece of text ends here.  */
 503           if (last_p != NULL && last_column + piece_width > width)
 504             {
 505               /* Insert a line break.  */
 506               *last_p = UC_BREAK_POSSIBLE;
 507               last_column = 0;
 508             }
 509         }
 510
 511       if (*p == UC_BREAK_MANDATORY)
 512         {
 513           /* uc is a line break character.  */
 514           /* Start a new piece at column 0.  */
 515           last_p = NULL;
 516           last_column = 0;
 517           piece_width = 0;
 518         }
 519       else
 520         {
 521           /* uc is not a line break character.  */
 522           int w;
 523
 524           if (*p == UC_BREAK_POSSIBLE)
 525             {
 526               /* Start a new piece.  */
 527               last_p = p;
 528               last_column += piece_width;
 529               piece_width = 0;
 530               /* No line break for the moment, may be turned into
 531                  UC_BREAK_POSSIBLE later, via last_p. */
 532             }
 533
 534           *p = UC_BREAK_PROHIBITED;
 535
 536           w = uc_width (uc, encoding);
 537           if (w >= 0) /* ignore control characters in the string */
 538             piece_width += w;
 539          }
 540
 541       s += count;
 542       p += count;
 543       if (o != NULL)
 544         o += count;
 545     }
 546
 547   /* The last atomic piece of text ends here.  */
 548   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 549     {
 550       /* Insert a line break.  */
 551       *last_p = UC_BREAK_POSSIBLE;
 552       last_column = 0;
 553     }
 554
 555   return last_column + piece_width;
 556 }
 557
 558 int
 559 u16_width_linebreaks (const unsigned short *s, size_t n,
 560                       int width, int start_column, int at_end_columns,
 561                       const char *o, const char *encoding,
 562                       char *p)
 563 {
 564   const unsigned short *s_end;
 565   char *last_p;
 566   int last_column;
 567   int piece_width;
 568
 569   u16_possible_linebreaks (s, n, encoding, p);
 570
 571   s_end = s + n;
 572   last_p = NULL;
 573   last_column = start_column;
 574   piece_width = 0;
 575   while (s < s_end)
 576     {
 577       unsigned int uc;
 578       int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
 579
 580       /* Respect the override.  */
 581       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 582         *p = *o;
 583
 584       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 585         {
 586           /* An atomic piece of text ends here.  */
 587           if (last_p != NULL && last_column + piece_width > width)
 588             {
 589               /* Insert a line break.  */
 590               *last_p = UC_BREAK_POSSIBLE;
 591               last_column = 0;
 592             }
 593         }
 594
 595       if (*p == UC_BREAK_MANDATORY)
 596         {
 597           /* uc is a line break character.  */
 598           /* Start a new piece at column 0.  */
 599           last_p = NULL;
 600           last_column = 0;
 601           piece_width = 0;
 602         }
 603       else
 604         {
 605           /* uc is not a line break character.  */
 606           int w;
 607
 608           if (*p == UC_BREAK_POSSIBLE)
 609             {
 610               /* Start a new piece.  */
 611               last_p = p;
 612               last_column += piece_width;
 613               piece_width = 0;
 614               /* No line break for the moment, may be turned into
 615                  UC_BREAK_POSSIBLE later, via last_p. */
 616             }
 617
 618           *p = UC_BREAK_PROHIBITED;
 619
 620           w = uc_width (uc, encoding);
 621           if (w >= 0) /* ignore control characters in the string */
 622             piece_width += w;
 623          }
 624
 625       s += count;
 626       p += count;
 627       if (o != NULL)
 628         o += count;
 629     }
 630
 631   /* The last atomic piece of text ends here.  */
 632   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 633     {
 634       /* Insert a line break.  */
 635       *last_p = UC_BREAK_POSSIBLE;
 636       last_column = 0;
 637     }
 638
 639   return last_column + piece_width;
 640 }
 641
 642 int
 643 u32_width_linebreaks (const unsigned int *s, size_t n,
 644                       int width, int start_column, int at_end_columns,
 645                       const char *o, const char *encoding,
 646                       char *p)
 647 {
 648   const unsigned int *s_end;
 649   char *last_p;
 650   int last_column;
 651   int piece_width;
 652
 653   u32_possible_linebreaks (s, n, encoding, p);
 654
 655   s_end = s + n;
 656   last_p = NULL;
 657   last_column = start_column;
 658   piece_width = 0;
 659   while (s < s_end)
 660     {
 661       unsigned int uc = *s;
 662
 663       /* Respect the override.  */
 664       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 665         *p = *o;
 666
 667       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 668         {
 669           /* An atomic piece of text ends here.  */
 670           if (last_p != NULL && last_column + piece_width > width)
 671             {
 672               /* Insert a line break.  */
 673               *last_p = UC_BREAK_POSSIBLE;
 674               last_column = 0;
 675             }
 676         }
 677
 678       if (*p == UC_BREAK_MANDATORY)
 679         {
 680           /* uc is a line break character.  */
 681           /* Start a new piece at column 0.  */
 682           last_p = NULL;
 683           last_column = 0;
 684           piece_width = 0;
 685         }
 686       else
 687         {
 688           /* uc is not a line break character.  */
 689           int w;
 690
 691           if (*p == UC_BREAK_POSSIBLE)
 692             {
 693               /* Start a new piece.  */
 694               last_p = p;
 695               last_column += piece_width;
 696               piece_width = 0;
 697               /* No line break for the moment, may be turned into
 698                  UC_BREAK_POSSIBLE later, via last_p. */
 699             }
 700
 701           *p = UC_BREAK_PROHIBITED;
 702
 703           w = uc_width (uc, encoding);
 704           if (w >= 0) /* ignore control characters in the string */
 705             piece_width += w;
 706          }
 707
 708       s++;
 709       p++;
 710       if (o != NULL)
 711         o++;
 712     }
 713
 714   /* The last atomic piece of text ends here.  */
 715   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 716     {
 717       /* Insert a line break.  */
 718       *last_p = UC_BREAK_POSSIBLE;
 719       last_column = 0;
 720     }
 721
 722   return last_column + piece_width;
 723 }
 724
 725
 726 #ifdef TEST1
 727
 728 #include <stdio.h>
 729
 730 /* Read the contents of an input stream, and return it, terminated with a NUL
 731    byte. */
 732 char *
 733 read_file (FILE *stream)
 734 {
 735 #define BUFSIZE 4096
 736   char *buf = NULL;
 737   int alloc = 0;
 738   int size = 0;
 739   int count;
 740
 741   while (! feof (stream))
 742     {
 743       if (size + BUFSIZE > alloc)
 744         {
 745           alloc = alloc + alloc / 2;
 746           if (alloc < size + BUFSIZE)
 747             alloc = size + BUFSIZE;
 748           buf = realloc (buf, alloc);
 749           if (buf == NULL)
 750             {
 751               fprintf (stderr, "out of memory\n");
 752               exit (1);
 753             }
 754         }
 755       count = fread (buf + size, 1, BUFSIZE, stream);
 756       if (count == 0)
 757         {
 758           if (ferror (stream))
 759             {
 760               perror ("fread");
 761               exit (1);
 762             }
 763         }
 764       else
 765         size += count;
 766     }
 767   buf = realloc (buf, size + 1);
 768   if (buf == NULL)
 769     {
 770       fprintf (stderr, "out of memory\n");
 771       exit (1);
 772     }
 773   buf[size] = '\0';
 774   return buf;
 775 #undef BUFSIZE
 776 }
 777
 778 int
 779 main (int argc, char * argv[])
 780 {
 781   if (argc == 1)
 782     {
 783       /* Display all the break opportunities in the input string.  */
 784       char *input = read_file (stdin);
 785       int length = strlen (input);
 786       char *breaks = malloc (length);
 787       int i;
 788
 789       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
 790
 791       for (i = 0; i < length; i++)
 792         {
 793           switch (breaks[i])
 794             {
 795               case UC_BREAK_POSSIBLE:
 796                 /* U+2027 in UTF-8 encoding */
 797                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
 798                 break;
 799               case UC_BREAK_MANDATORY:
 800                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
 801                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
 802                 break;
 803               case UC_BREAK_PROHIBITED:
 804                 break;
 805               default:
 806                 abort ();
 807             }
 808           putc (input[i], stdout);
 809         }
 810
 811       free (breaks);
 812
 813       return 0;
 814     }
 815   else if (argc == 2)
 816     {
 817       /* Insert line breaks for a given width.  */
 818       int width = atoi (argv[1]);
 819       char *input = read_file (stdin);
 820       int length = strlen (input);
 821       char *breaks = malloc (length);
 822       int i;
 823
 824       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
 825
 826       for (i = 0; i < length; i++)
 827         {
 828           switch (breaks[i])
 829             {
 830               case UC_BREAK_POSSIBLE:
 831                 putc ('\n', stdout);
 832                 break;
 833               case UC_BREAK_MANDATORY:
 834                 break;
 835               case UC_BREAK_PROHIBITED:
 836                 break;
 837               default:
 838                 abort ();
 839             }
 840           putc (input[i], stdout);
 841         }
 842
 843       free (breaks);
 844
 845       return 0;
 846     }
 847   else
 848     return 1;
 849 }
 850
 851 #endif /* TEST1 */
 852
 853
 854 /* Now the same thing with an arbitrary encoding.
 855
 856    We convert the input string to Unicode.
 857
 858    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
 859    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
 860    \U0000FFFF.  UTF-16 and variants support only characters up to
 861    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
 862    UCS-4 specification leaves doubts about endianness and byte order mark.
 863    glibc currently interprets it as big endian without byte order mark,
 864    but this is not backed by an RFC.  So we use UTF-8. It supports
 865    characters up to \U7FFFFFFF and is unambiguously defined.  */
 866
 867 #if HAVE_ICONV
 868
 869 #include <iconv.h>
 870 #include <errno.h>
 871
 872 /* Luckily, the encoding's name is platform independent.  */
 873 #define UTF8_NAME "UTF-8"
 874
 875 /* Return the length of a string after conversion through an iconv_t.  */
 876 static size_t
 877 iconv_string_length (iconv_t cd, const char *s, size_t n)
 878 {
 879 #define TMPBUFSIZE 4096
 880   size_t count = 0;
 881   char tmpbuf[TMPBUFSIZE];
 882   const char *inptr = s;
 883   size_t insize = n;
 884   while (insize > 0)
 885     {
 886       char *outptr = tmpbuf;
 887       size_t outsize = TMPBUFSIZE;
 888       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
 889       if (res == (size_t)(-1) && errno != E2BIG)
 890         return (size_t)(-1);
 891       count += outptr - tmpbuf;
 892     }
 893   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
 894 #if defined _LIBICONV_VERSION \
 895     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 896   {
 897     char *outptr = tmpbuf;
 898     size_t outsize = TMPBUFSIZE;
 899     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
 900     if (res == (size_t)(-1))
 901       return (size_t)(-1);
 902     count += outptr - tmpbuf;
 903   }
 904   /* Return to the initial state.  */
 905   iconv (cd, NULL, NULL, NULL, NULL);
 906 #endif
 907   return count;
 908 #undef TMPBUFSIZE
 909 }
 910
 911 static void
 912 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
 913                               size_t *offtable, char *t, size_t m)
 914 {
 915   size_t i;
 916   const char *s_end;
 917   const char *inptr;
 918   char *outptr;
 919   size_t outsize;
 920   /* Avoid glibc-2.1 bug.  */
 921 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
 922   const size_t extra = 1;
 923 #else
 924   const size_t extra = 0;
 925 #endif
 926
 927   for (i = 0; i < n; i++)
 928     offtable[i] = (size_t)(-1);
 929
 930   s_end = s + n;
 931   inptr = s;
 932   outptr = t;
 933   outsize = m + extra;
 934   while (inptr < s_end)
 935     {
 936       const char *saved_inptr;
 937       size_t insize;
 938       size_t res;
 939
 940       offtable[inptr - s] = outptr - t;
 941
 942       saved_inptr = inptr;
 943       res = (size_t)(-1);
 944       for (insize = 1; inptr + insize <= s_end; insize++)
 945         {
 946           res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
 947           if (!(res == (size_t)(-1) && errno == EINVAL))
 948             break;
 949           /* We expect that no input bytes have been consumed so far.  */
 950           if (inptr != saved_inptr)
 951             abort ();
 952         }
 953       /* After we verified the convertibility and computed the translation's
 954          size m, there shouldn't be any conversion error here. */
 955       if (res == (size_t)(-1))
 956         abort ();
 957     }
 958   /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
 959 #if defined _LIBICONV_VERSION \
 960     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 961   if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
 962     abort ();
 963 #endif
 964   /* We should have produced exactly m output bytes.  */
 965   if (outsize != extra)
 966     abort ();
 967 }
 968
 969 #endif /* HAVE_ICONV */
 970
 971 #if C_CTYPE_ASCII
 972
 973 /* Tests whether a string is entirely ASCII.  Returns 1 if yes.
 974    Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
 975 static int
 976 is_all_ascii (const char *s, size_t n)
 977 {
 978   for (; n > 0; s++, n--)
 979     {
 980       unsigned char c = (unsigned char) *s;
 981
 982       if (!(c_isprint (c) || c_isspace (c)))
 983         return 0;
 984     }
 985   return 1;
 986 }
 987
 988 #endif /* C_CTYPE_ASCII */
 989
 990 void
 991 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
 992                          char *p)
 993 {
 994   if (n == 0)
 995     return;
 996   if (is_utf8_encoding (encoding))
 997     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
 998   else
 999     {
1000 #if HAVE_ICONV
1001       iconv_t to_utf8;
1002       /* Avoid glibc-2.1 bug with EUC-KR.  */
1003 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1004       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1005         to_utf8 = (iconv_t)(-1);
1006       else
1007 # endif
1008       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1009          GB18030.  */
1010 # if defined __sun && !defined _LIBICONV_VERSION
1011       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1012           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1013           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1014           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1015           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1016           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1017         to_utf8 = (iconv_t)(-1);
1018       else
1019 # endif
1020       to_utf8 = iconv_open (UTF8_NAME, encoding);
1021       if (to_utf8 != (iconv_t)(-1))
1022         {
1023           /* Determine the length of the resulting UTF-8 string.  */
1024           size_t m = iconv_string_length (to_utf8, s, n);
1025           if (m != (size_t)(-1))
1026             {
1027               /* Convert the string to UTF-8 and build a translation table
1028                  from offsets into s to offsets into the translated string.  */
1029               size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1030               char *memory =
1031                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1032               if (memory != NULL)
1033                 {
1034                   size_t *offtable = (size_t *) memory;
1035                   char *t = (char *) (offtable + n);
1036                   char *q = (char *) (t + m);
1037                   size_t i;
1038
1039                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1040
1041                   /* Determine the possible line breaks of the UTF-8 string.  */
1042                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1043
1044                   /* Translate the result back to the original string.  */
1045                   memset (p, UC_BREAK_PROHIBITED, n);
1046                   for (i = 0; i < n; i++)
1047                     if (offtable[i] != (size_t)(-1))
1048                       p[i] = q[offtable[i]];
1049
1050                   free (memory);
1051                   iconv_close (to_utf8);
1052                   return;
1053                 }
1054             }
1055           iconv_close (to_utf8);
1056         }
1057 #endif
1058       /* Impossible to convert.  */
1059 #if C_CTYPE_ASCII
1060       if (is_all_ascii (s, n))
1061         {
1062           /* ASCII is a subset of UTF-8.  */
1063           u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1064           return;
1065         }
1066 #endif
1067       /* We have a non-ASCII string and cannot convert it.
1068          Don't produce line breaks except those already present in the
1069          input string.  All we assume here is that the encoding is
1070          minimally ASCII compatible.  */
1071       {
1072         const char *s_end = s + n;
1073         while (s < s_end)
1074           {
1075             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1076             s++;
1077             p++;
1078           }
1079       }
1080     }
1081 }
1082
1083 int
1084 mbs_width_linebreaks (const char *s, size_t n,
1085                       int width, int start_column, int at_end_columns,
1086                       const char *o, const char *encoding,
1087                       char *p)
1088 {
1089   if (n == 0)
1090     return start_column;
1091   if (is_utf8_encoding (encoding))
1092     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1093   else
1094     {
1095 #if HAVE_ICONV
1096       iconv_t to_utf8;
1097       /* Avoid glibc-2.1 bug with EUC-KR.  */
1098 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1099       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1100         to_utf8 = (iconv_t)(-1);
1101       else
1102 # endif
1103       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1104          GB18030.  */
1105 # if defined __sun && !defined _LIBICONV_VERSION
1106       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1107           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1108           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1109           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1110           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1111           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1112         to_utf8 = (iconv_t)(-1);
1113       else
1114 # endif
1115       to_utf8 = iconv_open (UTF8_NAME, encoding);
1116       if (to_utf8 != (iconv_t)(-1))
1117         {
1118           /* Determine the length of the resulting UTF-8 string.  */
1119           size_t m = iconv_string_length (to_utf8, s, n);
1120           if (m != (size_t)(-1))
1121             {
1122               /* Convert the string to UTF-8 and build a translation table
1123                  from offsets into s to offsets into the translated string.  */
1124               size_t memory_size =
1125                 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1126                        (o != NULL ? m : 0));
1127               char *memory =
1128                 (char *)
1129                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1130               if (memory != NULL)
1131                 {
1132                   size_t *offtable = (size_t *) memory;
1133                   char *t = (char *) (offtable + n);
1134                   char *q = (char *) (t + m);
1135                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1136                   int res_column;
1137                   size_t i;
1138
1139                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1140
1141                   /* Translate the overrides to the UTF-8 string.  */
1142                   if (o != NULL)
1143                     {
1144                       memset (o8, UC_BREAK_UNDEFINED, m);
1145                       for (i = 0; i < n; i++)
1146                         if (offtable[i] != (size_t)(-1))
1147                           o8[offtable[i]] = o[i];
1148                     }
1149
1150                   /* Determine the line breaks of the UTF-8 string.  */
1151                   res_column =
1152                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1153
1154                   /* Translate the result back to the original string.  */
1155                   memset (p, UC_BREAK_PROHIBITED, n);
1156                   for (i = 0; i < n; i++)
1157                     if (offtable[i] != (size_t)(-1))
1158                       p[i] = q[offtable[i]];
1159
1160                   free (memory);
1161                   iconv_close (to_utf8);
1162                   return res_column;
1163                 }
1164             }
1165           iconv_close (to_utf8);
1166         }
1167 #endif
1168       /* Impossible to convert.  */
1169 #if C_CTYPE_ASCII
1170       if (is_all_ascii (s, n))
1171         {
1172           /* ASCII is a subset of UTF-8.  */
1173           return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1174         }
1175 #endif
1176       /* We have a non-ASCII string and cannot convert it.
1177          Don't produce line breaks except those already present in the
1178          input string.  All we assume here is that the encoding is
1179          minimally ASCII compatible.  */
1180       {
1181         const char *s_end = s + n;
1182         while (s < s_end)
1183           {
1184             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1185                   ? UC_BREAK_MANDATORY
1186                   : UC_BREAK_PROHIBITED);
1187             s++;
1188             p++;
1189             if (o != NULL)
1190               o++;
1191           }
1192         /* We cannot compute widths in this case.  */
1193         return start_column;
1194       }
1195     }
1196 }
1197
1198
1199 #ifdef TEST2
1200
1201 #include <stdio.h>
1202 #include <locale.h>
1203
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is heap-allocated (via realloc); the caller may free it.
   If reading fails or memory runs out, a message is printed to stderr and
   the program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Byte counts are kept in size_t: that is what fread returns, and an
     int would overflow once the input grows beyond INT_MAX bytes.  */
  size_t alloc = 0;
  size_t size = 0;
  size_t count;

  while (! feof (stream))
    {
      /* Make sure there is room for one more full chunk, growing the
         buffer by half of its size or by what is needed, whichever is
         larger.  */
      if (size + BUFSIZE > alloc)
        {
          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          buf = realloc (buf, alloc);
          if (buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
        }
      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Nothing read: either end of file (the loop condition will
             notice) or a read error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }
  /* Resize to the exact size, with room for the terminating NUL.  */
  buf = realloc (buf, size + 1);
  if (buf == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1251
1252 int
1253 main (int argc, char * argv[])
1254 {
1255   setlocale (LC_CTYPE, "");
1256   if (argc == 1)
1257     {
1258       /* Display all the break opportunities in the input string.  */
1259       char *input = read_file (stdin);
1260       int length = strlen (input);
1261       char *breaks = malloc (length);
1262       int i;
1263
1264       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1265
1266       for (i = 0; i < length; i++)
1267         {
1268           switch (breaks[i])
1269             {
1270               case UC_BREAK_POSSIBLE:
1271                 putc ('|', stdout);
1272                 break;
1273               case UC_BREAK_MANDATORY:
1274                 break;
1275               case UC_BREAK_PROHIBITED:
1276                 break;
1277               default:
1278                 abort ();
1279             }
1280           putc (input[i], stdout);
1281         }
1282
1283       free (breaks);
1284
1285       return 0;
1286     }
1287   else if (argc == 2)
1288     {
1289       /* Insert line breaks for a given width.  */
1290       int width = atoi (argv[1]);
1291       char *input = read_file (stdin);
1292       int length = strlen (input);
1293       char *breaks = malloc (length);
1294       int i;
1295
1296       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1297
1298       for (i = 0; i < length; i++)
1299         {
1300           switch (breaks[i])
1301             {
1302               case UC_BREAK_POSSIBLE:
1303                 putc ('\n', stdout);
1304                 break;
1305               case UC_BREAK_MANDATORY:
1306                 break;
1307               case UC_BREAK_PROHIBITED:
1308                 break;
1309               default:
1310                 abort ();
1311             }
1312           putc (input[i], stdout);
1313         }
1314
1315       free (breaks);
1316
1317       return 0;
1318     }
1319   else
1320     return 1;
1321 }
1322
1323 #endif /* TEST2 */