lib/linebreak.c

   1 /* linebreak.c - line breaking of Unicode strings
   2    Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5 This program is free software: you can redistribute it and/or modify
   6 it under the terms of the GNU General Public License as published by
   7 the Free Software Foundation; either version 3 of the License, or
   8 (at your option) any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "linebreak.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include "c-ctype.h"
  26 #include "xsize.h"
  27 #include "unistr.h"
  28 #include "uniwidth.h"
  29 #include "uniwidth/cjk.h"
  30 #include "streq.h"
  31
  32
  33 static int
  34 is_utf8_encoding (const char *encoding)
  35 {
  36   if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
  37     return 1;
  38   return 0;
  39 }
  40
  41
  42 /* Determine the line break points in S, and store the result at p[0..n-1].  */
  43 /* We don't support line breaking of complex-context dependent characters
  44    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
  45
  46 /* Line breaking classification.  */
  47
  48 enum
  49 {
  50   /* Values >= 20 are resolved at run time. */
  51   LBP_BK =  0, /* mandatory break */
  52 /*LBP_CR,         carriage return - not used here because it's a DOSism */
  53 /*LBP_LF,         line feed - not used here because it's a DOSism */
  54   LBP_CM = 20, /* attached characters and combining marks */
  55 /*LBP_SG,         surrogates - not used here because they are not characters */
  56   LBP_ZW =  1, /* zero width space */
  57   LBP_IN =  2, /* inseparable */
  58   LBP_GL =  3, /* non-breaking (glue) */
  59   LBP_CB = 22, /* contingent break opportunity */
  60   LBP_SP = 21, /* space */
  61   LBP_BA =  4, /* break opportunity after */
  62   LBP_BB =  5, /* break opportunity before */
  63   LBP_B2 =  6, /* break opportunity before and after */
  64   LBP_HY =  7, /* hyphen */
  65   LBP_NS =  8, /* non starter */
  66   LBP_OP =  9, /* opening punctuation */
  67   LBP_CL = 10, /* closing punctuation */
  68   LBP_QU = 11, /* ambiguous quotation */
  69   LBP_EX = 12, /* exclamation/interrogation */
  70   LBP_ID = 13, /* ideographic */
  71   LBP_NU = 14, /* numeric */
  72   LBP_IS = 15, /* infix separator (numeric) */
  73   LBP_SY = 16, /* symbols allowing breaks */
  74   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  75   LBP_PR = 18, /* prefix (numeric) */
  76   LBP_PO = 19, /* postfix (numeric) */
  77   LBP_SA = 23, /* complex context (South East Asian) */
  78   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  79   LBP_XX = 25  /* unknown */
  80 };
  81
  82 #include "lbrkprop.h"
  83
  84 static inline unsigned char
  85 lbrkprop_lookup (unsigned int uc)
  86 {
  87   unsigned int index1 = uc >> lbrkprop_header_0;
  88   if (index1 < lbrkprop_header_1)
  89     {
  90       int lookup1 = lbrkprop.level1[index1];
  91       if (lookup1 >= 0)
  92         {
  93           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
  94           int lookup2 = lbrkprop.level2[lookup1 + index2];
  95           if (lookup2 >= 0)
  96             {
  97               unsigned int index3 = uc & lbrkprop_header_4;
  98               return lbrkprop.level3[lookup2 + index3];
  99             }
 100         }
 101     }
 102   return LBP_XX;
 103 }
 104
 105 /* Table indexed by two line breaking classifications.  */
 106 #define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
 107 #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
 108 #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
 109 static const unsigned char lbrk_table[19][19] = {
 110                                 /* after */
 111         /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
 112 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
 113 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 114 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 115 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 116 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 117 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 118 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 119 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 120 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
 121 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
 122 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
 123 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 124 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
 125 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
 126 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 127 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 128 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
 129 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
 130 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 131 /* "" */
 132 /* before */
 133 };
 134 /* Note: The (B2,B2) entry should probably be D instead of P.  */
 135 /* Note: The (PR,ID) entry should probably be D instead of I.  */
 136
 137 void
 138 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
 139 {
 140   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 141   const unsigned char *s_end = s + n;
 142   int last_prop = LBP_BK; /* line break property of last non-space character */
 143   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 144   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 145
 146   /* Don't break inside multibyte characters.  */
 147   memset (p, UC_BREAK_PROHIBITED, n);
 148
 149   while (s < s_end)
 150     {
 151       unsigned int uc;
 152       int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
 153       int prop = lbrkprop_lookup (uc);
 154
 155       if (prop == LBP_BK)
 156         {
 157           /* Mandatory break.  */
 158           *p = UC_BREAK_MANDATORY;
 159           last_prop = LBP_BK;
 160           seen_space = NULL;
 161           seen_space2 = NULL;
 162         }
 163       else
 164         {
 165           char *q;
 166
 167           /* Resolve property values whose behaviour is not fixed.  */
 168           switch (prop)
 169             {
 170               case LBP_AI:
 171                 /* Resolve ambiguous.  */
 172                 prop = LBP_AI_REPLACEMENT;
 173                 break;
 174               case LBP_CB:
 175                 /* This is arbitrary.  */
 176                 prop = LBP_ID;
 177                 break;
 178               case LBP_SA:
 179                 /* We don't handle complex scripts yet.
 180                    Treat LBP_SA like LBP_XX.  */
 181               case LBP_XX:
 182                 /* This is arbitrary.  */
 183                 prop = LBP_AL;
 184                 break;
 185             }
 186
 187           /* Deal with combining characters.  */
 188           q = p;
 189           if (prop == LBP_CM)
 190             {
 191               /* Don't break just before a combining character.  */
 192               *p = UC_BREAK_PROHIBITED;
 193               /* A combining character turns a preceding space into LBP_AL.  */
 194               if (seen_space != NULL)
 195                 {
 196                   q = seen_space;
 197                   seen_space = seen_space2;
 198                   prop = LBP_AL;
 199                   goto lookup_via_table;
 200                 }
 201             }
 202           else if (prop == LBP_SP)
 203             {
 204               /* Don't break just before a space.  */
 205               *p = UC_BREAK_PROHIBITED;
 206               seen_space2 = seen_space;
 207               seen_space = p;
 208             }
 209           else
 210             {
 211              lookup_via_table:
 212               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 213               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 214                 abort ();
 215
 216               if (last_prop == LBP_BK)
 217                 {
 218                   /* Don't break at the beginning of a line.  */
 219                   *q = UC_BREAK_PROHIBITED;
 220                 }
 221               else
 222                 {
 223                   switch (lbrk_table [last_prop-1] [prop-1])
 224                     {
 225                       case D:
 226                         *q = UC_BREAK_POSSIBLE;
 227                         break;
 228                       case I:
 229                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 230                         break;
 231                       case P:
 232                         *q = UC_BREAK_PROHIBITED;
 233                         break;
 234                       default:
 235                         abort ();
 236                     }
 237                 }
 238               last_prop = prop;
 239               seen_space = NULL;
 240               seen_space2 = NULL;
 241             }
 242         }
 243
 244       s += count;
 245       p += count;
 246     }
 247 }
 248
 249 void
 250 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
 251 {
 252   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 253   const unsigned short *s_end = s + n;
 254   int last_prop = LBP_BK; /* line break property of last non-space character */
 255   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 256   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 257
 258   /* Don't break inside multibyte characters.  */
 259   memset (p, UC_BREAK_PROHIBITED, n);
 260
 261   while (s < s_end)
 262     {
 263       unsigned int uc;
 264       int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
 265       int prop = lbrkprop_lookup (uc);
 266
 267       if (prop == LBP_BK)
 268         {
 269           /* Mandatory break.  */
 270           *p = UC_BREAK_MANDATORY;
 271           last_prop = LBP_BK;
 272           seen_space = NULL;
 273           seen_space2 = NULL;
 274         }
 275       else
 276         {
 277           char *q;
 278
 279           /* Resolve property values whose behaviour is not fixed.  */
 280           switch (prop)
 281             {
 282               case LBP_AI:
 283                 /* Resolve ambiguous.  */
 284                 prop = LBP_AI_REPLACEMENT;
 285                 break;
 286               case LBP_CB:
 287                 /* This is arbitrary.  */
 288                 prop = LBP_ID;
 289                 break;
 290               case LBP_SA:
 291                 /* We don't handle complex scripts yet.
 292                    Treat LBP_SA like LBP_XX.  */
 293               case LBP_XX:
 294                 /* This is arbitrary.  */
 295                 prop = LBP_AL;
 296                 break;
 297             }
 298
 299           /* Deal with combining characters.  */
 300           q = p;
 301           if (prop == LBP_CM)
 302             {
 303               /* Don't break just before a combining character.  */
 304               *p = UC_BREAK_PROHIBITED;
 305               /* A combining character turns a preceding space into LBP_AL.  */
 306               if (seen_space != NULL)
 307                 {
 308                   q = seen_space;
 309                   seen_space = seen_space2;
 310                   prop = LBP_AL;
 311                   goto lookup_via_table;
 312                 }
 313             }
 314           else if (prop == LBP_SP)
 315             {
 316               /* Don't break just before a space.  */
 317               *p = UC_BREAK_PROHIBITED;
 318               seen_space2 = seen_space;
 319               seen_space = p;
 320             }
 321           else
 322             {
 323              lookup_via_table:
 324               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 325               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 326                 abort ();
 327
 328               if (last_prop == LBP_BK)
 329                 {
 330                   /* Don't break at the beginning of a line.  */
 331                   *q = UC_BREAK_PROHIBITED;
 332                 }
 333               else
 334                 {
 335                   switch (lbrk_table [last_prop-1] [prop-1])
 336                     {
 337                       case D:
 338                         *q = UC_BREAK_POSSIBLE;
 339                         break;
 340                       case I:
 341                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 342                         break;
 343                       case P:
 344                         *q = UC_BREAK_PROHIBITED;
 345                         break;
 346                       default:
 347                         abort ();
 348                     }
 349                 }
 350               last_prop = prop;
 351               seen_space = NULL;
 352               seen_space2 = NULL;
 353             }
 354         }
 355
 356       s += count;
 357       p += count;
 358     }
 359 }
 360
 361 void
 362 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
 363 {
 364   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 365   const unsigned int *s_end = s + n;
 366   int last_prop = LBP_BK; /* line break property of last non-space character */
 367   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 368   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 369
 370   while (s < s_end)
 371     {
 372       unsigned int uc = *s;
 373       int prop = lbrkprop_lookup (uc);
 374
 375       if (prop == LBP_BK)
 376         {
 377           /* Mandatory break.  */
 378           *p = UC_BREAK_MANDATORY;
 379           last_prop = LBP_BK;
 380           seen_space = NULL;
 381           seen_space2 = NULL;
 382         }
 383       else
 384         {
 385           char *q;
 386
 387           /* Resolve property values whose behaviour is not fixed.  */
 388           switch (prop)
 389             {
 390               case LBP_AI:
 391                 /* Resolve ambiguous.  */
 392                 prop = LBP_AI_REPLACEMENT;
 393                 break;
 394               case LBP_CB:
 395                 /* This is arbitrary.  */
 396                 prop = LBP_ID;
 397                 break;
 398               case LBP_SA:
 399                 /* We don't handle complex scripts yet.
 400                    Treat LBP_SA like LBP_XX.  */
 401               case LBP_XX:
 402                 /* This is arbitrary.  */
 403                 prop = LBP_AL;
 404                 break;
 405             }
 406
 407           /* Deal with combining characters.  */
 408           q = p;
 409           if (prop == LBP_CM)
 410             {
 411               /* Don't break just before a combining character.  */
 412               *p = UC_BREAK_PROHIBITED;
 413               /* A combining character turns a preceding space into LBP_AL.  */
 414               if (seen_space != NULL)
 415                 {
 416                   q = seen_space;
 417                   seen_space = seen_space2;
 418                   prop = LBP_AL;
 419                   goto lookup_via_table;
 420                 }
 421             }
 422           else if (prop == LBP_SP)
 423             {
 424               /* Don't break just before a space.  */
 425               *p = UC_BREAK_PROHIBITED;
 426               seen_space2 = seen_space;
 427               seen_space = p;
 428             }
 429           else
 430             {
 431              lookup_via_table:
 432               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 433               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 434                 abort ();
 435
 436               if (last_prop == LBP_BK)
 437                 {
 438                   /* Don't break at the beginning of a line.  */
 439                   *q = UC_BREAK_PROHIBITED;
 440                 }
 441               else
 442                 {
 443                   switch (lbrk_table [last_prop-1] [prop-1])
 444                     {
 445                       case D:
 446                         *q = UC_BREAK_POSSIBLE;
 447                         break;
 448                       case I:
 449                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 450                         break;
 451                       case P:
 452                         *q = UC_BREAK_PROHIBITED;
 453                         break;
 454                       default:
 455                         abort ();
 456                     }
 457                 }
 458               last_prop = prop;
 459               seen_space = NULL;
 460               seen_space2 = NULL;
 461             }
 462         }
 463
 464       s++;
 465       p++;
 466     }
 467 }
 468
 469
 470 /* Choose the best line breaks, assuming the uc_width function.
 471    Return the column after the end of the string.  */
 472
 473 int
 474 u8_width_linebreaks (const unsigned char *s, size_t n,
 475                      int width, int start_column, int at_end_columns,
 476                      const char *o, const char *encoding,
 477                      char *p)
 478 {
 479   const unsigned char *s_end;
 480   char *last_p;
 481   int last_column;
 482   int piece_width;
 483
 484   u8_possible_linebreaks (s, n, encoding, p);
 485
 486   s_end = s + n;
 487   last_p = NULL;
 488   last_column = start_column;
 489   piece_width = 0;
 490   while (s < s_end)
 491     {
 492       unsigned int uc;
 493       int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
 494
 495       /* Respect the override.  */
 496       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 497         *p = *o;
 498
 499       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 500         {
 501           /* An atomic piece of text ends here.  */
 502           if (last_p != NULL && last_column + piece_width > width)
 503             {
 504               /* Insert a line break.  */
 505               *last_p = UC_BREAK_POSSIBLE;
 506               last_column = 0;
 507             }
 508         }
 509
 510       if (*p == UC_BREAK_MANDATORY)
 511         {
 512           /* uc is a line break character.  */
 513           /* Start a new piece at column 0.  */
 514           last_p = NULL;
 515           last_column = 0;
 516           piece_width = 0;
 517         }
 518       else
 519         {
 520           /* uc is not a line break character.  */
 521           int w;
 522
 523           if (*p == UC_BREAK_POSSIBLE)
 524             {
 525               /* Start a new piece.  */
 526               last_p = p;
 527               last_column += piece_width;
 528               piece_width = 0;
 529               /* No line break for the moment, may be turned into
 530                  UC_BREAK_POSSIBLE later, via last_p. */
 531             }
 532
 533           *p = UC_BREAK_PROHIBITED;
 534
 535           w = uc_width (uc, encoding);
 536           if (w >= 0) /* ignore control characters in the string */
 537             piece_width += w;
 538          }
 539
 540       s += count;
 541       p += count;
 542       if (o != NULL)
 543         o += count;
 544     }
 545
 546   /* The last atomic piece of text ends here.  */
 547   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 548     {
 549       /* Insert a line break.  */
 550       *last_p = UC_BREAK_POSSIBLE;
 551       last_column = 0;
 552     }
 553
 554   return last_column + piece_width;
 555 }
 556
 557 int
 558 u16_width_linebreaks (const unsigned short *s, size_t n,
 559                       int width, int start_column, int at_end_columns,
 560                       const char *o, const char *encoding,
 561                       char *p)
 562 {
 563   const unsigned short *s_end;
 564   char *last_p;
 565   int last_column;
 566   int piece_width;
 567
 568   u16_possible_linebreaks (s, n, encoding, p);
 569
 570   s_end = s + n;
 571   last_p = NULL;
 572   last_column = start_column;
 573   piece_width = 0;
 574   while (s < s_end)
 575     {
 576       unsigned int uc;
 577       int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
 578
 579       /* Respect the override.  */
 580       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 581         *p = *o;
 582
 583       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 584         {
 585           /* An atomic piece of text ends here.  */
 586           if (last_p != NULL && last_column + piece_width > width)
 587             {
 588               /* Insert a line break.  */
 589               *last_p = UC_BREAK_POSSIBLE;
 590               last_column = 0;
 591             }
 592         }
 593
 594       if (*p == UC_BREAK_MANDATORY)
 595         {
 596           /* uc is a line break character.  */
 597           /* Start a new piece at column 0.  */
 598           last_p = NULL;
 599           last_column = 0;
 600           piece_width = 0;
 601         }
 602       else
 603         {
 604           /* uc is not a line break character.  */
 605           int w;
 606
 607           if (*p == UC_BREAK_POSSIBLE)
 608             {
 609               /* Start a new piece.  */
 610               last_p = p;
 611               last_column += piece_width;
 612               piece_width = 0;
 613               /* No line break for the moment, may be turned into
 614                  UC_BREAK_POSSIBLE later, via last_p. */
 615             }
 616
 617           *p = UC_BREAK_PROHIBITED;
 618
 619           w = uc_width (uc, encoding);
 620           if (w >= 0) /* ignore control characters in the string */
 621             piece_width += w;
 622          }
 623
 624       s += count;
 625       p += count;
 626       if (o != NULL)
 627         o += count;
 628     }
 629
 630   /* The last atomic piece of text ends here.  */
 631   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 632     {
 633       /* Insert a line break.  */
 634       *last_p = UC_BREAK_POSSIBLE;
 635       last_column = 0;
 636     }
 637
 638   return last_column + piece_width;
 639 }
 640
 641 int
 642 u32_width_linebreaks (const unsigned int *s, size_t n,
 643                       int width, int start_column, int at_end_columns,
 644                       const char *o, const char *encoding,
 645                       char *p)
 646 {
 647   const unsigned int *s_end;
 648   char *last_p;
 649   int last_column;
 650   int piece_width;
 651
 652   u32_possible_linebreaks (s, n, encoding, p);
 653
 654   s_end = s + n;
 655   last_p = NULL;
 656   last_column = start_column;
 657   piece_width = 0;
 658   while (s < s_end)
 659     {
 660       unsigned int uc = *s;
 661
 662       /* Respect the override.  */
 663       if (o != NULL && *o != UC_BREAK_UNDEFINED)
 664         *p = *o;
 665
 666       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
 667         {
 668           /* An atomic piece of text ends here.  */
 669           if (last_p != NULL && last_column + piece_width > width)
 670             {
 671               /* Insert a line break.  */
 672               *last_p = UC_BREAK_POSSIBLE;
 673               last_column = 0;
 674             }
 675         }
 676
 677       if (*p == UC_BREAK_MANDATORY)
 678         {
 679           /* uc is a line break character.  */
 680           /* Start a new piece at column 0.  */
 681           last_p = NULL;
 682           last_column = 0;
 683           piece_width = 0;
 684         }
 685       else
 686         {
 687           /* uc is not a line break character.  */
 688           int w;
 689
 690           if (*p == UC_BREAK_POSSIBLE)
 691             {
 692               /* Start a new piece.  */
 693               last_p = p;
 694               last_column += piece_width;
 695               piece_width = 0;
 696               /* No line break for the moment, may be turned into
 697                  UC_BREAK_POSSIBLE later, via last_p. */
 698             }
 699
 700           *p = UC_BREAK_PROHIBITED;
 701
 702           w = uc_width (uc, encoding);
 703           if (w >= 0) /* ignore control characters in the string */
 704             piece_width += w;
 705          }
 706
 707       s++;
 708       p++;
 709       if (o != NULL)
 710         o++;
 711     }
 712
 713   /* The last atomic piece of text ends here.  */
 714   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
 715     {
 716       /* Insert a line break.  */
 717       *last_p = UC_BREAK_POSSIBLE;
 718       last_column = 0;
 719     }
 720
 721   return last_column + piece_width;
 722 }
 723
 724
 725 #ifdef TEST1
 726
 727 #include <stdio.h>
 728
 729 /* Read the contents of an input stream, and return it, terminated with a NUL
 730    byte. */
 731 char *
 732 read_file (FILE *stream)
 733 {
 734 #define BUFSIZE 4096
 735   char *buf = NULL;
 736   int alloc = 0;
 737   int size = 0;
 738   int count;
 739
 740   while (! feof (stream))
 741     {
 742       if (size + BUFSIZE > alloc)
 743         {
 744           alloc = alloc + alloc / 2;
 745           if (alloc < size + BUFSIZE)
 746             alloc = size + BUFSIZE;
 747           buf = realloc (buf, alloc);
 748           if (buf == NULL)
 749             {
 750               fprintf (stderr, "out of memory\n");
 751               exit (1);
 752             }
 753         }
 754       count = fread (buf + size, 1, BUFSIZE, stream);
 755       if (count == 0)
 756         {
 757           if (ferror (stream))
 758             {
 759               perror ("fread");
 760               exit (1);
 761             }
 762         }
 763       else
 764         size += count;
 765     }
 766   buf = realloc (buf, size + 1);
 767   if (buf == NULL)
 768     {
 769       fprintf (stderr, "out of memory\n");
 770       exit (1);
 771     }
 772   buf[size] = '\0';
 773   return buf;
 774 #undef BUFSIZE
 775 }
 776
 777 int
 778 main (int argc, char * argv[])
 779 {
 780   if (argc == 1)
 781     {
 782       /* Display all the break opportunities in the input string.  */
 783       char *input = read_file (stdin);
 784       int length = strlen (input);
 785       char *breaks = malloc (length);
 786       int i;
 787
 788       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
 789
 790       for (i = 0; i < length; i++)
 791         {
 792           switch (breaks[i])
 793             {
 794               case UC_BREAK_POSSIBLE:
 795                 /* U+2027 in UTF-8 encoding */
 796                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
 797                 break;
 798               case UC_BREAK_MANDATORY:
 799                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
 800                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
 801                 break;
 802               case UC_BREAK_PROHIBITED:
 803                 break;
 804               default:
 805                 abort ();
 806             }
 807           putc (input[i], stdout);
 808         }
 809
 810       free (breaks);
 811
 812       return 0;
 813     }
 814   else if (argc == 2)
 815     {
 816       /* Insert line breaks for a given width.  */
 817       int width = atoi (argv[1]);
 818       char *input = read_file (stdin);
 819       int length = strlen (input);
 820       char *breaks = malloc (length);
 821       int i;
 822
 823       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
 824
 825       for (i = 0; i < length; i++)
 826         {
 827           switch (breaks[i])
 828             {
 829               case UC_BREAK_POSSIBLE:
 830                 putc ('\n', stdout);
 831                 break;
 832               case UC_BREAK_MANDATORY:
 833                 break;
 834               case UC_BREAK_PROHIBITED:
 835                 break;
 836               default:
 837                 abort ();
 838             }
 839           putc (input[i], stdout);
 840         }
 841
 842       free (breaks);
 843
 844       return 0;
 845     }
 846   else
 847     return 1;
 848 }
 849
 850 #endif /* TEST1 */
 851
 852
 853 /* Now the same thing with an arbitrary encoding.
 854
 855    We convert the input string to Unicode.
 856
 857    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
 858    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
 859    \U0000FFFF.  UTF-16 and variants support only characters up to
 860    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
 861    UCS-4 specification leaves doubts about endianness and byte order mark.
 862    glibc currently interprets it as big endian without byte order mark,
 863    but this is not backed by an RFC.  So we use UTF-8. It supports
 864    characters up to \U7FFFFFFF and is unambiguously defined.  */
 865
 866 #if HAVE_ICONV
 867
 868 #include <iconv.h>
 869 #include <errno.h>
 870
 871 /* Luckily, the encoding's name is platform independent.  */
 872 #define UTF8_NAME "UTF-8"
 873
 874 /* Return the length of a string after conversion through an iconv_t.  */
 875 static size_t
 876 iconv_string_length (iconv_t cd, const char *s, size_t n)
 877 {
 878 #define TMPBUFSIZE 4096
 879   size_t count = 0;
 880   char tmpbuf[TMPBUFSIZE];
 881   const char *inptr = s;
 882   size_t insize = n;
 883   while (insize > 0)
 884     {
 885       char *outptr = tmpbuf;
 886       size_t outsize = TMPBUFSIZE;
 887       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
 888       if (res == (size_t)(-1) && errno != E2BIG)
 889         return (size_t)(-1);
 890       count += outptr - tmpbuf;
 891     }
 892   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
 893 #if defined _LIBICONV_VERSION \
 894     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 895   {
 896     char *outptr = tmpbuf;
 897     size_t outsize = TMPBUFSIZE;
 898     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
 899     if (res == (size_t)(-1))
 900       return (size_t)(-1);
 901     count += outptr - tmpbuf;
 902   }
 903   /* Return to the initial state.  */
 904   iconv (cd, NULL, NULL, NULL, NULL);
 905 #endif
 906   return count;
 907 #undef TMPBUFSIZE
 908 }
 909
 910 static void
 911 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
 912                               size_t *offtable, char *t, size_t m)
 913 {
 914   size_t i;
 915   const char *s_end;
 916   const char *inptr;
 917   char *outptr;
 918   size_t outsize;
 919   /* Avoid glibc-2.1 bug.  */
 920 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
 921   const size_t extra = 1;
 922 #else
 923   const size_t extra = 0;
 924 #endif
 925
 926   for (i = 0; i < n; i++)
 927     offtable[i] = (size_t)(-1);
 928
 929   s_end = s + n;
 930   inptr = s;
 931   outptr = t;
 932   outsize = m + extra;
 933   while (inptr < s_end)
 934     {
 935       const char *saved_inptr;
 936       size_t insize;
 937       size_t res;
 938
 939       offtable[inptr - s] = outptr - t;
 940
 941       saved_inptr = inptr;
 942       res = (size_t)(-1);
 943       for (insize = 1; inptr + insize <= s_end; insize++)
 944         {
 945           res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
 946           if (!(res == (size_t)(-1) && errno == EINVAL))
 947             break;
 948           /* We expect that no input bytes have been consumed so far.  */
 949           if (inptr != saved_inptr)
 950             abort ();
 951         }
 952       /* After we verified the convertibility and computed the translation's
 953          size m, there shouldn't be any conversion error here. */
 954       if (res == (size_t)(-1))
 955         abort ();
 956     }
 957   /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
 958 #if defined _LIBICONV_VERSION \
 959     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 960   if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
 961     abort ();
 962 #endif
 963   /* We should have produced exactly m output bytes.  */
 964   if (outsize != extra)
 965     abort ();
 966 }
 967
 968 #endif /* HAVE_ICONV */
 969
 970 #if C_CTYPE_ASCII
 971
 972 /* Tests whether a string is entirely ASCII.  Returns 1 if yes.
 973    Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
 974 static int
 975 is_all_ascii (const char *s, size_t n)
 976 {
 977   for (; n > 0; s++, n--)
 978     {
 979       unsigned char c = (unsigned char) *s;
 980
 981       if (!(c_isprint (c) || c_isspace (c)))
 982         return 0;
 983     }
 984   return 1;
 985 }
 986
 987 #endif /* C_CTYPE_ASCII */
 988
 989 void
 990 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
 991                          char *p)
 992 {
 993   if (n == 0)
 994     return;
 995   if (is_utf8_encoding (encoding))
 996     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
 997   else
 998     {
 999 #if HAVE_ICONV
1000       iconv_t to_utf8;
1001       /* Avoid glibc-2.1 bug with EUC-KR.  */
1002 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1003       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1004         to_utf8 = (iconv_t)(-1);
1005       else
1006 # endif
1007       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1008          GB18030.  */
1009 # if defined __sun && !defined _LIBICONV_VERSION
1010       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1011           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1012           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1013           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1014           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1015           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1016         to_utf8 = (iconv_t)(-1);
1017       else
1018 # endif
1019       to_utf8 = iconv_open (UTF8_NAME, encoding);
1020       if (to_utf8 != (iconv_t)(-1))
1021         {
1022           /* Determine the length of the resulting UTF-8 string.  */
1023           size_t m = iconv_string_length (to_utf8, s, n);
1024           if (m != (size_t)(-1))
1025             {
1026               /* Convert the string to UTF-8 and build a translation table
1027                  from offsets into s to offsets into the translated string.  */
1028               size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1029               char *memory =
1030                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1031               if (memory != NULL)
1032                 {
1033                   size_t *offtable = (size_t *) memory;
1034                   char *t = (char *) (offtable + n);
1035                   char *q = (char *) (t + m);
1036                   size_t i;
1037
1038                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1039
1040                   /* Determine the possible line breaks of the UTF-8 string.  */
1041                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1042
1043                   /* Translate the result back to the original string.  */
1044                   memset (p, UC_BREAK_PROHIBITED, n);
1045                   for (i = 0; i < n; i++)
1046                     if (offtable[i] != (size_t)(-1))
1047                       p[i] = q[offtable[i]];
1048
1049                   free (memory);
1050                   iconv_close (to_utf8);
1051                   return;
1052                 }
1053             }
1054           iconv_close (to_utf8);
1055         }
1056 #endif
1057       /* Impossible to convert.  */
1058 #if C_CTYPE_ASCII
1059       if (is_all_ascii (s, n))
1060         {
1061           /* ASCII is a subset of UTF-8.  */
1062           u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1063           return;
1064         }
1065 #endif
1066       /* We have a non-ASCII string and cannot convert it.
1067          Don't produce line breaks except those already present in the
1068          input string.  All we assume here is that the encoding is
1069          minimally ASCII compatible.  */
1070       {
1071         const char *s_end = s + n;
1072         while (s < s_end)
1073           {
1074             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1075             s++;
1076             p++;
1077           }
1078       }
1079     }
1080 }
1081
1082 int
1083 mbs_width_linebreaks (const char *s, size_t n,
1084                       int width, int start_column, int at_end_columns,
1085                       const char *o, const char *encoding,
1086                       char *p)
1087 {
1088   if (n == 0)
1089     return start_column;
1090   if (is_utf8_encoding (encoding))
1091     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1092   else
1093     {
1094 #if HAVE_ICONV
1095       iconv_t to_utf8;
1096       /* Avoid glibc-2.1 bug with EUC-KR.  */
1097 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1098       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1099         to_utf8 = (iconv_t)(-1);
1100       else
1101 # endif
1102       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1103          GB18030.  */
1104 # if defined __sun && !defined _LIBICONV_VERSION
1105       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1106           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1107           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1108           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1109           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1110           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1111         to_utf8 = (iconv_t)(-1);
1112       else
1113 # endif
1114       to_utf8 = iconv_open (UTF8_NAME, encoding);
1115       if (to_utf8 != (iconv_t)(-1))
1116         {
1117           /* Determine the length of the resulting UTF-8 string.  */
1118           size_t m = iconv_string_length (to_utf8, s, n);
1119           if (m != (size_t)(-1))
1120             {
1121               /* Convert the string to UTF-8 and build a translation table
1122                  from offsets into s to offsets into the translated string.  */
1123               size_t memory_size =
1124                 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1125                        (o != NULL ? m : 0));
1126               char *memory =
1127                 (char *)
1128                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1129               if (memory != NULL)
1130                 {
1131                   size_t *offtable = (size_t *) memory;
1132                   char *t = (char *) (offtable + n);
1133                   char *q = (char *) (t + m);
1134                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1135                   int res_column;
1136                   size_t i;
1137
1138                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1139
1140                   /* Translate the overrides to the UTF-8 string.  */
1141                   if (o != NULL)
1142                     {
1143                       memset (o8, UC_BREAK_UNDEFINED, m);
1144                       for (i = 0; i < n; i++)
1145                         if (offtable[i] != (size_t)(-1))
1146                           o8[offtable[i]] = o[i];
1147                     }
1148
1149                   /* Determine the line breaks of the UTF-8 string.  */
1150                   res_column =
1151                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1152
1153                   /* Translate the result back to the original string.  */
1154                   memset (p, UC_BREAK_PROHIBITED, n);
1155                   for (i = 0; i < n; i++)
1156                     if (offtable[i] != (size_t)(-1))
1157                       p[i] = q[offtable[i]];
1158
1159                   free (memory);
1160                   iconv_close (to_utf8);
1161                   return res_column;
1162                 }
1163             }
1164           iconv_close (to_utf8);
1165         }
1166 #endif
1167       /* Impossible to convert.  */
1168 #if C_CTYPE_ASCII
1169       if (is_all_ascii (s, n))
1170         {
1171           /* ASCII is a subset of UTF-8.  */
1172           return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1173         }
1174 #endif
1175       /* We have a non-ASCII string and cannot convert it.
1176          Don't produce line breaks except those already present in the
1177          input string.  All we assume here is that the encoding is
1178          minimally ASCII compatible.  */
1179       {
1180         const char *s_end = s + n;
1181         while (s < s_end)
1182           {
1183             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1184                   ? UC_BREAK_MANDATORY
1185                   : UC_BREAK_PROHIBITED);
1186             s++;
1187             p++;
1188             if (o != NULL)
1189               o++;
1190           }
1191         /* We cannot compute widths in this case.  */
1192         return start_column;
1193       }
1194     }
1195 }
1196
1197
1198 #ifdef TEST2
1199
1200 #include <stdio.h>
1201 #include <locale.h>
1202
/* Read the contents of an input stream up to end of file, and return it in
   a freshly allocated buffer, terminated with a NUL byte.  The caller owns
   the buffer and releases it with free ().
   On a read error or when memory runs out, prints a message to stderr and
   exits with status 1.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK = 4096 };
  char *buf = NULL;
  char *resized;
  size_t alloc = 0;   /* bytes allocated for buf */
  size_t size = 0;    /* bytes of buf filled so far; always <= alloc */

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits behind the data read so far.  */
      if (alloc - size < CHUNK)
        {
          /* Grow by half, but at least enough for one more chunk.  */
          size_t wanted = alloc + alloc / 2;

          if (wanted < alloc || wanted - size < CHUNK)
            wanted = size + CHUNK;
          /* A wrapped-around size is treated like an allocation failure.  */
          resized = (wanted >= size ? realloc (buf, wanted) : NULL);
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = wanted;
        }
      count = fread (buf + size, 1, CHUNK, stream);
      /* Check the error indicator after every read, so that an error
         following a partial read is not missed.  */
      if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
      size += count;
    }
  /* Trim (or, for empty input, create) the buffer so that it holds exactly
     the data plus the terminating NUL.  */
  resized = (size + 1 > size ? realloc (buf, size + 1) : NULL);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = resized;
  buf[size] = '\0';
  return buf;
}
1250
1251 int
1252 main (int argc, char * argv[])
1253 {
1254   setlocale (LC_CTYPE, "");
1255   if (argc == 1)
1256     {
1257       /* Display all the break opportunities in the input string.  */
1258       char *input = read_file (stdin);
1259       int length = strlen (input);
1260       char *breaks = malloc (length);
1261       int i;
1262
1263       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1264
1265       for (i = 0; i < length; i++)
1266         {
1267           switch (breaks[i])
1268             {
1269               case UC_BREAK_POSSIBLE:
1270                 putc ('|', stdout);
1271                 break;
1272               case UC_BREAK_MANDATORY:
1273                 break;
1274               case UC_BREAK_PROHIBITED:
1275                 break;
1276               default:
1277                 abort ();
1278             }
1279           putc (input[i], stdout);
1280         }
1281
1282       free (breaks);
1283
1284       return 0;
1285     }
1286   else if (argc == 2)
1287     {
1288       /* Insert line breaks for a given width.  */
1289       int width = atoi (argv[1]);
1290       char *input = read_file (stdin);
1291       int length = strlen (input);
1292       char *breaks = malloc (length);
1293       int i;
1294
1295       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1296
1297       for (i = 0; i < length; i++)
1298         {
1299           switch (breaks[i])
1300             {
1301               case UC_BREAK_POSSIBLE:
1302                 putc ('\n', stdout);
1303                 break;
1304               case UC_BREAK_MANDATORY:
1305                 break;
1306               case UC_BREAK_PROHIBITED:
1307                 break;
1308               default:
1309                 abort ();
1310             }
1311           putc (input[i], stdout);
1312         }
1313
1314       free (breaks);
1315
1316       return 0;
1317     }
1318   else
1319     return 1;
1320 }
1321
1322 #endif /* TEST2 */