lib/unilbrk/ulc-width-linebreaks.c

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2014 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "unilbrk.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25
  26 #include "c-ctype.h"
  27 #include "uniconv.h"
  28 #include "unilbrk/ulc-common.h"
  29
  30 /* Line breaking of a string in an arbitrary encoding.
  31
  32    We convert the input string to Unicode.
  33
  34    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  35    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  36    \U0000FFFF.  UTF-16 and variants support only characters up to
  37    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  38    UCS-4 specification leaves doubts about endianness and byte order mark.
  39    glibc currently interprets it as big endian without byte order mark,
  40    but this is not backed by an RFC.  So we use UTF-8. It supports
  41    characters up to \U7FFFFFFF and is unambiguously defined.  */
  42
  43 int
  44 ulc_width_linebreaks (const char *s, size_t n,
  45                       int width, int start_column, int at_end_columns,
  46                       const char *o, const char *encoding,
  47                       char *p)
  48 {
  49   if (n > 0)
  50     {
  51       if (is_utf8_encoding (encoding))
  52         return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
  53       else
  54         {
  55           /* Convert the string to UTF-8 and build a translation table
  56              from offsets into s to offsets into the translated string.  */
  57           size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
  58
  59           if (offsets != NULL)
  60             {
  61               uint8_t *t;
  62               size_t m;
  63
  64               t = u8_conv_from_encoding (encoding, iconveh_question_mark,
  65                                          s, n, offsets, NULL, &m);
  66               if (t != NULL)
  67                 {
  68                   char *memory =
  69                     (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
  70
  71                   if (m == 0 || memory != NULL)
  72                     {
  73                       char *q = (char *) memory;
  74                       char *o8 = (o != NULL ? (char *) (q + m) : NULL);
  75                       int res_column;
  76                       size_t i;
  77
  78                       /* Translate the overrides to the UTF-8 string.  */
  79                       if (o != NULL)
  80                         {
  81                           memset (o8, UC_BREAK_UNDEFINED, m);
  82                           for (i = 0; i < n; i++)
  83                             if (offsets[i] != (size_t)(-1))
  84                               o8[offsets[i]] = o[i];
  85                         }
  86
  87                       /* Determine the line breaks of the UTF-8 string.  */
  88                       res_column =
  89                         u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
  90
  91                       /* Translate the result back to the original string.  */
  92                       memset (p, UC_BREAK_PROHIBITED, n);
  93                       for (i = 0; i < n; i++)
  94                         if (offsets[i] != (size_t)(-1))
  95                           p[i] = q[offsets[i]];
  96
  97                       free (memory);
  98                       free (t);
  99                       free (offsets);
 100                       return res_column;
 101                     }
 102                   free (t);
 103                 }
 104               free (offsets);
 105             }
 106           /* Impossible to convert.  */
 107 #if C_CTYPE_ASCII
 108           if (is_all_ascii (s, n))
 109             {
 110               /* ASCII is a subset of UTF-8.  */
 111               return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
 112             }
 113 #endif
 114           /* We have a non-ASCII string and cannot convert it.
 115              Don't produce line breaks except those already present in the
 116              input string.  All we assume here is that the encoding is
 117              minimally ASCII compatible.  */
 118           {
 119             const char *s_end = s + n;
 120             while (s < s_end)
 121               {
 122                 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
 123                       ? UC_BREAK_MANDATORY
 124                       : UC_BREAK_PROHIBITED);
 125                 s++;
 126                 p++;
 127                 if (o != NULL)
 128                   o++;
 129               }
 130             /* We cannot compute widths in this case.  */
 131           }
 132         }
 133     }
 134   return start_column;
 135 }
 136
 137
 138 #ifdef TEST
 139
 140 #include <stdio.h>
 141 #include <locale.h>
 142
 143 /* Read the contents of an input stream, and return it, terminated with a NUL
 144    byte. */
 145 char *
 146 read_file (FILE *stream)
 147 {
 148 #define BUFSIZE 4096
 149   char *buf = NULL;
 150   int alloc = 0;
 151   int size = 0;
 152   int count;
 153
 154   while (! feof (stream))
 155     {
 156       if (size + BUFSIZE > alloc)
 157         {
 158           alloc = alloc + alloc / 2;
 159           if (alloc < size + BUFSIZE)
 160             alloc = size + BUFSIZE;
 161           buf = realloc (buf, alloc);
 162           if (buf == NULL)
 163             {
 164               fprintf (stderr, "out of memory\n");
 165               exit (1);
 166             }
 167         }
 168       count = fread (buf + size, 1, BUFSIZE, stream);
 169       if (count == 0)
 170         {
 171           if (ferror (stream))
 172             {
 173               perror ("fread");
 174               exit (1);
 175             }
 176         }
 177       else
 178         size += count;
 179     }
 180   buf = realloc (buf, size + 1);
 181   if (buf == NULL)
 182     {
 183       fprintf (stderr, "out of memory\n");
 184       exit (1);
 185     }
 186   buf[size] = '\0';
 187   return buf;
 188 #undef BUFSIZE
 189 }
 190
 191 int
 192 main (int argc, char * argv[])
 193 {
 194   setlocale (LC_CTYPE, "");
 195   if (argc == 2)
 196     {
 197       /* Insert line breaks for a given width.  */
 198       int width = atoi (argv[1]);
 199       char *input = read_file (stdin);
 200       int length = strlen (input);
 201       char *breaks = malloc (length);
 202       int i;
 203
 204       ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 205
 206       for (i = 0; i < length; i++)
 207         {
 208           switch (breaks[i])
 209             {
 210             case UC_BREAK_POSSIBLE:
 211               putc ('\n', stdout);
 212               break;
 213             case UC_BREAK_MANDATORY:
 214               break;
 215             case UC_BREAK_PROHIBITED:
 216               break;
 217             default:
 218               abort ();
 219             }
 220           putc (input[i], stdout);
 221         }
 222
 223       free (breaks);
 224
 225       return 0;
 226     }
 227   else
 228     return 1;
 229 }
 230
 231 #endif /* TEST */