lib/unilbrk/ulc-width-linebreaks.c

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2008 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "unilbrk.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25
  26 #include "c-ctype.h"
  27 #include "uniconv.h"
  28 #include "unilbrk/ulc-common.h"
  29
  30 /* Line breaking of a string in an arbitrary encoding.
  31
  32    We convert the input string to Unicode.
  33
  34    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  35    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  36    \U0000FFFF.  UTF-16 and variants support only characters up to
  37    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  38    UCS-4 specification leaves doubts about endianness and byte order mark.
  39    glibc currently interprets it as big endian without byte order mark,
  40    but this is not backed by an RFC.  So we use UTF-8. It supports
  41    characters up to \U7FFFFFFF and is unambiguously defined.  */
  42
  43 int
  44 ulc_width_linebreaks (const char *s, size_t n,
  45                       int width, int start_column, int at_end_columns,
  46                       const char *o, const char *encoding,
  47                       char *p)
  48 {
  49   if (n > 0)
  50     {
  51       if (is_utf8_encoding (encoding))
  52         return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
  53       else
  54         {
  55           /* Convert the string to UTF-8 and build a translation table
  56              from offsets into s to offsets into the translated string.  */
  57           size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
  58
  59           if (offsets != NULL)
  60             {
  61               uint8_t *t = NULL;
  62               size_t m;
  63               if (u8_conv_from_encoding (encoding, iconveh_question_mark,
  64                                          s, n, offsets, &t, &m)
  65                   == 0)
  66                 {
  67                   char *memory = (char *) malloc (m + (o != NULL ? m : 0));
  68
  69                   if (memory != NULL)
  70                     {
  71                       char *q = (char *) memory;
  72                       char *o8 = (o != NULL ? (char *) (q + m) : NULL);
  73                       int res_column;
  74                       size_t i;
  75
  76                       /* Translate the overrides to the UTF-8 string.  */
  77                       if (o != NULL)
  78                         {
  79                           memset (o8, UC_BREAK_UNDEFINED, m);
  80                           for (i = 0; i < n; i++)
  81                             if (offsets[i] != (size_t)(-1))
  82                               o8[offsets[i]] = o[i];
  83                         }
  84
  85                       /* Determine the line breaks of the UTF-8 string.  */
  86                       res_column =
  87                         u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
  88
  89                       /* Translate the result back to the original string.  */
  90                       memset (p, UC_BREAK_PROHIBITED, n);
  91                       for (i = 0; i < n; i++)
  92                         if (offsets[i] != (size_t)(-1))
  93                           p[i] = q[offsets[i]];
  94
  95                       free (memory);
  96                       free (t);
  97                       free (offsets);
  98                       return res_column;
  99                     }
 100                   free (t);
 101                 }
 102               free (offsets);
 103             }
 104           /* Impossible to convert.  */
 105 #if C_CTYPE_ASCII
 106           if (is_all_ascii (s, n))
 107             {
 108               /* ASCII is a subset of UTF-8.  */
 109               return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
 110             }
 111 #endif
 112           /* We have a non-ASCII string and cannot convert it.
 113              Don't produce line breaks except those already present in the
 114              input string.  All we assume here is that the encoding is
 115              minimally ASCII compatible.  */
 116           {
 117             const char *s_end = s + n;
 118             while (s < s_end)
 119               {
 120                 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
 121                       ? UC_BREAK_MANDATORY
 122                       : UC_BREAK_PROHIBITED);
 123                 s++;
 124                 p++;
 125                 if (o != NULL)
 126                   o++;
 127               }
 128             /* We cannot compute widths in this case.  */
 129           }
 130         }
 131     }
 132   return start_column;
 133 }
 134
 135
 136 #ifdef TEST
 137
 138 #include <stdio.h>
 139 #include <locale.h>
 140
 141 /* Read the contents of an input stream, and return it, terminated with a NUL
 142    byte. */
 143 char *
 144 read_file (FILE *stream)
 145 {
 146 #define BUFSIZE 4096
 147   char *buf = NULL;
 148   int alloc = 0;
 149   int size = 0;
 150   int count;
 151
 152   while (! feof (stream))
 153     {
 154       if (size + BUFSIZE > alloc)
 155         {
 156           alloc = alloc + alloc / 2;
 157           if (alloc < size + BUFSIZE)
 158             alloc = size + BUFSIZE;
 159           buf = realloc (buf, alloc);
 160           if (buf == NULL)
 161             {
 162               fprintf (stderr, "out of memory\n");
 163               exit (1);
 164             }
 165         }
 166       count = fread (buf + size, 1, BUFSIZE, stream);
 167       if (count == 0)
 168         {
 169           if (ferror (stream))
 170             {
 171               perror ("fread");
 172               exit (1);
 173             }
 174         }
 175       else
 176         size += count;
 177     }
 178   buf = realloc (buf, size + 1);
 179   if (buf == NULL)
 180     {
 181       fprintf (stderr, "out of memory\n");
 182       exit (1);
 183     }
 184   buf[size] = '\0';
 185   return buf;
 186 #undef BUFSIZE
 187 }
 188
 189 int
 190 main (int argc, char * argv[])
 191 {
 192   setlocale (LC_CTYPE, "");
 193   if (argc == 2)
 194     {
 195       /* Insert line breaks for a given width.  */
 196       int width = atoi (argv[1]);
 197       char *input = read_file (stdin);
 198       int length = strlen (input);
 199       char *breaks = malloc (length);
 200       int i;
 201
 202       ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 203
 204       for (i = 0; i < length; i++)
 205         {
 206           switch (breaks[i])
 207             {
 208             case UC_BREAK_POSSIBLE:
 209               putc ('\n', stdout);
 210               break;
 211             case UC_BREAK_MANDATORY:
 212               break;
 213             case UC_BREAK_PROHIBITED:
 214               break;
 215             default:
 216               abort ();
 217             }
 218           putc (input[i], stdout);
 219         }
 220
 221       free (breaks);
 222
 223       return 0;
 224     }
 225   else
 226     return 1;
 227 }
 228
 229 #endif /* TEST */