lib/unilbrk/ulc-width-linebreaks.c

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "unilbrk.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25
  26 #include "c-ctype.h"
  27 #include "uniconv.h"
  28 #include "unilbrk/ulc-common.h"
  29
  30 /* Line breaking of a string in an arbitrary encoding.
  31
  32    We convert the input string to Unicode.
  33
  34    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  35    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  36    \U0000FFFF.  UTF-16 and variants support only characters up to
  37    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  38    UCS-4 specification leaves doubts about endianness and byte order mark.
  39    glibc currently interprets it as big endian without byte order mark,
  40    but this is not backed by an RFC.  So we use UTF-8. It supports
  41    characters up to \U7FFFFFFF and is unambiguously defined.  */
  42
  43 int
  44 ulc_width_linebreaks (const char *s, size_t n,
  45                       int width, int start_column, int at_end_columns,
  46                       const char *o, const char *encoding,
  47                       char *p)
  48 {
  49   if (n > 0)
  50     {
  51       if (is_utf8_encoding (encoding))
  52         return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
  53       else
  54         {
  55           /* Convert the string to UTF-8 and build a translation table
  56              from offsets into s to offsets into the translated string.  */
  57           size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
  58
  59           if (offsets != NULL)
  60             {
  61               uint8_t *t = NULL;
  62               size_t m;
  63               if (u8_conv_from_encoding (encoding, iconveh_question_mark,
  64                                          s, n, offsets, &t, &m)
  65                   == 0)
  66                 {
  67                   char *memory =
  68                     (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
  69
  70                   if (m == 0 || memory != NULL)
  71                     {
  72                       char *q = (char *) memory;
  73                       char *o8 = (o != NULL ? (char *) (q + m) : NULL);
  74                       int res_column;
  75                       size_t i;
  76
  77                       /* Translate the overrides to the UTF-8 string.  */
  78                       if (o != NULL)
  79                         {
  80                           memset (o8, UC_BREAK_UNDEFINED, m);
  81                           for (i = 0; i < n; i++)
  82                             if (offsets[i] != (size_t)(-1))
  83                               o8[offsets[i]] = o[i];
  84                         }
  85
  86                       /* Determine the line breaks of the UTF-8 string.  */
  87                       res_column =
  88                         u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
  89
  90                       /* Translate the result back to the original string.  */
  91                       memset (p, UC_BREAK_PROHIBITED, n);
  92                       for (i = 0; i < n; i++)
  93                         if (offsets[i] != (size_t)(-1))
  94                           p[i] = q[offsets[i]];
  95
  96                       free (memory);
  97                       free (t);
  98                       free (offsets);
  99                       return res_column;
 100                     }
 101                   free (t);
 102                 }
 103               free (offsets);
 104             }
 105           /* Impossible to convert.  */
 106 #if C_CTYPE_ASCII
 107           if (is_all_ascii (s, n))
 108             {
 109               /* ASCII is a subset of UTF-8.  */
 110               return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
 111             }
 112 #endif
 113           /* We have a non-ASCII string and cannot convert it.
 114              Don't produce line breaks except those already present in the
 115              input string.  All we assume here is that the encoding is
 116              minimally ASCII compatible.  */
 117           {
 118             const char *s_end = s + n;
 119             while (s < s_end)
 120               {
 121                 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
 122                       ? UC_BREAK_MANDATORY
 123                       : UC_BREAK_PROHIBITED);
 124                 s++;
 125                 p++;
 126                 if (o != NULL)
 127                   o++;
 128               }
 129             /* We cannot compute widths in this case.  */
 130           }
 131         }
 132     }
 133   return start_column;
 134 }
 135
 136
 137 #ifdef TEST
 138
 139 #include <stdio.h>
 140 #include <locale.h>
 141
 142 /* Read the contents of an input stream, and return it, terminated with a NUL
 143    byte. */
 144 char *
 145 read_file (FILE *stream)
 146 {
 147 #define BUFSIZE 4096
 148   char *buf = NULL;
 149   int alloc = 0;
 150   int size = 0;
 151   int count;
 152
 153   while (! feof (stream))
 154     {
 155       if (size + BUFSIZE > alloc)
 156         {
 157           alloc = alloc + alloc / 2;
 158           if (alloc < size + BUFSIZE)
 159             alloc = size + BUFSIZE;
 160           buf = realloc (buf, alloc);
 161           if (buf == NULL)
 162             {
 163               fprintf (stderr, "out of memory\n");
 164               exit (1);
 165             }
 166         }
 167       count = fread (buf + size, 1, BUFSIZE, stream);
 168       if (count == 0)
 169         {
 170           if (ferror (stream))
 171             {
 172               perror ("fread");
 173               exit (1);
 174             }
 175         }
 176       else
 177         size += count;
 178     }
 179   buf = realloc (buf, size + 1);
 180   if (buf == NULL)
 181     {
 182       fprintf (stderr, "out of memory\n");
 183       exit (1);
 184     }
 185   buf[size] = '\0';
 186   return buf;
 187 #undef BUFSIZE
 188 }
 189
 190 int
 191 main (int argc, char * argv[])
 192 {
 193   setlocale (LC_CTYPE, "");
 194   if (argc == 2)
 195     {
 196       /* Insert line breaks for a given width.  */
 197       int width = atoi (argv[1]);
 198       char *input = read_file (stdin);
 199       int length = strlen (input);
 200       char *breaks = malloc (length);
 201       int i;
 202
 203       ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 204
 205       for (i = 0; i < length; i++)
 206         {
 207           switch (breaks[i])
 208             {
 209             case UC_BREAK_POSSIBLE:
 210               putc ('\n', stdout);
 211               break;
 212             case UC_BREAK_MANDATORY:
 213               break;
 214             case UC_BREAK_PROHIBITED:
 215               break;
 216             default:
 217               abort ();
 218             }
 219           putc (input[i], stdout);
 220         }
 221
 222       free (breaks);
 223
 224       return 0;
 225     }
 226   else
 227     return 1;
 228 }
 229
 230 #endif /* TEST */