lib/unicodeio.c

   1 /* Unicode character output to streams with locale dependent encoding.
   2
   3    Copyright (C) 2000 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify it
   6    under the terms of the GNU Library General Public License as published
   7    by the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with this program; if not, write to the Free Software
  17    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  18    USA.  */
  19
  20 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 # include <config.h>
  24 #endif
  25
  26 #if HAVE_STDDEF_H
  27 # include <stddef.h>
  28 #endif
  29
  30 #include <stdio.h>
  31 #if HAVE_STRING_H
  32 # include <string.h>
  33 #else
  34 # include <strings.h>
  35 #endif
  36
  37 #include <errno.h>
  38 #ifndef errno
  39 extern int errno;
  40 #endif
  41 #ifndef ENOTSUP
  42 # define ENOTSUP EINVAL
  43 #endif
  44
  45 #if HAVE_LIMITS_H
  46 # include <limits.h>
  47 #endif
  48
  49 /* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
  50    installation; work around this configuration error.  */
  51 #if MB_LEN_MAX < 6
  52 # undef MB_LEN_MAX
  53 # define MB_LEN_MAX 6
  54 #endif
  55
  56 #if HAVE_ICONV
  57 # include <iconv.h>
  58 #endif
  59
  60 #include <error.h>
  61
  62 #if ENABLE_NLS
  63 # include <libintl.h>
  64 # define _(Text) gettext (Text)
  65 #else
  66 # define _(Text) Text
  67 #endif
  68
  69 #include "unicodeio.h"
  70
  71 #if __STDC_ISO_10646__ && HAVE_WCTOMB
  72
  73 /* Values of type wchar_t are Unicode code points.  */
  74
  75 /* Place into BUF the locale-dependent representation of the character
  76    CODE.  Return the size of the result.  If there is a conversion
  77    error, return -1, setting errno appropriately.  Assumes that the
  78    locale doesn't change between two calls.  */
  79 static size_t
  80 convert_unicode_char (char buf[MB_LEN_MAX], unsigned int code)
  81 {
  82   wchar_t wc = code;
  83   errno = 0;
  84   /* Test for truncation before invoking wctomb.  */
  85   return wc == code ? wctomb (buf, wc) : -1;
  86 }
  87
  88 #else
  89
  90 /* When we pass a Unicode character to iconv(), we must pass it in a
  91    suitable encoding. The standardized Unicode encodings are
  92    UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
  93    UCS-2 supports only characters up to \U0000FFFF.
  94    UTF-16 and variants support only characters up to \U0010FFFF.
  95    UTF-7 is way too complex and not supported by glibc-2.1.
  96    UCS-4 specification leaves doubts about endianness and byte order
  97    mark. glibc currently interprets it as big endian without byte order
  98    mark, but this is not backed by an RFC.
  99    So we use UTF-8. It supports characters up to \U7FFFFFFF and is
 100    unambiguously defined.  */
 101
 102 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
 103    Returns the number of bytes stored, or -1 if wc is out of range.  */
 104 static int
 105 utf8_wctomb (unsigned char *r, unsigned int wc)
 106 {
 107   int count;
 108
 109   if (wc < 0x80)
 110     count = 1;
 111   else if (wc < 0x800)
 112     count = 2;
 113   else if (wc < 0x10000)
 114     count = 3;
 115   else if (wc < 0x200000)
 116     count = 4;
 117   else if (wc < 0x4000000)
 118     count = 5;
 119   else if (wc <= 0x7fffffff)
 120     count = 6;
 121   else
 122     return -1;
 123
 124   switch (count)
 125     {
 126       /* Note: code falls through cases! */
 127       case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
 128       case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
 129       case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
 130       case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
 131       case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
 132       case 1: r[0] = wc;
 133     }
 134
 135   return count;
 136 }
 137
/* Luckily, the encoding's name is platform independent.  This is the
   name that the locale's charset is compared against, and the source
   encoding name handed to iconv_open().  */
# define UTF8_NAME "UTF-8"
 140
 141 /* Place into BUF the locale-dependent representation of the character
 142    CODE.  Return the size of the result.  If there is a conversion
 143    error, return -1, setting errno appropriately.  Assumes that the
 144    locale doesn't change between two calls.  */
 145 static size_t
 146 convert_unicode_char (char buf[MB_LEN_MAX], unsigned int code)
 147 {
 148   static int initialized;
 149   static int is_utf8;
 150 # if HAVE_ICONV
 151   static iconv_t utf8_to_local;
 152 # endif
 153
 154   if (!initialized)
 155     {
 156       extern const char *locale_charset PARAMS ((void));
 157       const char *charset = locale_charset ();
 158
 159       is_utf8 = (charset != NULL && !strcmp (charset, UTF8_NAME));
 160 # if HAVE_ICONV
 161       if (!is_utf8)
 162         {
 163           utf8_to_local = (charset != NULL
 164                            ? iconv_open (charset, UTF8_NAME)
 165                            : (iconv_t) -1);
 166           if (utf8_to_local == (iconv_t) -1)
 167             {
 168               /* For an unknown encoding, assume ASCII.  */
 169               utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
 170               if (utf8_to_local == (iconv_t) -1)
 171                 {
 172                   errno = ENOTSUP;
 173                   return -1;
 174                 }
 175             }
 176         }
 177 # endif
 178       initialized = 1;
 179     }
 180
 181   /* Convert the character to UTF-8.  */
 182   if (is_utf8)
 183     return utf8_wctomb ((unsigned char *) buf, code);
 184   else
 185     {
 186 # if HAVE_ICONV
 187       char inbuf[6];
 188       const char *inptr = inbuf;
 189       size_t inbytesleft = utf8_wctomb ((unsigned char *) inbuf, code);
 190       char *outptr = buf;
 191       size_t outbytesleft = MB_LEN_MAX;
 192       size_t res;
 193
 194       if (inbytesleft == (size_t) -1)
 195         return -1;
 196
 197       /* Convert the character from UTF-8 to the locale's charset.  */
 198       res = iconv (utf8_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft);
 199       if (inbytesleft > 0 || res == (size_t) -1
 200           /* Irix iconv() inserts a NUL byte if it cannot convert. */
 201 #  if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
 202           || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
 203 #  endif
 204           )
 205         return -1;
 206
 207       /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 208 #  if defined _LIBICONV_VERSION \
 209     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 210
 211       /* Get back to the initial shift state.  */
 212       return iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
 213 #  endif
 214
 215       return outptr - buf;
 216 # else
 217       errno = ENOTSUP;
 218       return -1;
 219 # endif
 220     }
 221 }
 222
 223 #endif
 224
 225 /* Output the Unicode character CODE to the output stream STREAM.  */
 226 void
 227 print_unicode_char (FILE *stream, unsigned int code)
 228 {
 229   char buf[MB_LEN_MAX];
 230   size_t s = convert_unicode_char (buf, code);
 231
 232   if (s == (size_t) -1)
 233     error (1, errno, _("cannot convert U+%04X to local character set"), code);
 234   else
 235     fwrite (buf, 1, s, stream);
 236 }