From 269accf808a245264aa79c30adbdee7dfa952451 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Fri, 17 Mar 2000 19:32:27 +0000 Subject: [PATCH] (utf8_wctomb): New function. (print_unicode_char): Pass the Unicode character to iconv in UTF-8 format instead of in UCS-4 with platform dependent endianness. --- lib/unicodeio.c | 160 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 112 insertions(+), 48 deletions(-) diff --git a/lib/unicodeio.c b/lib/unicodeio.c index 676126058..148d64ea6 100644 --- a/lib/unicodeio.c +++ b/lib/unicodeio.c @@ -28,6 +28,7 @@ #endif #include +#include #include #ifndef errno @@ -36,12 +37,6 @@ extern int errno; #if HAVE_ICONV # include -/* Name of UCS-4 encoding with machine dependent endianness and alignment. */ -# ifdef _LIBICONV_VERSION -# define UCS4_NAME "UCS-4-INTERNAL" -# else -# define UCS4_NAME "INTERNAL" -# endif #endif #include @@ -55,72 +50,141 @@ extern int errno; #include "unicodeio.h" -/* Use md5.h for its nice detection of unsigned 32-bit type. */ -#include "md5.h" -#undef uint32_t -#define uint32_t md5_uint32 +/* When we pass a Unicode character to iconv(), we must pass it in a + suitable encoding. The standardized Unicode encodings are + UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7. + UCS-2 supports only characters up to \U0000FFFF. + UTF-16 and variants support only characters up to \U0010FFFF. + UTF-7 is way too complex and not supported by glibc-2.1. + UCS-4 specification leaves doubts about endianness and byte order + mark. glibc currently interprets it as big endian without byte order + mark, but this is not backed by an RFC. + So we use UTF-8. It supports characters up to \U7FFFFFFF and is + unambiguously defined. */ + +/* Stores the UTF-8 representation of the Unicode character wc in r[0..5]. + Returns the number of bytes stored, or -1 if wc is out of range. 
*/ +static int +utf8_wctomb (unsigned char *r, unsigned int wc) +{ + int count; + + if (wc < 0x80) + count = 1; + else if (wc < 0x800) + count = 2; + else if (wc < 0x10000) + count = 3; + else if (wc < 0x200000) + count = 4; + else if (wc < 0x4000000) + count = 5; + else if (wc <= 0x7fffffff) + count = 6; + else + return -1; + + switch (count) + { + /* Note: code falls through cases! */ + case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000; + case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000; + case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000; + case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800; + case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0; + case 1: r[0] = wc; + } + + return count; +} + +/* Luckily, the encoding's name is platform independent. */ +#define UTF8_NAME "UTF-8" /* Outputs the Unicode character CODE to the output stream STREAM. Assumes that the locale doesn't change between two calls. */ void print_unicode_char (FILE *stream, unsigned int code) { -#if HAVE_ICONV static int initialized; - static iconv_t ucs4_to_local; + static int is_utf8; +#if HAVE_ICONV + static iconv_t utf8_to_local; +#endif - uint32_t in; - char outbuf[25]; - const char *inptr; - size_t inbytesleft; - char *outptr; - size_t outbytesleft; - size_t res; + char inbuf[6]; + int count; if (!initialized) { extern const char *locale_charset (void); const char *charset = locale_charset (); - ucs4_to_local = (charset != NULL - ? iconv_open (charset, UCS4_NAME) - : (iconv_t)(-1)); - if (ucs4_to_local == (iconv_t)(-1)) + is_utf8 = (charset != NULL && !strcmp (charset, UTF8_NAME)); +#if HAVE_ICONV + if (!is_utf8) { - /* For an unknown encoding, assume ASCII. */ - ucs4_to_local = iconv_open ("ASCII", UCS4_NAME); - if (ucs4_to_local == (iconv_t)(-1)) - error (1, 0, _("cannot output U+%04X: iconv function not usable"), - code); + utf8_to_local = (charset != NULL + ? 
iconv_open (charset, UTF8_NAME) + : (iconv_t)(-1)); + if (utf8_to_local == (iconv_t)(-1)) + { + /* For an unknown encoding, assume ASCII. */ + utf8_to_local = iconv_open ("ASCII", UTF8_NAME); + if (utf8_to_local == (iconv_t)(-1)) + error (1, 0, + _("cannot output U+%04X: iconv function not usable"), + code); + } } +#endif initialized = 1; } - in = code; - inptr = (char *) &in; - inbytesleft = sizeof (in); - outptr = outbuf; - outbytesleft = sizeof (outbuf); - - /* Convert the character. */ - res = iconv (ucs4_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft); - if (inbytesleft > 0 || res == (size_t)(-1)) - error (1, res == (size_t)(-1) ? errno : 0, - _("cannot convert U+%04X to local character set"), code); + /* Convert the character to UTF-8. */ + count = utf8_wctomb ((unsigned char *) inbuf, code); + if (count < 0) + error (1, 0, _("U+%04X: character out of range"), code); - /* Avoid glibc-2.1 bug. */ + if (is_utf8) + { + fwrite (inbuf, 1, count, stream); + } + else + { +#if HAVE_ICONV + char outbuf[25]; + const char *inptr; + size_t inbytesleft; + char *outptr; + size_t outbytesleft; + size_t res; + + inptr = inbuf; + inbytesleft = count; + outptr = outbuf; + outbytesleft = sizeof (outbuf); + + /* Convert the character from UTF-8 to the locale's charset. */ + res = iconv (utf8_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft); + if (inbytesleft > 0 || res == (size_t)(-1)) + error (1, res == (size_t)(-1) ? errno : 0, + _("cannot convert U+%04X to local character set"), code); + + /* Avoid glibc-2.1 bug. */ # if defined _LIBICONV_VERSION || !(__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) - /* Get back to the initial shift state.
*/ + res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft); + if (res == (size_t)(-1)) + error (1, errno, _("cannot convert U+%04X to local character set"), + code); # endif - fwrite (outbuf, 1, outptr - outbuf, stream); - + fwrite (outbuf, 1, outptr - outbuf, stream); #else - error (1, 0, _("cannot output U+%04X: iconv function not available"), code); + error (1, 0, _("cannot output U+%04X: iconv function not available"), + code); #endif + } } -- 2.11.0