From 3bcc3f3d73d014f863c561fd129b34c3e058feed Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sat, 10 May 2008 15:23:28 +0200 Subject: [PATCH] Use u8_conv_from_encoding instead of using special code for the conversion. --- ChangeLog | 19 ++++ lib/unilbrk/ulc-common.c | 118 ----------------------- lib/unilbrk/ulc-common.h | 16 ---- lib/unilbrk/ulc-possible-linebreaks.c | 138 +++++++++++--------------- lib/unilbrk/ulc-width-linebreaks.c | 165 +++++++++++++------------------- modules/unilbrk/ulc-common | 1 - modules/unilbrk/ulc-possible-linebreaks | 4 +- modules/unilbrk/ulc-width-linebreaks | 4 +- 8 files changed, 148 insertions(+), 317 deletions(-) diff --git a/ChangeLog b/ChangeLog index a533eead6..5a96a0fe2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,24 @@ 2008-05-10 Bruno Haible + * lib/unilbrk/ulc-common.c: Don't include <errno.h>. + (iconv_string_length, iconv_string_keeping_offsets): Remove functions. + * lib/unilbrk/ulc-common.h (iconv_string_length, + iconv_string_keeping_offsets): Remove declarations. + * lib/unilbrk/ulc-possible-linebreaks.c: Include , uniconv.h. + Don't include <iconv.h>, streq.h, xsize.h. + (ulc_possible_linebreaks): Use u8_conv_from_encoding for doing the + conversion. + * lib/unilbrk/ulc-width-linebreaks.c: Include uniconv.h. Don't include + <iconv.h>, streq.h, xsize.h. + (ulc_width_linebreaks): Use u8_conv_from_encoding for doing the + conversion. + * modules/unilbrk/ulc-common (Depends-on): Remove iconv. + * modules/unilbrk/ulc-possible-linebreaks (Depends-on): Add + uniconv/u8-conv-from-enc. Remove iconv_open, streq, xsize. + * modules/unilbrk/ulc-width-linebreaks (Depends-on): Likewise. + +2008-05-10 Bruno Haible + * modules/unilbrk/ulc-width-linebreaks-tests: New file. * tests/unilbrk/test-ulc-width-linebreaks.c: New file. diff --git a/lib/unilbrk/ulc-common.c b/lib/unilbrk/ulc-common.c index 3ab31c2d9..7bdfa4491 100644 --- a/lib/unilbrk/ulc-common.c +++ b/lib/unilbrk/ulc-common.c @@ -20,8 +20,6 @@ /* Specification. 
*/ #include "unilbrk/ulc-common.h" -#include - #include "c-ctype.h" #include "streq.h" @@ -33,122 +31,6 @@ is_utf8_encoding (const char *encoding) return 0; } -#if HAVE_ICONV - -# include - -size_t -iconv_string_length (iconv_t cd, const char *s, size_t n) -{ -# define TMPBUFSIZE 4096 - size_t count = 0; - char tmpbuf[TMPBUFSIZE]; - const char *inptr = s; - size_t insize = n; - - while (insize > 0) - { - char *outptr = tmpbuf; - size_t outsize = TMPBUFSIZE; - size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); - if (res == (size_t)(-1) && errno != E2BIG -# if !defined _LIBICONV_VERSION && !defined __GLIBC__ - /* Irix iconv() inserts a NUL byte if it cannot convert. - NetBSD iconv() inserts a question mark if it cannot convert. - Only GNU libiconv and GNU libc are known to prefer to fail rather - than doing a lossy conversion. */ - || res > 0 -# endif - ) - return (size_t)(-1); - count += outptr - tmpbuf; - } - /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */ -# if defined _LIBICONV_VERSION \ - || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) - { - char *outptr = tmpbuf; - size_t outsize = TMPBUFSIZE; - size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); - if (res == (size_t)(-1)) - return (size_t)(-1); - count += outptr - tmpbuf; - } - /* Return to the initial state. */ - iconv (cd, NULL, NULL, NULL, NULL); -# endif - return count; -# undef TMPBUFSIZE -} - -void -iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, - size_t *offtable, char *t, size_t m) -{ - size_t i; - const char *s_end; - const char *inptr; - char *outptr; - size_t outsize; - /* Avoid glibc-2.1 bug. 
*/ -# if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) - const size_t extra = 1; -# else - const size_t extra = 0; -# endif - - for (i = 0; i < n; i++) - offtable[i] = (size_t)(-1); - - s_end = s + n; - inptr = s; - outptr = t; - outsize = m + extra; - while (inptr < s_end) - { - const char *saved_inptr; - size_t insize; - size_t res; - - offtable[inptr - s] = outptr - t; - - saved_inptr = inptr; - res = (size_t)(-1); - for (insize = 1; inptr + insize <= s_end; insize++) - { - res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); - if (!(res == (size_t)(-1) && errno == EINVAL)) - break; - /* We expect that no input bytes have been consumed so far. */ - if (inptr != saved_inptr) - abort (); - } - /* After we verified the convertibility and computed the translation's - size m, there shouldn't be any conversion error here. */ - if (res == (size_t)(-1) -# if !defined _LIBICONV_VERSION && !defined __GLIBC__ - /* Irix iconv() inserts a NUL byte if it cannot convert. - NetBSD iconv() inserts a question mark if it cannot convert. - Only GNU libiconv and GNU libc are known to prefer to fail rather - than doing a lossy conversion. */ - || res > 0 -# endif - ) - abort (); - } - /* Avoid glibc-2.1 bug and Solaris 7 bug. */ -# if defined _LIBICONV_VERSION \ - || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) - if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1)) - abort (); -# endif - /* We should have produced exactly m output bytes. */ - if (outsize != extra) - abort (); -} - -#endif /* HAVE_ICONV */ - #if C_CTYPE_ASCII /* Tests whether a string is entirely ASCII. Returns 1 if yes. 
diff --git a/lib/unilbrk/ulc-common.h b/lib/unilbrk/ulc-common.h index bba8ec6d2..3b4818875 100644 --- a/lib/unilbrk/ulc-common.h +++ b/lib/unilbrk/ulc-common.h @@ -23,22 +23,6 @@ #define is_utf8_encoding unilbrk_is_utf8_encoding extern int is_utf8_encoding (const char *encoding); -#if HAVE_ICONV - -# include - -/* Luckily, the encoding's name is platform independent. */ -# define UTF8_NAME "UTF-8" - -/* Return the length of a string after conversion through an iconv_t. */ -# define iconv_string_length unilbrk_iconv_string_length -extern size_t iconv_string_length (iconv_t cd, const char *s, size_t n); - -# define iconv_string_keeping_offsets unilbrk_iconv_string_keeping_offsets -extern void iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, size_t *offtable, char *t, size_t m); - -#endif /* HAVE_ICONV */ - #if C_CTYPE_ASCII # define is_all_ascii unilbrk_is_all_ascii diff --git a/lib/unilbrk/ulc-possible-linebreaks.c b/lib/unilbrk/ulc-possible-linebreaks.c index 444e9a946..74cd42a67 100644 --- a/lib/unilbrk/ulc-possible-linebreaks.c +++ b/lib/unilbrk/ulc-possible-linebreaks.c @@ -21,13 +21,10 @@ #include "unilbrk.h" #include -#if HAVE_ICONV -# include -#endif +#include #include "c-ctype.h" -#include "streq.h" -#include "xsize.h" +#include "uniconv.h" #include "unilbrk/ulc-common.h" /* Line breaking of a string in an arbitrary encoding. @@ -47,92 +44,73 @@ void ulc_possible_linebreaks (const char *s, size_t n, const char *encoding, char *p) { - if (n == 0) - return; - if (is_utf8_encoding (encoding)) - u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); - else + if (n > 0) { -#if HAVE_ICONV - iconv_t to_utf8; - /* Avoid glibc-2.1 bug with EUC-KR. 
*/ -# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION - if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) - to_utf8 = (iconv_t)(-1); + if (is_utf8_encoding (encoding)) + u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); else -# endif - /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, - GB18030. */ -# if defined __sun && !defined _LIBICONV_VERSION - if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) - || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) - || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) - || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') - || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) - || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) - to_utf8 = (iconv_t)(-1); - else -# endif - to_utf8 = iconv_open (UTF8_NAME, encoding); - if (to_utf8 != (iconv_t)(-1)) { - /* Determine the length of the resulting UTF-8 string. */ - size_t m = iconv_string_length (to_utf8, s, n); - if (m != (size_t)(-1)) + /* Convert the string to UTF-8 and build a translation table + from offsets into s to offsets into the translated string. */ + size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); + + if (offsets != NULL) { - /* Convert the string to UTF-8 and build a translation table - from offsets into s to offsets into the translated string. */ - size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m); - char *memory = - (size_in_bounds_p (memory_size) ? 
malloc (memory_size) : NULL); - if (memory != NULL) + uint8_t *t = NULL; + size_t m; + if (u8_conv_from_encoding (encoding, iconveh_question_mark, + s, n, offsets, &t, &m) + == 0) { - size_t *offtable = (size_t *) memory; - char *t = (char *) (offtable + n); - char *q = (char *) (t + m); - size_t i; - - iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); - - /* Determine the possible line breaks of the UTF-8 string. */ - u8_possible_linebreaks ((const uint8_t *) t, m, encoding, q); - - /* Translate the result back to the original string. */ - memset (p, UC_BREAK_PROHIBITED, n); - for (i = 0; i < n; i++) - if (offtable[i] != (size_t)(-1)) - p[i] = q[offtable[i]]; - - free (memory); - iconv_close (to_utf8); - return; + char *q = (char *) malloc (m); + + if (q != NULL) + { + size_t i; + + /* Determine the possible line breaks of the UTF-8 + string. */ + u8_possible_linebreaks (t, m, encoding, q); + + /* Translate the result back to the original string. */ + memset (p, UC_BREAK_PROHIBITED, n); + for (i = 0; i < n; i++) + if (offsets[i] != (size_t)(-1)) + p[i] = q[offsets[i]]; + + free (q); + free (t); + free (offsets); + return; + } + free (t); } + free (offsets); } - iconv_close (to_utf8); - } -#endif - /* Impossible to convert. */ + + /* Impossible to convert. */ #if C_CTYPE_ASCII - if (is_all_ascii (s, n)) - { - /* ASCII is a subset of UTF-8. */ - u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); - return; - } + if (is_all_ascii (s, n)) + { + /* ASCII is a subset of UTF-8. */ + u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); + return; + } #endif - /* We have a non-ASCII string and cannot convert it. - Don't produce line breaks except those already present in the - input string. All we assume here is that the encoding is - minimally ASCII compatible. */ - { - const char *s_end = s + n; - while (s < s_end) + /* We have a non-ASCII string and cannot convert it. 
+ Don't produce line breaks except those already present in the + input string. All we assume here is that the encoding is + minimally ASCII compatible. */ { - *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); - s++; - p++; + const char *s_end = s + n; + while (s < s_end) + { + *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); + s++; + p++; + } } - } + } } } diff --git a/lib/unilbrk/ulc-width-linebreaks.c b/lib/unilbrk/ulc-width-linebreaks.c index 892e01ff1..5340a4deb 100644 --- a/lib/unilbrk/ulc-width-linebreaks.c +++ b/lib/unilbrk/ulc-width-linebreaks.c @@ -22,13 +22,9 @@ #include #include -#if HAVE_ICONV -# include -#endif #include "c-ctype.h" -#include "streq.h" -#include "xsize.h" +#include "uniconv.h" #include "unilbrk/ulc-common.h" /* Line breaking of a string in an arbitrary encoding. @@ -50,113 +46,90 @@ ulc_width_linebreaks (const char *s, size_t n, const char *o, const char *encoding, char *p) { - if (n == 0) - return start_column; - if (is_utf8_encoding (encoding)) - return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); - else + if (n > 0) { -#if HAVE_ICONV - iconv_t to_utf8; - /* Avoid glibc-2.1 bug with EUC-KR. */ -# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION - if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) - to_utf8 = (iconv_t)(-1); + if (is_utf8_encoding (encoding)) + return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); else -# endif - /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, - GB18030. 
*/ -# if defined __sun && !defined _LIBICONV_VERSION - if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) - || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) - || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) - || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') - || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) - || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) - to_utf8 = (iconv_t)(-1); - else -# endif - to_utf8 = iconv_open (UTF8_NAME, encoding); - if (to_utf8 != (iconv_t)(-1)) { - /* Determine the length of the resulting UTF-8 string. */ - size_t m = iconv_string_length (to_utf8, s, n); - if (m != (size_t)(-1)) + /* Convert the string to UTF-8 and build a translation table + from offsets into s to offsets into the translated string. */ + size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); + + if (offsets != NULL) { - /* Convert the string to UTF-8 and build a translation table - from offsets into s to offsets into the translated string. */ - size_t memory_size = - xsum4 (xtimes (n, sizeof (size_t)), m, m, - (o != NULL ? m : 0)); - char *memory = - (char *) - (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); - if (memory != NULL) + uint8_t *t = NULL; + size_t m; + if (u8_conv_from_encoding (encoding, iconveh_question_mark, + s, n, offsets, &t, &m) + == 0) { - size_t *offtable = (size_t *) memory; - char *t = (char *) (offtable + n); - char *q = (char *) (t + m); - char *o8 = (o != NULL ? (char *) (q + m) : NULL); - int res_column; - size_t i; + char *memory = (char *) malloc (m + (o != NULL ? m : 0)); - iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); - - /* Translate the overrides to the UTF-8 string. */ - if (o != NULL) + if (memory != NULL) { - memset (o8, UC_BREAK_UNDEFINED, m); + char *q = (char *) memory; + char *o8 = (o != NULL ? 
(char *) (q + m) : NULL); + int res_column; + size_t i; + + /* Translate the overrides to the UTF-8 string. */ + if (o != NULL) + { + memset (o8, UC_BREAK_UNDEFINED, m); + for (i = 0; i < n; i++) + if (offsets[i] != (size_t)(-1)) + o8[offsets[i]] = o[i]; + } + + /* Determine the line breaks of the UTF-8 string. */ + res_column = + u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q); + + /* Translate the result back to the original string. */ + memset (p, UC_BREAK_PROHIBITED, n); for (i = 0; i < n; i++) - if (offtable[i] != (size_t)(-1)) - o8[offtable[i]] = o[i]; - } - - /* Determine the line breaks of the UTF-8 string. */ - res_column = - u8_width_linebreaks ((const uint8_t *) t, m, width, start_column, at_end_columns, o8, encoding, q); - - /* Translate the result back to the original string. */ - memset (p, UC_BREAK_PROHIBITED, n); - for (i = 0; i < n; i++) - if (offtable[i] != (size_t)(-1)) - p[i] = q[offtable[i]]; + if (offsets[i] != (size_t)(-1)) + p[i] = q[offsets[i]]; - free (memory); - iconv_close (to_utf8); - return res_column; + free (memory); + free (t); + free (offsets); + return res_column; + } + free (t); } + free (offsets); } - iconv_close (to_utf8); - } -#endif - /* Impossible to convert. */ + /* Impossible to convert. */ #if C_CTYPE_ASCII - if (is_all_ascii (s, n)) - { - /* ASCII is a subset of UTF-8. */ - return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); - } + if (is_all_ascii (s, n)) + { + /* ASCII is a subset of UTF-8. */ + return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); + } #endif - /* We have a non-ASCII string and cannot convert it. - Don't produce line breaks except those already present in the - input string. All we assume here is that the encoding is - minimally ASCII compatible. */ - { - const char *s_end = s + n; - while (s < s_end) + /* We have a non-ASCII string and cannot convert it. 
+ Don't produce line breaks except those already present in the + input string. All we assume here is that the encoding is + minimally ASCII compatible. */ { - *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' - ? UC_BREAK_MANDATORY - : UC_BREAK_PROHIBITED); - s++; - p++; - if (o != NULL) - o++; + const char *s_end = s + n; + while (s < s_end) + { + *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' + ? UC_BREAK_MANDATORY + : UC_BREAK_PROHIBITED); + s++; + p++; + if (o != NULL) + o++; + } + /* We cannot compute widths in this case. */ } - /* We cannot compute widths in this case. */ - return start_column; - } + } } + return start_column; } diff --git a/modules/unilbrk/ulc-common b/modules/unilbrk/ulc-common index 2b6c00d24..4729f662f 100644 --- a/modules/unilbrk/ulc-common +++ b/modules/unilbrk/ulc-common @@ -7,7 +7,6 @@ lib/unilbrk/ulc-common.c Depends-on: c-ctype -iconv streq configure.ac: diff --git a/modules/unilbrk/ulc-possible-linebreaks b/modules/unilbrk/ulc-possible-linebreaks index 2b3587a81..f7cc283e3 100644 --- a/modules/unilbrk/ulc-possible-linebreaks +++ b/modules/unilbrk/ulc-possible-linebreaks @@ -8,10 +8,8 @@ Depends-on: unilbrk/base unilbrk/u8-possible-linebreaks unilbrk/ulc-common +uniconv/u8-conv-from-enc c-ctype -iconv_open -streq -xsize configure.ac: diff --git a/modules/unilbrk/ulc-width-linebreaks b/modules/unilbrk/ulc-width-linebreaks index e84e1f76f..5763e8508 100644 --- a/modules/unilbrk/ulc-width-linebreaks +++ b/modules/unilbrk/ulc-width-linebreaks @@ -8,10 +8,8 @@ Depends-on: unilbrk/base unilbrk/u8-width-linebreaks unilbrk/ulc-common +uniconv/u8-conv-from-enc c-ctype -iconv_open -streq -xsize configure.ac: -- 2.11.0