X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fstriconveh.c;h=56c0c04782accb0d2edeb448375d27f07e685c50;hb=4ad44dd748d5da39555cdf305568280d69a23354;hp=9e916e656c7e728cbc4dc1ca920375111a40f394;hpb=c6ad67bb80efa455e52904f98af0c8c4ec4f36ee;p=gnulib.git diff --git a/lib/striconveh.c b/lib/striconveh.c index 9e916e656..56c0c0478 100644 --- a/lib/striconveh.c +++ b/lib/striconveh.c @@ -28,13 +28,13 @@ #if HAVE_ICONV # include -# include "utf8-ucs4-safe.h" +# include "utf8-ucs4.h" # include "ucs4-utf8.h" # include "unistr.h" #endif -#include "strdup.h" #include "c-strcase.h" +#include "c-strcaseeq.h" #ifndef SIZE_MAX # define SIZE_MAX ((size_t) -1) @@ -119,11 +119,68 @@ iconv_carefully (iconv_t cd, iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft)) # endif +/* iconv_carefully_1 is like iconv_carefully, except that it stops after + converting one character. */ +static size_t +iconv_carefully_1 (iconv_t cd, + const char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft, + bool *incremented) +{ + const char *inptr = *inbuf; + const char *inptr_end = inptr + *inbytesleft; + char *outptr = *outbuf; + size_t outsize = *outbytesleft; + const char *inptr_before = inptr; + size_t res = (size_t)(-1); + size_t insize; + + for (insize = 1; inptr + insize <= inptr_end; insize++) + { + res = iconv (cd, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize); + if (!(res == (size_t)(-1) && errno == EINVAL)) + break; + /* We expect that no input bytes have been consumed so far. */ + if (inptr != inptr_before) + abort (); + } + + *inbuf = inptr; + *inbytesleft = inptr_end - inptr; +# if !defined _LIBICONV_VERSION && !defined __GLIBC__ + /* Irix iconv() inserts a NUL byte if it cannot convert. + NetBSD iconv() inserts a question mark if it cannot convert. + Only GNU libiconv and GNU libc are known to prefer to fail rather + than doing a lossy conversion. */ + if (res != (size_t)(-1) && res > 0) + { + /* iconv() has already incremented INPTR. We cannot go back to a + previous INPTR, otherwise the state inside CD would become invalid, + if FROM_CODESET is a stateful encoding. So, tell the caller that + *INBUF has already been incremented. */ + *incremented = (inptr > inptr_before); + errno = EILSEQ; + return (size_t)(-1); + } +# endif + + if (res != (size_t)(-1)) + { + *outbuf = outptr; + *outbytesleft = outsize; + } + *incremented = false; + return res; +} + static int mem_cd_iconveh_internal (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, size_t extra_alloc, + size_t *offsets, char **resultp, size_t *lengthp) { /* When a conversion error occurs, we cannot start using CD1 and CD2 at @@ -137,9 +194,34 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, union { unsigned int align; char buf[tmpbufsize]; } tmp; # define tmpbuf tmp.buf - char *result = tmpbuf; - size_t allocated = sizeof (tmpbuf); - size_t length = 0; + char *initial_result; + char *result; + size_t allocated; + size_t length; + size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */ + + if (*resultp != NULL && *lengthp >= sizeof (tmpbuf)) + { + initial_result = *resultp; + allocated = *lengthp; + } + else + { + initial_result = tmpbuf; + allocated = sizeof (tmpbuf); + } + result = initial_result; + + if (offsets != NULL) + { + size_t i; + + for (i = 0; i < srclen; i++) + offsets[i] = (size_t)(-1); + + last_length = (size_t)(-1); + } + length = 0; /* First, try a direct conversion, and see whether a conversion error occurs at all. */ @@ -162,16 +244,29 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, size_t res; bool grow; - /* Use iconv_carefully instead of iconv here, because: - - If TO_CODESET is UTF-8, we can do the error handling in this loop, - no need for a second loop, - - With iconv() implementations other than GNU libiconv and GNU libc, - if we use iconv() in a big swoop, checking for an E2BIG return, - we lose the number of irreversible conversions. */ - res = iconv_carefully (cd, - &inptr, &insize, - &outptr, &outsize, - &incremented); + if (offsets != NULL) + { + if (length != last_length) /* ensure that offset[] be increasing */ + { + offsets[inptr - src] = length; + last_length = length; + } + res = iconv_carefully_1 (cd, + &inptr, &insize, + &outptr, &outsize, + &incremented); + } + else + /* Use iconv_carefully instead of iconv here, because: + - If TO_CODESET is UTF-8, we can do the error handling in this + loop, no need for a second loop, + - With iconv() implementations other than GNU libiconv and GNU + libc, if we use iconv() in a big swoop, checking for an E2BIG + return, we lose the number of irreversible conversions. */ + res = iconv_carefully (cd, + &inptr, &insize, + &outptr, &outsize, + &incremented); length = outptr - result; grow = (length + extra_alloc > allocated / 2); @@ -194,19 +289,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, allocated = 2 * allocated; if (length + 1 + extra_alloc > allocated) abort (); - if (result == tmpbuf) + if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = ENOMEM; return -1; } - if (result == tmpbuf) - memcpy (memory, tmpbuf, length); + if (result == initial_result) + memcpy (memory, initial_result, length); result = memory; grow = false; } @@ -227,7 +322,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, } else { - if (result != tmpbuf) + if (result != initial_result) { int saved_errno = errno; free (result); @@ -243,19 +338,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, char *memory; allocated = 2 * allocated; - if (result == tmpbuf) + if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = ENOMEM; return -1; } - if (result == tmpbuf) - memcpy (memory, tmpbuf, length); + if (result == initial_result) + memcpy (memory, initial_result, length); result = memory; } } @@ -280,24 +375,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, char *memory; allocated = 2 * allocated; - if (result == tmpbuf) + if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = ENOMEM; return -1; } - if (result == tmpbuf) - memcpy (memory, tmpbuf, length); + if (result == initial_result) + memcpy (memory, initial_result, length); result = memory; } else { - if (result != tmpbuf) + if (result != initial_result) { int saved_errno = errno; free (result); @@ -318,6 +413,15 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, /* The direct conversion failed, handler != iconveh_error, and cd2 != (iconv_t)(-1). Use a conversion through UTF-8. */ + if (offsets != NULL) + { + size_t i; + + for (i = 0; i < srclen; i++) + offsets[i] = (size_t)(-1); + + last_length = (size_t)(-1); + } length = 0; { # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ @@ -348,11 +452,25 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, /* Conversion step 1: from FROM_CODESET to UTF-8. */ if (in1size > 0) { + if (offsets != NULL + && length != last_length) /* ensure that offset[] be increasing */ + { + offsets[in1ptr - src] = length; + last_length = length; + } if (cd1 != (iconv_t)(-1)) - res1 = iconv_carefully (cd1, - (ICONV_CONST char **) &in1ptr, &in1size, - &out1ptr, &out1size, - &incremented1); + { + if (offsets != NULL) + res1 = iconv_carefully_1 (cd1, + &in1ptr, &in1size, + &out1ptr, &out1size, + &incremented1); + else + res1 = iconv_carefully (cd1, + &in1ptr, &in1size, + &out1ptr, &out1size, + &incremented1); + } else { /* FROM_CODESET is UTF-8. */ @@ -363,7 +481,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, int n; int m; - n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size); + n = u8_mbtouc (&uc, (const uint8_t *) in1ptr, in1size); if (uc == 0xfffd && !(n >= 3 && (uint8_t)in1ptr[0] == 0xEF @@ -404,7 +522,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, out1ptr += m; out1size -= m; } - while (in1size > 0); + while (offsets == NULL && in1size > 0); } } else if (do_final_flush1) @@ -429,7 +547,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, if (res1 == (size_t)(-1) && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ)) { - if (result != tmpbuf) + if (result != initial_result) { int saved_errno = errno; free (result); @@ -455,7 +573,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, errno1 = errno; utf8len = out1ptr - utf8buf; - if (in1size == 0 + if (offsets != NULL + || in1size == 0 || utf8len > utf8bufsize / 2 || (res1 == (size_t)(-1) && errno1 == E2BIG)) { @@ -524,8 +643,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, int n; if (in2size == 0) abort (); - n = u8_mbtouc (&uc, (const uint8_t *) in2ptr, - in2size); + n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr, + in2size); in2ptr += n; in2size -= n; } @@ -569,19 +688,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, allocated = 2 * allocated; if (length + 1 + extra_alloc > allocated) abort (); - if (result == tmpbuf) + if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = ENOMEM; return -1; } - if (result == tmpbuf) - memcpy (memory, tmpbuf, length); + if (result == initial_result) + memcpy (memory, initial_result, length); result = memory; grow = false; @@ -607,7 +726,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, if (res == (size_t)(-1)) { /* Failure converting the ASCII replacement. */ - if (result != tmpbuf) + if (result != initial_result) { int saved_errno = errno; free (result); @@ -618,7 +737,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, } else { - if (result != tmpbuf) + if (result != initial_result) { int saved_errno = errno; free (result); @@ -635,19 +754,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, char *memory; allocated = 2 * allocated; - if (result == tmpbuf) + if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = ENOMEM; return -1; } - if (result == tmpbuf) - memcpy (memory, tmpbuf, length); + if (result == initial_result) + memcpy (memory, initial_result, length); result = memory; } } @@ -664,7 +783,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, in1size = 0; else if (errno1 == EILSEQ) { - if (result != tmpbuf) + if (result != initial_result) free (result); errno = errno1; return -1; @@ -676,42 +795,33 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, done: /* Now the final memory allocation. */ - if (resultp != NULL) + if (result == tmpbuf) { - if (result == tmpbuf) - { - char *memory; + char *memory; - memory = (char *) malloc (length + extra_alloc); - if (memory != NULL) - { - memcpy (memory, tmpbuf, length); - result = memory; - } - else - { - errno = ENOMEM; - return -1; - } - } - else if (length + extra_alloc < allocated) + memory = (char *) malloc (length + extra_alloc); + if (memory != NULL) { - /* Shrink the allocated memory if possible. */ - char *memory; - - memory = (char *) realloc (result, length + extra_alloc); - if (memory != NULL) - result = memory; + memcpy (memory, tmpbuf, length); + result = memory; } - *resultp = result; + else + { + errno = ENOMEM; + return -1; + } } - else + else if (result != *resultp && length + extra_alloc < allocated) { - if (result != tmpbuf) - free (result); + /* Shrink the allocated memory if possible. */ + char *memory; + + memory = (char *) realloc (result, length + extra_alloc); + if (memory != NULL) + result = memory; } - if (lengthp != NULL) - *lengthp = length; + *resultp = result; + *lengthp = length; return 0; # undef tmpbuf # undef tmpbufsize @@ -721,10 +831,11 @@ int mem_cd_iconveh (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp) { return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0, - resultp, lengthp); + offsets, resultp, lengthp); } char * @@ -737,9 +848,9 @@ str_cd_iconveh (const char *src, function is usable for UTF-7, we have to exclude the NUL byte from the conversion and add it by hand afterwards. */ char *result = NULL; - size_t length; + size_t length = 0; int retval = mem_cd_iconveh_internal (src, strlen (src), - cd, cd1, cd2, handler, 1, + cd, cd1, cd2, handler, 1, NULL, &result, &length); if (retval < 0) @@ -761,13 +872,174 @@ str_cd_iconveh (const char *src, #endif +int +mem_iconveh (const char *src, size_t srclen, + const char *from_codeset, const char *to_codeset, + enum iconv_ilseq_handler handler, + size_t *offsets, + char **resultp, size_t *lengthp) +{ + if (srclen == 0) + { + /* Nothing to convert. */ + *lengthp = 0; + return 0; + } + else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0) + { + char *result; + + if (*resultp != NULL && *lengthp >= srclen) + result = *resultp; + else + { + result = (char *) malloc (srclen); + if (result == NULL) + { + errno = ENOMEM; + return -1; + } + } + memcpy (result, src, srclen); + *resultp = result; + *lengthp = srclen; + return 0; + } + else + { +#if HAVE_ICONV + iconv_t cd; + iconv_t cd1; + iconv_t cd2; + char *result; + size_t length; + int retval; + + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION + if (c_strcasecmp (from_codeset, "EUC-KR") == 0 + || c_strcasecmp (to_codeset, "EUC-KR") == 0) + { + errno = EINVAL; + return -1; + } +# endif + + cd = iconv_open (to_codeset, from_codeset); + if (cd == (iconv_t)(-1)) + return -1; + + if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)) + cd1 = (iconv_t)(-1); + else + { + cd1 = iconv_open ("UTF-8", from_codeset); + if (cd1 == (iconv_t)(-1)) + { + int saved_errno = errno; + iconv_close (cd); + errno = saved_errno; + return -1; + } + } + + if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)) + cd2 = (iconv_t)(-1); + else + { + cd2 = iconv_open (to_codeset, "UTF-8"); + if (cd2 == (iconv_t)(-1)) + { + int saved_errno = errno; + if (cd1 != (iconv_t)(-1)) + iconv_close (cd1); + iconv_close (cd); + errno = saved_errno; + return -1; + } + } + + result = *resultp; + length = *lengthp; + retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets, + &result, &length); + + if (retval < 0) + { + /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconv. */ + int saved_errno = errno; + if (cd2 != (iconv_t)(-1)) + iconv_close (cd2); + if (cd1 != (iconv_t)(-1)) + iconv_close (cd1); + iconv_close (cd); + errno = saved_errno; + } + else + { + if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0) + { + /* Return -1, but free the allocated memory, and while doing + that, preserve the errno from iconv_close. */ + int saved_errno = errno; + if (cd1 != (iconv_t)(-1)) + iconv_close (cd1); + iconv_close (cd); + if (result != *resultp && result != NULL) + free (result); + errno = saved_errno; + return -1; + } + if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0) + { + /* Return -1, but free the allocated memory, and while doing + that, preserve the errno from iconv_close. */ + int saved_errno = errno; + iconv_close (cd); + if (result != *resultp && result != NULL) + free (result); + errno = saved_errno; + return -1; + } + if (iconv_close (cd) < 0) + { + /* Return -1, but free the allocated memory, and while doing + that, preserve the errno from iconv_close. */ + int saved_errno = errno; + if (result != *resultp && result != NULL) + free (result); + errno = saved_errno; + return -1; + } + *resultp = result; + *lengthp = length; + } + return retval; +#else + /* This is a different error code than if iconv_open existed but didn't + support from_codeset and to_codeset, so that the caller can emit + an error message such as + "iconv() is not supported. Installing GNU libiconv and + then reinstalling this package would fix this." */ + errno = ENOSYS; + return -1; +#endif + } +} + char * str_iconveh (const char *src, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler) { - if (c_strcasecmp (from_codeset, to_codeset) == 0) - return strdup (src); + if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0) + { + char *result = strdup (src); + + if (result == NULL) + errno = ENOMEM; + return result; + } else { #if HAVE_ICONV @@ -790,7 +1062,7 @@ str_iconveh (const char *src, if (cd == (iconv_t)(-1)) return NULL; - if (c_strcasecmp (from_codeset, "UTF-8") == 0) + if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)) cd1 = (iconv_t)(-1); else { @@ -804,7 +1076,7 @@ str_iconveh (const char *src, } } - if (c_strcasecmp (to_codeset, "UTF-8") == 0) + if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)) cd2 = (iconv_t)(-1); else {