X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fstriconveh.c;h=10cb06f9d65af1f4c065c49588c7d887cc9ac526;hb=671c9011cb6944a63f0d15f60252c31bf0ee8e9b;hp=7feb768168da91d1589a8cd280f6ef845d975e30;hpb=6d6f8a5cd56ca54a9e9d49516cd2e849f8a6b1c6;p=gnulib.git
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 7feb76816..10cb06f9d 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -1,11 +1,11 @@
/* Character set conversion with error handling.
- Copyright (C) 2001-2007 Free Software Foundation, Inc.
+ Copyright (C) 2001-2008 Free Software Foundation, Inc.
Written by Bruno Haible and Simon Josefsson.
- This program is free software; you can redistribute it and/or modify
+ This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software Foundation,
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
@@ -28,13 +27,11 @@
#if HAVE_ICONV
# include <iconv.h>
-# include "utf8-ucs4-safe.h"
-# include "ucs4-utf8.h"
# include "unistr.h"
#endif
-#include "strdup.h"
#include "c-strcase.h"
+#include "c-strcaseeq.h"
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
@@ -82,9 +79,13 @@ iconv_carefully (iconv_t cd,
&outptr, &outsize);
if (!(res == (size_t)(-1) && errno == EINVAL))
break;
- /* We expect that no input bytes have been consumed so far. */
- if (inptr != inptr_before)
- abort ();
+ /* iconv can eat up a shift sequence but give EINVAL while attempting
+ to convert the first character. E.g. libiconv does this. */
+ if (inptr > inptr_before)
+ {
+ res = 0;
+ break;
+ }
}
if (res == 0)
@@ -120,31 +121,36 @@ iconv_carefully (iconv_t cd,
# endif
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
- converting one character. */
+ converting one character or one shift sequence. */
static size_t
iconv_carefully_1 (iconv_t cd,
const char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft,
bool *incremented)
{
- const char *inptr = *inbuf;
- const char *inptr_end = inptr + *inbytesleft;
+ const char *inptr_before = *inbuf;
+ const char *inptr = inptr_before;
+ const char *inptr_end = inptr_before + *inbytesleft;
char *outptr = *outbuf;
size_t outsize = *outbytesleft;
- const char *inptr_before = inptr;
size_t res = (size_t)(-1);
size_t insize;
- for (insize = 1; inptr + insize <= inptr_end; insize++)
+ for (insize = 1; inptr_before + insize <= inptr_end; insize++)
{
+ inptr = inptr_before;
res = iconv (cd,
(ICONV_CONST char **) &inptr, &insize,
&outptr, &outsize);
if (!(res == (size_t)(-1) && errno == EINVAL))
break;
- /* We expect that no input bytes have been consumed so far. */
- if (inptr != inptr_before)
- abort ();
+ /* iconv can eat up a shift sequence but give EINVAL while attempting
+ to convert the first character. E.g. libiconv does this. */
+ if (inptr > inptr_before)
+ {
+ res = 0;
+ break;
+ }
}
*inbuf = inptr;
@@ -175,6 +181,79 @@ iconv_carefully_1 (iconv_t cd,
return res;
}
+/* utf8conv_carefully is like iconv, except that
+ - it converts from UTF-8 to UTF-8 (each character is decoded, then re-encoded),
+ - it stops at the first conversion error, returning (size_t)(-1) with errno
+ set to EILSEQ, EINVAL or E2BIG, and it stores in *INCREMENTED (written only
+ on error) whether the input pointer was advanced past the error location,
+ - if one_character_only is true, it stops after converting one character.
+ Returns 0 on success. *INBUF, *INBYTESLEFT, *OUTBUF, *OUTBYTESLEFT are always updated. */
+static size_t
+utf8conv_carefully (bool one_character_only,
+ const char **inbuf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft,
+ bool *incremented)
+{
+ const char *inptr = *inbuf;
+ size_t insize = *inbytesleft;
+ char *outptr = *outbuf;
+ size_t outsize = *outbytesleft;
+ size_t res;
+
+ res = 0;
+ do /* NOTE(review): the body runs once even if *inbytesleft is 0 - presumably callers pass non-empty input; confirm. */
+ {
+ ucs4_t uc;
+ int n;
+ int m;
+
+ n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize); /* n < 0 signals a decoding failure */
+ if (n < 0)
+ {
+ errno = (n == -2 ? EINVAL : EILSEQ); /* -2 maps to EINVAL, any other negative value to EILSEQ */
+ n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize); /* decode again to learn how many bytes to skip */
+ inptr += n;
+ insize -= n;
+ res = (size_t)(-1);
+ *incremented = true; /* the input was advanced past the offending bytes */
+ break;
+ }
+ if (outsize == 0) /* no room at all in the output buffer */
+ {
+ errno = E2BIG;
+ res = (size_t)(-1);
+ *incremented = false; /* the input still points at the unconverted character */
+ break;
+ }
+ m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
+ if (m == -2) /* reported as E2BIG, i.e. output buffer too small */
+ {
+ errno = E2BIG;
+ res = (size_t)(-1);
+ *incremented = false; /* the input has not been advanced yet */
+ break;
+ }
+ inptr += n; /* consume the character before looking at an encoding failure */
+ insize -= n;
+ if (m == -1) /* reported as EILSEQ, with the input already advanced */
+ {
+ errno = EILSEQ;
+ res = (size_t)(-1);
+ *incremented = true;
+ break;
+ }
+ outptr += m;
+ outsize -= m;
+ }
+ while (!one_character_only && insize > 0); /* a single pass when one_character_only is set */
+
+ *inbuf = inptr; /* publish the updated positions; reached from every break above */
+ *inbytesleft = insize;
+ *outbuf = outptr;
+ *outbytesleft = outsize;
+ return res;
+}
+
static int
mem_cd_iconveh_internal (const char *src, size_t srclen,
iconv_t cd, iconv_t cd1, iconv_t cd2,
@@ -200,7 +279,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
size_t length;
size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
- if (*lengthp >= sizeof (tmpbuf))
+ if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
{
initial_result = *resultp;
allocated = *lengthp;
@@ -212,6 +291,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
}
result = initial_result;
+ /* Test whether a direct conversion is possible at all. */
+ if (cd == (iconv_t)(-1))
+ goto indirectly;
+
if (offsets != NULL)
{
size_t i;
@@ -410,8 +493,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
goto done;
indirectly:
- /* The direct conversion failed, handler != iconveh_error,
- and cd2 != (iconv_t)(-1).
+ /* The direct conversion failed.
Use a conversion through UTF-8. */
if (offsets != NULL)
{
@@ -424,6 +506,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
}
length = 0;
{
+ const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
char utf8buf[utf8bufsize + 1];
size_t utf8len = 0;
@@ -438,7 +521,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
/* Set to the initial state. */
if (cd1 != (iconv_t)(-1))
iconv (cd1, NULL, NULL, NULL, NULL);
- iconv (cd2, NULL, NULL, NULL, NULL);
+ if (cd2 != (iconv_t)(-1))
+ iconv (cd2, NULL, NULL, NULL, NULL);
# endif
while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
@@ -460,7 +544,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
}
if (cd1 != (iconv_t)(-1))
{
- if (offsets != NULL)
+ if (slowly)
res1 = iconv_carefully_1 (cd1,
&in1ptr, &in1size,
&out1ptr, &out1size,
@@ -474,55 +558,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
else
{
/* FROM_CODESET is UTF-8. */
- res1 = 0;
- do
- {
- ucs4_t uc;
- int n;
- int m;
-
- n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
- if (uc == 0xfffd
- && !(n >= 3
- && (uint8_t)in1ptr[0] == 0xEF
- && (uint8_t)in1ptr[1] == 0xBF
- && (uint8_t)in1ptr[2] == 0xBD))
- {
- in1ptr += n;
- in1size -= n;
- errno = EILSEQ;
- res1 = (size_t)(-1);
- incremented1 = true;
- break;
- }
- if (out1size == 0)
- {
- errno = E2BIG;
- res1 = (size_t)(-1);
- incremented1 = false;
- break;
- }
- m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
- if (m == -2)
- {
- errno = E2BIG;
- res1 = (size_t)(-1);
- incremented1 = false;
- break;
- }
- in1ptr += n;
- in1size -= n;
- if (m == -1)
- {
- errno = EILSEQ;
- res1 = (size_t)(-1);
- incremented1 = true;
- break;
- }
- out1ptr += m;
- out1size -= m;
- }
- while (offsets == NULL && in1size > 0);
+ res1 = utf8conv_carefully (slowly,
+ &in1ptr, &in1size,
+ &out1ptr, &out1size,
+ &incremented1);
}
}
else if (do_final_flush1)
@@ -592,10 +631,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
bool grow;
if (in2size > 0)
- res2 = iconv_carefully (cd2,
- &in2ptr, &in2size,
- &out2ptr, &out2size,
- &incremented2);
+ {
+ if (cd2 != (iconv_t)(-1))
+ res2 = iconv_carefully (cd2,
+ &in2ptr, &in2size,
+ &out2ptr, &out2size,
+ &incremented2);
+ else
+ /* TO_CODESET is UTF-8. */
+ res2 = utf8conv_carefully (false,
+ &in2ptr, &in2size,
+ &out2ptr, &out2size,
+ &incremented2);
+ }
else /* in1size == 0 && !do_final_flush1
&& in2size == 0 && do_final_flush2 */
{
@@ -603,10 +651,11 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
# if defined _LIBICONV_VERSION \
|| !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
- res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
-# else
- res2 = 0;
+ if (cd2 != (iconv_t)(-1))
+ res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
+ else
# endif
+ res2 = 0;
do_final_flush2 = false;
incremented2 = true;
}
@@ -643,8 +692,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
int n;
if (in2size == 0)
abort ();
- n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
- in2size);
+ n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
+ in2size);
in2ptr += n;
in2size -= n;
}
@@ -677,9 +726,28 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
inptr = scratchbuf;
insize = scratchlen;
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr, &out2size);
+ if (cd2 != (iconv_t)(-1))
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr, &insize,
+ &out2ptr, &out2size);
+ else
+ {
+ /* TO_CODESET is UTF-8. */
+ if (out2size >= insize)
+ {
+ memcpy (out2ptr, inptr, insize);
+ out2ptr += insize;
+ out2size -= insize;
+ inptr += insize;
+ insize = 0;
+ res = 0;
+ }
+ else
+ {
+ errno = E2BIG;
+ res = (size_t)(-1);
+ }
+ }
length = out2ptr - result;
if (res == (size_t)(-1) && errno == E2BIG)
{
@@ -706,9 +774,23 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
out2ptr = result + length;
out2size = allocated - extra_alloc - length;
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr, &out2size);
+ if (cd2 != (iconv_t)(-1))
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr,
+ &insize,
+ &out2ptr, &out2size);
+ else
+ {
+ /* TO_CODESET is UTF-8. */
+ if (!(out2size >= insize))
+ abort ();
+ memcpy (out2ptr, inptr, insize);
+ out2ptr += insize;
+ out2size -= insize;
+ inptr += insize;
+ insize = 0;
+ res = 0;
+ }
length = out2ptr - result;
}
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
@@ -797,9 +879,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
/* Now the final memory allocation. */
if (result == tmpbuf)
{
+ size_t memsize = length + extra_alloc;
char *memory;
- memory = (char *) malloc (length + extra_alloc);
+ memory = (char *) malloc (memsize > 0 ? memsize : 1);
if (memory != NULL)
{
memcpy (memory, tmpbuf, length);
@@ -814,9 +897,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
else if (result != *resultp && length + extra_alloc < allocated)
{
/* Shrink the allocated memory if possible. */
+ size_t memsize = length + extra_alloc;
char *memory;
- memory = (char *) realloc (result, length + extra_alloc);
+ memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
if (memory != NULL)
result = memory;
}
@@ -926,10 +1010,8 @@ mem_iconveh (const char *src, size_t srclen,
# endif
cd = iconv_open (to_codeset, from_codeset);
- if (cd == (iconv_t)(-1))
- return -1;
- if (c_strcasecmp (from_codeset, "UTF-8") == 0)
+ if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
cd1 = (iconv_t)(-1);
else
{
@@ -937,13 +1019,18 @@ mem_iconveh (const char *src, size_t srclen,
if (cd1 == (iconv_t)(-1))
{
int saved_errno = errno;
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
return -1;
}
}
- if (c_strcasecmp (to_codeset, "UTF-8") == 0)
+ if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
+# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
+ || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
+# endif
+ )
cd2 = (iconv_t)(-1);
else
{
@@ -953,7 +1040,8 @@ mem_iconveh (const char *src, size_t srclen,
int saved_errno = errno;
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
return -1;
}
@@ -972,7 +1060,8 @@ mem_iconveh (const char *src, size_t srclen,
iconv_close (cd2);
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
}
else
@@ -984,7 +1073,8 @@ mem_iconveh (const char *src, size_t srclen,
int saved_errno = errno;
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
if (result != *resultp && result != NULL)
free (result);
errno = saved_errno;
@@ -995,13 +1085,14 @@ mem_iconveh (const char *src, size_t srclen,
/* Return -1, but free the allocated memory, and while doing
that, preserve the errno from iconv_close. */
int saved_errno = errno;
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
if (result != *resultp && result != NULL)
free (result);
errno = saved_errno;
return -1;
}
- if (iconv_close (cd) < 0)
+ if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
{
/* Return -1, but free the allocated memory, and while doing
that, preserve the errno from iconv_close. */
@@ -1059,10 +1150,8 @@ str_iconveh (const char *src,
# endif
cd = iconv_open (to_codeset, from_codeset);
- if (cd == (iconv_t)(-1))
- return NULL;
- if (c_strcasecmp (from_codeset, "UTF-8") == 0)
+ if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
cd1 = (iconv_t)(-1);
else
{
@@ -1070,13 +1159,18 @@ str_iconveh (const char *src,
if (cd1 == (iconv_t)(-1))
{
int saved_errno = errno;
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
return NULL;
}
}
- if (c_strcasecmp (to_codeset, "UTF-8") == 0)
+ if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
+# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
+ || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
+# endif
+ )
cd2 = (iconv_t)(-1);
else
{
@@ -1086,7 +1180,8 @@ str_iconveh (const char *src,
int saved_errno = errno;
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
return NULL;
}
@@ -1102,7 +1197,8 @@ str_iconveh (const char *src,
iconv_close (cd2);
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
errno = saved_errno;
}
else
@@ -1114,7 +1210,8 @@ str_iconveh (const char *src,
int saved_errno = errno;
if (cd1 != (iconv_t)(-1))
iconv_close (cd1);
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
free (result);
errno = saved_errno;
return NULL;
@@ -1124,12 +1221,13 @@ str_iconveh (const char *src,
/* Return NULL, but free the allocated memory, and while doing
that, preserve the errno from iconv_close. */
int saved_errno = errno;
- iconv_close (cd);
+ if (cd != (iconv_t)(-1))
+ iconv_close (cd);
free (result);
errno = saved_errno;
return NULL;
}
- if (iconv_close (cd) < 0)
+ if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
{
/* Return NULL, but free the allocated memory, and while doing
that, preserve the errno from iconv_close. */