New module 'striconveh'.

author Bruno Haible <bruno@clisp.org>

Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)

committer Bruno Haible <bruno@clisp.org>

Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)
author Bruno Haible <bruno@clisp.org>
Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)
committer Bruno Haible <bruno@clisp.org>
Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)
diff --git a/ChangeLog b/ChangeLog

index 1e0ca0d..ed3916d 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
  2007-01-15  Bruno Haible  <bruno@clisp.org>
  
+       * modules/striconveh: New file.
+       * lib/striconveh.h: New file.
+       * lib/striconveh.c: New file.
+       * MODULES.html.sh (Internationalization functions): Add striconveh.
+
+2007-01-15  Bruno Haible  <bruno@clisp.org>
+
         * lib/striconv.c (str_cd_iconv): Use the first algorithm if iconv is
         not from GNU libiconv or GNU libc.
  
diff --git a/MODULES.html.sh b/MODULES.html.sh

index 68a3152..d1a6684 100755 (executable)
--- a/MODULES.html.sh
+++ b/MODULES.html.sh
@@ -2142,6 +2142,7 @@ func_all_modules ()
    func_module iconv
    func_module striconv
    func_module xstriconv
+  func_module striconveh
    func_module iconvme
    func_module localcharset
    func_module hard-locale
diff --git a/lib/striconveh.c b/lib/striconveh.c

new file mode 100644 (file)

index 0000000..9e916e6
--- /dev/null
+++ b/lib/striconveh.c
@@ -0,0 +1,881 @@
+/* Character set conversion with error handling.
+   Copyright (C) 2001-2007 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "striconveh.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_ICONV
+# include <iconv.h>
+# include "utf8-ucs4-safe.h"
+# include "ucs4-utf8.h"
+# include "unistr.h"
+#endif
+
+#include "strdup.h"
+#include "c-strcase.h"
+
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+
+#if HAVE_ICONV
+
+/* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
+   error occurs, we may have to determine the Unicode representation of the
+   inconvertible character.  */
+
+/* iconv_carefully is like iconv, except that it stops as soon as it encounters
+   a conversion error, and it returns in *INCREMENTED a boolean telling whether
+   it has incremented the input pointers past the error location.  */
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+/* Irix iconv() inserts a NUL byte if it cannot convert.
+   NetBSD iconv() inserts a question mark if it cannot convert.
+   Only GNU libiconv and GNU libc are known to prefer to fail rather
+   than doing a lossy conversion.  */
+static size_t
+iconv_carefully (iconv_t cd,
+                const char **inbuf, size_t *inbytesleft,
+                char **outbuf, size_t *outbytesleft,
+                bool *incremented)
+{
+  /* Work on local copies; the caller's output position is only advanced
+     after a step that iconv() reports as fully reversible.  */
+  const char *src = *inbuf;
+  const char *const src_end = src + *inbytesleft;
+  char *dst = *outbuf;
+  size_t dst_avail = *outbytesleft;
+  const char *step_start;
+  size_t status;
+
+  for (;;)
+    {
+      size_t chunk = 1;
+
+      step_start = src;
+      status = (size_t)(-1);
+
+      /* Offer iconv() 1, 2, 3, ... bytes, until it stops answering EINVAL
+         (incomplete input) or the available input is exhausted.  */
+      while (src + chunk <= src_end)
+       {
+         status = iconv (cd,
+                         (ICONV_CONST char **) &src, &chunk,
+                         &dst, &dst_avail);
+         if (status != (size_t)(-1) || errno != EINVAL)
+           break;
+         /* An incomplete-input report must not have consumed anything.  */
+         if (src != step_start)
+           abort ();
+         chunk++;
+       }
+
+      if (status != 0)
+       break;
+
+      /* Clean step: commit the output position.  */
+      *outbuf = dst;
+      *outbytesleft = dst_avail;
+
+      if (!(src < src_end))
+       break;
+    }
+
+  *inbuf = src;
+  *inbytesleft = src_end - src;
+
+  if (status == (size_t)(-1) || status == 0)
+    {
+      /* Either a genuine iconv() result or plain success: pass it on.  */
+      *incremented = false;
+      return status;
+    }
+
+  /* iconv() returned a positive count, i.e. it performed a lossy conversion.
+     It has already moved SRC forward, and stepping back is not possible: the
+     state kept inside CD would become invalid if FROM_CODESET is a stateful
+     encoding.  Report the situation as EILSEQ and tell the caller whether
+     *INBUF now points past the offending input.  */
+  *incremented = (src > step_start);
+  errno = EILSEQ;
+  return (size_t)(-1);
+}
+# else
+/* GNU libiconv and GNU libc are known to fail rather than doing a lossy
+   conversion, so here iconv_carefully simply forwards to iconv() and always
+   stores false in *INCREMENTED.  The whole expansion is a comma expression
+   whose value is the return value of iconv().  */
+#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
+     (*(incremented) = false, \
+      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
+# endif
+
+static int
+mem_cd_iconveh_internal (const char *src, size_t srclen,
+                        iconv_t cd, iconv_t cd1, iconv_t cd2,
+                        enum iconv_ilseq_handler handler,
+                        size_t extra_alloc,
+                        char **resultp, size_t *lengthp)
+{
+  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
+     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
+     Instead, we have to start afresh from the beginning of SRC.  */
+  /* Use a temporary buffer, so that for small strings, a single malloc()
+     call will be sufficient.  */
+# define tmpbufsize 4096
+  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
+     libiconv's UCS-4-INTERNAL encoding.  */
+  union { unsigned int align; char buf[tmpbufsize]; } tmp;
+# define tmpbuf tmp.buf
+
+  char *result = tmpbuf;
+  size_t allocated = sizeof (tmpbuf);
+  size_t length = 0;
+
+  /* First, try a direct conversion, and see whether a conversion error
+     occurs at all.  */
+  {
+    const char *inptr = src;
+    size_t insize = srclen;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+    while (insize > 0)
+      {
+       char *outptr = result + length;
+       size_t outsize = allocated - extra_alloc - length;
+       bool incremented;
+       size_t res;
+       bool grow;
+
+       /* Use iconv_carefully instead of iconv here, because:
+          - If TO_CODESET is UTF-8, we can do the error handling in this loop,
+            no need for a second loop,
+          - With iconv() implementations other than GNU libiconv and GNU libc,
+            if we use iconv() in a big swoop, checking for an E2BIG return,
+            we lose the number of irreversible conversions.  */
+       res = iconv_carefully (cd,
+                              &inptr, &insize,
+                              &outptr, &outsize,
+                              &incremented);
+
+       length = outptr - result;
+       grow = (length + extra_alloc > allocated / 2);
+       if (res == (size_t)(-1))
+         {
+           if (errno == E2BIG)
+             grow = true;
+           else if (errno == EINVAL)
+             break;
+           else if (errno == EILSEQ && handler != iconveh_error)
+             {
+               if (cd2 == (iconv_t)(-1))
+                 {
+                   /* TO_CODESET is UTF-8.  */
+                   /* Error handling can produce up to 1 byte of output.  */
+                   if (length + 1 + extra_alloc > allocated)
+                     {
+                       char *memory;
+
+                       allocated = 2 * allocated;
+                       if (length + 1 + extra_alloc > allocated)
+                         abort ();
+                       if (result == tmpbuf)
+                         memory = (char *) malloc (allocated);
+                       else
+                         memory = (char *) realloc (result, allocated);
+                       if (memory == NULL)
+                         {
+                           if (result != tmpbuf)
+                             free (result);
+                           errno = ENOMEM;
+                           return -1;
+                         }
+                       if (result == tmpbuf)
+                         memcpy (memory, tmpbuf, length);
+                       result = memory;
+                       grow = false;
+                     }
+                   /* The input is invalid in FROM_CODESET.  Eat up one byte
+                      and emit a question mark.  */
+                   if (!incremented)
+                     {
+                       if (insize == 0)
+                         abort ();
+                       inptr++;
+                       insize--;
+                     }
+                   result[length] = '?';
+                   length++;
+                 }
+               else
+                 goto indirectly;
+             }
+           else
+             {
+               if (result != tmpbuf)
+                 {
+                   int saved_errno = errno;
+                   free (result);
+                   errno = saved_errno;
+                 }
+               return -1;
+             }
+         }
+       if (insize == 0)
+         break;
+       if (grow)
+         {
+           char *memory;
+
+           allocated = 2 * allocated;
+           if (result == tmpbuf)
+             memory = (char *) malloc (allocated);
+           else
+             memory = (char *) realloc (result, allocated);
+           if (memory == NULL)
+             {
+               if (result != tmpbuf)
+                 free (result);
+               errno = ENOMEM;
+               return -1;
+             }
+           if (result == tmpbuf)
+             memcpy (memory, tmpbuf, length);
+           result = memory;
+         }
+      }
+  }
+
+  /* Now get the conversion state back to the initial state.
+     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+#if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+  for (;;)
+    {
+      char *outptr = result + length;
+      size_t outsize = allocated - extra_alloc - length;
+      size_t res;
+
+      res = iconv (cd, NULL, NULL, &outptr, &outsize);
+      length = outptr - result;
+      if (res == (size_t)(-1))
+       {
+         if (errno == E2BIG)
+           {
+             char *memory;
+
+             allocated = 2 * allocated;
+             if (result == tmpbuf)
+               memory = (char *) malloc (allocated);
+             else
+               memory = (char *) realloc (result, allocated);
+             if (memory == NULL)
+               {
+                 if (result != tmpbuf)
+                   free (result);
+                 errno = ENOMEM;
+                 return -1;
+               }
+             if (result == tmpbuf)
+               memcpy (memory, tmpbuf, length);
+             result = memory;
+           }
+         else
+           {
+             if (result != tmpbuf)
+               {
+                 int saved_errno = errno;
+                 free (result);
+                 errno = saved_errno;
+               }
+             return -1;
+           }
+       }
+      else
+       break;
+    }
+#endif
+
+  /* The direct conversion succeeded.  */
+  goto done;
+
+ indirectly:
+  /* The direct conversion failed, handler != iconveh_error,
+     and cd2 != (iconv_t)(-1).
+     Use a conversion through UTF-8.  */
+  length = 0;
+  {
+# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
+    char utf8buf[utf8bufsize + 1];
+    size_t utf8len = 0;
+    const char *in1ptr = src;
+    size_t in1size = srclen;
+    bool do_final_flush1 = true;
+    bool do_final_flush2 = true;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    if (cd1 != (iconv_t)(-1))
+      iconv (cd1, NULL, NULL, NULL, NULL);
+    iconv (cd2, NULL, NULL, NULL, NULL);
+# endif
+
+    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
+      {
+       char *out1ptr = utf8buf + utf8len;
+       size_t out1size = utf8bufsize - utf8len;
+       bool incremented1;
+       size_t res1;
+       int errno1;
+
+       /* Conversion step 1: from FROM_CODESET to UTF-8.  */
+       if (in1size > 0)
+         {
+           if (cd1 != (iconv_t)(-1))
+             res1 = iconv_carefully (cd1,
+                                     (ICONV_CONST char **) &in1ptr, &in1size,
+                                     &out1ptr, &out1size,
+                                     &incremented1);
+           else
+             {
+               /* FROM_CODESET is UTF-8.  */
+               res1 = 0;
+               do
+                 {
+                   ucs4_t uc;
+                   int n;
+                   int m;
+
+                   n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
+                   if (uc == 0xfffd
+                       && !(n >= 3
+                            && (uint8_t)in1ptr[0] == 0xEF
+                            && (uint8_t)in1ptr[1] == 0xBF
+                            && (uint8_t)in1ptr[2] == 0xBD))
+                     {
+                       in1ptr += n;
+                       in1size -= n;
+                       errno = EILSEQ;
+                       res1 = (size_t)(-1);
+                       incremented1 = true;
+                       break;
+                     }
+                   if (out1size == 0)
+                     {
+                       errno = E2BIG;
+                       res1 = (size_t)(-1);
+                       incremented1 = false;
+                       break;
+                     }
+                   m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
+                   if (m == -2)
+                     {
+                       errno = E2BIG;
+                       res1 = (size_t)(-1);
+                       incremented1 = false;
+                       break;
+                     }
+                   in1ptr += n;
+                   in1size -= n;
+                   if (m == -1)
+                     {
+                       errno = EILSEQ;
+                       res1 = (size_t)(-1);
+                       incremented1 = true;
+                       break;
+                     }
+                   out1ptr += m;
+                   out1size -= m;
+                 }
+               while (in1size > 0);
+             }
+         }
+       else if (do_final_flush1)
+         {
+           /* Now get the conversion state of CD1 back to the initial state.
+              But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+           if (cd1 != (iconv_t)(-1))
+             res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
+           else
+# endif
+             res1 = 0;
+           do_final_flush1 = false;
+           incremented1 = true;
+         }
+       else
+         {
+           res1 = 0;
+           incremented1 = true;
+         }
+       if (res1 == (size_t)(-1)
+           && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
+         {
+           if (result != tmpbuf)
+             {
+               int saved_errno = errno;
+               free (result);
+               errno = saved_errno;
+             }
+           return -1;
+         }
+       if (res1 == (size_t)(-1)
+           && errno == EILSEQ && handler != iconveh_error)
+         {
+           /* The input is invalid in FROM_CODESET.  Eat up one byte and
+              emit a question mark.  Room for the question mark was allocated
+              at the end of utf8buf.  */
+           if (!incremented1)
+             {
+               if (in1size == 0)
+                 abort ();
+               in1ptr++;
+               in1size--;
+             }
+           utf8buf[utf8len++] = '?';
+         }
+       errno1 = errno;
+       utf8len = out1ptr - utf8buf;
+
+       if (in1size == 0
+           || utf8len > utf8bufsize / 2
+           || (res1 == (size_t)(-1) && errno1 == E2BIG))
+         {
+           /* Conversion step 2: from UTF-8 to TO_CODESET.  */
+           const char *in2ptr = utf8buf;
+           size_t in2size = utf8len;
+
+           while (in2size > 0
+                  || (in1size == 0 && !do_final_flush1 && do_final_flush2))
+             {
+               char *out2ptr = result + length;
+               size_t out2size = allocated - extra_alloc - length;
+               bool incremented2;
+               size_t res2;
+               bool grow;
+
+               if (in2size > 0)
+                 res2 = iconv_carefully (cd2,
+                                         &in2ptr, &in2size,
+                                         &out2ptr, &out2size,
+                                         &incremented2);
+               else /* in1size == 0 && !do_final_flush1
+                       && in2size == 0 && do_final_flush2 */
+                 {
+                   /* Now get the conversion state of CD1 back to the initial
+                      state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+                   res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
+# else
+                   res2 = 0;
+# endif
+                   do_final_flush2 = false;
+                   incremented2 = true;
+                 }
+
+               length = out2ptr - result;
+               grow = (length + extra_alloc > allocated / 2);
+               if (res2 == (size_t)(-1))
+                 {
+                   if (errno == E2BIG)
+                     grow = true;
+                   else if (errno == EINVAL)
+                     break;
+                   else if (errno == EILSEQ && handler != iconveh_error)
+                     {
+                       /* Error handling can produce up to 10 bytes of ASCII
+                          output.  But TO_CODESET may be UCS-2, UTF-16 or
+                          UCS-4, so use CD2 here as well.  */
+                       char scratchbuf[10];
+                       size_t scratchlen;
+                       ucs4_t uc;
+                       const char *inptr;
+                       size_t insize;
+                       size_t res;
+
+                       if (incremented2)
+                         {
+                           if (u8_prev (&uc, (const uint8_t *) in2ptr,
+                                        (const uint8_t *) utf8buf)
+                               == NULL)
+                             abort ();
+                         }
+                       else
+                         {
+                           int n;
+                           if (in2size == 0)
+                             abort ();
+                           n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
+                                          in2size);
+                           in2ptr += n;
+                           in2size -= n;
+                         }
+
+                       if (handler == iconveh_escape_sequence)
+                         {
+                           static char hex[16] = "0123456789ABCDEF";
+                           scratchlen = 0;
+                           scratchbuf[scratchlen++] = '\\';
+                           if (uc < 0x10000)
+                             scratchbuf[scratchlen++] = 'u';
+                           else
+                             {
+                               scratchbuf[scratchlen++] = 'U';
+                               scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
+                               scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
+                               scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
+                               scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
+                             }
+                           scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
+                           scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
+                           scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
+                           scratchbuf[scratchlen++] = hex[uc & 15];
+                         }
+                       else
+                         {
+                           scratchbuf[0] = '?';
+                           scratchlen = 1;
+                         }
+
+                       inptr = scratchbuf;
+                       insize = scratchlen;
+                       res = iconv (cd2,
+                                    (ICONV_CONST char **) &inptr, &insize,
+                                    &out2ptr, &out2size);
+                       length = out2ptr - result;
+                       if (res == (size_t)(-1) && errno == E2BIG)
+                         {
+                           char *memory;
+
+                           allocated = 2 * allocated;
+                           if (length + 1 + extra_alloc > allocated)
+                             abort ();
+                           if (result == tmpbuf)
+                             memory = (char *) malloc (allocated);
+                           else
+                             memory = (char *) realloc (result, allocated);
+                           if (memory == NULL)
+                             {
+                               if (result != tmpbuf)
+                                 free (result);
+                               errno = ENOMEM;
+                               return -1;
+                             }
+                           if (result == tmpbuf)
+                             memcpy (memory, tmpbuf, length);
+                           result = memory;
+                           grow = false;
+
+                           out2ptr = result + length;
+                           out2size = allocated - extra_alloc - length;
+                           res = iconv (cd2,
+                                        (ICONV_CONST char **) &inptr, &insize,
+                                        &out2ptr, &out2size);
+                           length = out2ptr - result;
+                         }
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+                       /* Irix iconv() inserts a NUL byte if it cannot convert.
+                          NetBSD iconv() inserts a question mark if it cannot
+                          convert.
+                          Only GNU libiconv and GNU libc are known to prefer
+                          to fail rather than doing a lossy conversion.  */
+                       if (res != (size_t)(-1) && res > 0)
+                         {
+                           errno = EILSEQ;
+                           res = (size_t)(-1);
+                         }
+# endif
+                       if (res == (size_t)(-1))
+                         {
+                           /* Failure converting the ASCII replacement.  */
+                           if (result != tmpbuf)
+                             {
+                               int saved_errno = errno;
+                               free (result);
+                               errno = saved_errno;
+                             }
+                           return -1;
+                         }
+                     }
+                   else
+                     {
+                       if (result != tmpbuf)
+                         {
+                           int saved_errno = errno;
+                           free (result);
+                           errno = saved_errno;
+                         }
+                       return -1;
+                     }
+                 }
+               if (!(in2size > 0
+                     || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
+                 break;
+               if (grow)
+                 {
+                   char *memory;
+
+                   allocated = 2 * allocated;
+                   if (result == tmpbuf)
+                     memory = (char *) malloc (allocated);
+                   else
+                     memory = (char *) realloc (result, allocated);
+                   if (memory == NULL)
+                     {
+                       if (result != tmpbuf)
+                         free (result);
+                       errno = ENOMEM;
+                       return -1;
+                     }
+                   if (result == tmpbuf)
+                     memcpy (memory, tmpbuf, length);
+                   result = memory;
+                 }
+             }
+
+           /* Move the remaining bytes to the beginning of utf8buf.  */
+           if (in2size > 0)
+             memmove (utf8buf, in2ptr, in2size);
+           utf8len = in2size;
+         }
+
+       if (res1 == (size_t)(-1))
+         {
+           if (errno1 == EINVAL)
+             in1size = 0;
+           else if (errno1 == EILSEQ)
+             {
+               if (result != tmpbuf)
+                 free (result);
+               errno = errno1;
+               return -1;
+             }
+         }
+      }
+# undef utf8bufsize
+  }
+
+ done:
+  /* Now the final memory allocation.  */
+  if (resultp != NULL)
+    {
+      if (result == tmpbuf)
+       {
+         char *memory;
+
+         memory = (char *) malloc (length + extra_alloc);
+         if (memory != NULL)
+           {
+             memcpy (memory, tmpbuf, length);
+             result = memory;
+           }
+         else
+           {
+             errno = ENOMEM;
+             return -1;
+           }
+       }
+      else if (length + extra_alloc < allocated)
+       {
+         /* Shrink the allocated memory if possible.  */
+         char *memory;
+
+         memory = (char *) realloc (result, length + extra_alloc);
+         if (memory != NULL)
+           result = memory;
+       }
+      *resultp = result;
+    }
+  else
+    {
+      if (result != tmpbuf)
+       free (result);
+    }
+  if (lengthp != NULL)
+    *lengthp = length;
+  return 0;
+# undef tmpbuf
+# undef tmpbufsize
+}
+
+int
+mem_cd_iconveh (const char *src, size_t srclen,
+               iconv_t cd, iconv_t cd1, iconv_t cd2,
+               enum iconv_ilseq_handler handler,
+               char **resultp, size_t *lengthp)
+{
+  /* A memory region needs no spare bytes after the converted data.  */
+  const size_t no_extra_alloc = 0;
+  int status;
+
+  status = mem_cd_iconveh_internal (src, srclen,
+                                   cd, cd1, cd2, handler,
+                                   no_extra_alloc,
+                                   resultp, lengthp);
+  return status;
+}
+
+char *
+str_cd_iconveh (const char *src,
+               iconv_t cd, iconv_t cd1, iconv_t cd2,
+               enum iconv_ilseq_handler handler)
+{
+  /* The terminating NUL byte is deliberately kept out of the conversion:
+     most encodings would map it to a trailing NUL byte in the output, but
+     UTF-7 does not.  To remain usable for UTF-7, convert strlen (SRC) bytes
+     only, reserve one spare byte, and store the terminator by hand.  */
+  char *converted = NULL;
+  size_t converted_len;
+
+  if (mem_cd_iconveh_internal (src, strlen (src),
+                              cd, cd1, cd2, handler, 1,
+                              &converted, &converted_len)
+      < 0)
+    {
+      if (converted != NULL)
+       {
+         /* Release the block without disturbing the reported errno.  */
+         int saved_errno = errno;
+         free (converted);
+         errno = saved_errno;
+       }
+      return NULL;
+    }
+
+  /* The spare byte requested above holds the terminating NUL.  */
+  converted[converted_len] = '\0';
+  return converted;
+}
+
+#endif
+
+char *
+str_iconveh (const char *src,
+            const char *from_codeset, const char *to_codeset,
+            enum iconv_ilseq_handler handler)
+{
+  /* Same codeset name (ignoring case): nothing to convert, just copy.  */
+  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+    return strdup (src);
+
+#if HAVE_ICONV
+  {
+    iconv_t direct_cd;
+    /* (iconv_t)(-1) stands for "this side is already UTF-8".  */
+    iconv_t to_utf8_cd = (iconv_t)(-1);
+    iconv_t from_utf8_cd = (iconv_t)(-1);
+    char *converted;
+    bool close_failed = false;
+    int saved_errno = 0;
+
+    /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
+       || c_strcasecmp (to_codeset, "EUC-KR") == 0)
+      {
+       errno = EINVAL;
+       return NULL;
+      }
+# endif
+
+    /* Descriptor for the direct conversion.  */
+    direct_cd = iconv_open (to_codeset, from_codeset);
+    if (direct_cd == (iconv_t)(-1))
+      return NULL;
+
+    /* Descriptor FROM_CODESET -> UTF-8, unless FROM_CODESET is UTF-8.  */
+    if (c_strcasecmp (from_codeset, "UTF-8") != 0)
+      {
+       to_utf8_cd = iconv_open ("UTF-8", from_codeset);
+       if (to_utf8_cd == (iconv_t)(-1))
+         {
+           saved_errno = errno;
+           iconv_close (direct_cd);
+           errno = saved_errno;
+           return NULL;
+         }
+      }
+
+    /* Descriptor UTF-8 -> TO_CODESET, unless TO_CODESET is UTF-8.  */
+    if (c_strcasecmp (to_codeset, "UTF-8") != 0)
+      {
+       from_utf8_cd = iconv_open (to_codeset, "UTF-8");
+       if (from_utf8_cd == (iconv_t)(-1))
+         {
+           saved_errno = errno;
+           if (to_utf8_cd != (iconv_t)(-1))
+             iconv_close (to_utf8_cd);
+           iconv_close (direct_cd);
+           errno = saved_errno;
+           return NULL;
+         }
+      }
+
+    converted = str_cd_iconveh (src, direct_cd, to_utf8_cd, from_utf8_cd,
+                               handler);
+
+    if (converted == NULL)
+      {
+       /* Close all descriptors, but preserve the errno from
+          str_cd_iconveh.  */
+       saved_errno = errno;
+       if (from_utf8_cd != (iconv_t)(-1))
+         iconv_close (from_utf8_cd);
+       if (to_utf8_cd != (iconv_t)(-1))
+         iconv_close (to_utf8_cd);
+       iconv_close (direct_cd);
+       errno = saved_errno;
+       return NULL;
+      }
+
+    /* Close every descriptor in the order UTF-8->TO, FROM->UTF-8, direct.
+       Remember the errno of the first iconv_close that fails; failures of
+       later ones are ignored.  */
+    if (from_utf8_cd != (iconv_t)(-1) && iconv_close (from_utf8_cd) < 0)
+      {
+       close_failed = true;
+       saved_errno = errno;
+      }
+    if (to_utf8_cd != (iconv_t)(-1) && iconv_close (to_utf8_cd) < 0
+       && !close_failed)
+      {
+       close_failed = true;
+       saved_errno = errno;
+      }
+    if (iconv_close (direct_cd) < 0 && !close_failed)
+      {
+       close_failed = true;
+       saved_errno = errno;
+      }
+
+    if (close_failed)
+      {
+       /* Return NULL, but free the allocated memory, and while doing
+          that, preserve the errno from iconv_close.  */
+       free (converted);
+       errno = saved_errno;
+       return NULL;
+      }
+
+    return converted;
+  }
+#else
+  /* This is a different error code than if iconv_open existed but didn't
+     support from_codeset and to_codeset, so that the caller can emit
+     an error message such as
+       "iconv() is not supported. Installing GNU libiconv and
+        then reinstalling this package would fix this."  */
+  errno = ENOSYS;
+  return NULL;
+#endif
+}
diff --git a/lib/striconveh.h b/lib/striconveh.h

new file mode 100644 (file)

index 0000000..b528e51
--- /dev/null
+++ b/lib/striconveh.h
@@ -0,0 +1,99 @@
+/* Character set conversion with error handling.
+   Copyright (C) 2001-2007 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef _STRICONVEH_H
+#define _STRICONVEH_H
+
+#include <stddef.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Handling of unconvertible characters (the HANDLER argument below).  */
+enum iconv_ilseq_handler
+{
+  iconveh_error,               /* fail: return and set errno = EILSEQ */
+  iconveh_question_mark,       /* use one '?' per unconvertible character */
+  iconveh_escape_sequence      /* use escape sequence \uxxxx or \Uxxxxxxxx */
+};
+
+#if HAVE_ICONV
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is at [SRC,...,SRC+SRCLEN-1].
+   CD is the conversion descriptor from FROM_CODESET to TO_CODESET.
+   CD1 is the conversion descriptor from FROM_CODESET to UTF-8 (or
+   (iconv_t)(-1) if FROM_CODESET is UTF-8).
+   CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
+   if TO_CODESET is UTF-8).
+   HANDLER selects the treatment of unconvertible characters.
+   *RESULTP should initially contain NULL or a malloced memory block.
+   May change the size of the allocated memory block in *RESULTP, storing
+   its new address in *RESULTP and its new length in *LENGTHP.
+   Return value: 0 if successful, with the resulting string stored in
+   *RESULTP and its length in *LENGTHP; otherwise -1 and errno set.  */
+extern int
+       mem_cd_iconveh (const char *src, size_t srclen,
+                      iconv_t cd, iconv_t cd1, iconv_t cd2,
+                      enum iconv_ilseq_handler handler,
+                      char **resultp, size_t *lengthp);
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   CD is the conversion descriptor from FROM_CODESET to TO_CODESET.  Both
+   encodings must use a single NUL byte at the end of the string (i.e. not
+   UCS-2, UCS-4, UTF-16, UTF-32).
+   CD1 is the conversion descriptor from FROM_CODESET to UTF-8 (or
+   (iconv_t)(-1) if FROM_CODESET is UTF-8).
+   CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
+   if TO_CODESET is UTF-8).
+   HANDLER selects the treatment of unconvertible characters.
+   Return value: the freshly malloced, NUL-terminated resulting string if
+   successful, otherwise NULL and errno set.  */
+extern char *
+       str_cd_iconveh (const char *src,
+                      iconv_t cd, iconv_t cd1, iconv_t cd2,
+                      enum iconv_ilseq_handler handler);
+
+#endif
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   FROM_CODESET and TO_CODESET name the two encodings; both must use a single
+   NUL byte at the end of the string (i.e. not UCS-2, UCS-4, UTF-16, UTF-32).
+   HANDLER selects the treatment of unconvertible characters.
+   Return value: the freshly malloced, NUL-terminated resulting string if
+   successful, otherwise NULL and errno set.  */
+extern char *
+       str_iconveh (const char *src,
+                   const char *from_codeset, const char *to_codeset,
+                   enum iconv_ilseq_handler handler);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _STRICONVEH_H */
diff --git a/modules/striconveh b/modules/striconveh

new file mode 100644 (file)

index 0000000..e3649e0
--- /dev/null
+++ b/modules/striconveh
@@ -0,0 +1,38 @@
+Description:
+Character set conversion of strings with error handling, uses iconv.
+
+Files:
+lib/striconveh.h
+lib/striconveh.c
+
+Depends-on:
+stdbool
+iconv
+utf8-ucs4-safe
+ucs4-utf8
+unistr/u8-prev
+unistr/u8-mbtouc
+strdup
+c-strcase
+
+configure.ac:
+if test $gl_cond_libtool = false; then
+  gl_ltlibdeps="$gl_ltlibdeps $LTLIBICONV"
+  gl_libdeps="$gl_libdeps $LIBICONV"
+fi
+
+Makefile.am:
+lib_SOURCES += striconveh.h striconveh.c
+if GL_COND_LIBTOOL
+lib_LDFLAGS += $(LTLIBICONV)
+endif
+
+Include:
+"striconveh.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+
author	Bruno Haible <bruno@clisp.org>
	Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 16 Jan 2007 03:47:23 +0000 (03:47 +0000)
ChangeLog		patch \| blob \| history
MODULES.html.sh		patch \| blob \| history
lib/striconveh.c	[new file with mode: 0644]	patch \| blob
lib/striconveh.h	[new file with mode: 0644]	patch \| blob
modules/striconveh	[new file with mode: 0644]	patch \| blob