Do an indirect conversion if iconv_open does not support a direct conversion.
[gnulib.git] / lib / striconveh.c
index b02a182..2b67305 100644 (file)
 
 #if HAVE_ICONV
 # include <iconv.h>
-# include "utf8-ucs4-safe.h"
-# include "ucs4-utf8.h"
 # include "unistr.h"
 #endif
 
-#include "strdup.h"
 #include "c-strcase.h"
+#include "c-strcaseeq.h"
 
 #ifndef SIZE_MAX
 # define SIZE_MAX ((size_t) -1)
@@ -119,11 +117,141 @@ iconv_carefully (iconv_t cd,
       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
 # endif
 
+/* Like iconv_carefully, but stops after converting one character: iconv is
+   offered 1, 2, 3, ... input bytes until it stops failing with EINVAL.  */
+static size_t
+iconv_carefully_1 (iconv_t cd,
+                  const char **inbuf, size_t *inbytesleft,
+                  char **outbuf, size_t *outbytesleft,
+                  bool *incremented)
+{
+  const char *inptr = *inbuf;
+  const char *inptr_end = inptr + *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  const char *inptr_before = inptr; /* to detect consumed input */
+  size_t res = (size_t)(-1); /* stays -1 if the loop never runs */
+  size_t insize; /* number of bytes offered to iconv */
+
+  for (insize = 1; inptr + insize <= inptr_end; insize++)
+    {
+      res = iconv (cd,
+                  (ICONV_CONST char **) &inptr, &insize,
+                  &outptr, &outsize);
+      if (!(res == (size_t)(-1) && errno == EINVAL))
+       break;
+      /* Incomplete input (EINVAL): expect that no bytes were consumed yet.  */
+      if (inptr != inptr_before)
+       abort ();
+    }
+
+  *inbuf = inptr; /* input position is stored back even on failure */
+  *inbytesleft = inptr_end - inptr;
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+  /* Irix iconv() inserts a NUL byte if it cannot convert.
+     NetBSD iconv() inserts a question mark if it cannot convert.
+     Only GNU libiconv and GNU libc are known to prefer to fail rather
+     than doing a lossy conversion.  */
+  if (res != (size_t)(-1) && res > 0)
+    {
+      /* iconv() has already incremented INPTR.  We cannot go back to a
+        previous INPTR, otherwise the state inside CD would become invalid,
+        if FROM_CODESET is a stateful encoding.  So, tell the caller that
+        *INBUF has already been incremented.  */
+      *incremented = (inptr > inptr_before);
+      errno = EILSEQ;
+      return (size_t)(-1);
+    }
+# endif
+
+  if (res != (size_t)(-1)) /* output position is stored only on success */
+    {
+      *outbuf = outptr;
+      *outbytesleft = outsize;
+    }
+  *incremented = false;
+  return res;
+}
+
+/* utf8conv_carefully is like iconv, except that
+     - it converts from UTF-8 to UTF-8, re-encoding each decoded character,
+     - it stops at the first error, returning (size_t)(-1) with errno set to
+       EINVAL, EILSEQ or E2BIG, and stores in *INCREMENTED whether the input
+       pointers were advanced past the error location,
+     - if one_character_only is true, it stops after converting one
+       character.  Returns 0 on success; *INCREMENTED is then not written.  */
+static size_t
+utf8conv_carefully (bool one_character_only,
+                   const char **inbuf, size_t *inbytesleft,
+                   char **outbuf, size_t *outbytesleft,
+                   bool *incremented)
+{
+  const char *inptr = *inbuf;
+  size_t insize = *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  size_t res;
+
+  res = 0;
+  do /* runs at least once; the visible callers pass nonempty input */
+    {
+      ucs4_t uc;
+      int n; /* input bytes occupied by UC */
+      int m; /* output bytes written for UC */
+
+      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
+      if (n < 0)
+       {
+         errno = (n == -2 ? EINVAL : EILSEQ); /* -2: incomplete input */
+         n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
+         inptr += n; /* step over the offending bytes */
+         insize -= n;
+         res = (size_t)(-1);
+         *incremented = true;
+         break;
+       }
+      if (outsize == 0) /* no room left at all */
+       {
+         errno = E2BIG;
+         res = (size_t)(-1);
+         *incremented = false;
+         break;
+       }
+      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
+      if (m == -2) /* reported as E2BIG; input not consumed */
+       {
+         errno = E2BIG;
+         res = (size_t)(-1);
+         *incremented = false;
+         break;
+       }
+      inptr += n; /* consumed even if UC turns out not to be encodable */
+      insize -= n;
+      if (m == -1)
+       {
+         errno = EILSEQ;
+         res = (size_t)(-1);
+         *incremented = true;
+         break;
+       }
+      outptr += m;
+      outsize -= m;
+    }
+  while (!one_character_only && insize > 0);
+
+  *inbuf = inptr; /* all four cursors are stored back, even on error */
+  *inbytesleft = insize;
+  *outbuf = outptr;
+  *outbytesleft = outsize;
+  return res;
+}
+
 static int
 mem_cd_iconveh_internal (const char *src, size_t srclen,
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
                         enum iconv_ilseq_handler handler,
                         size_t extra_alloc,
+                        size_t *offsets,
                         char **resultp, size_t *lengthp)
 {
   /* When a conversion error occurs, we cannot start using CD1 and CD2 at
@@ -141,8 +269,9 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
   char *result;
   size_t allocated;
   size_t length;
+  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
 
-  if (*lengthp >= sizeof (tmpbuf))
+  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
     {
       initial_result = *resultp;
       allocated = *lengthp;
@@ -153,6 +282,20 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
       allocated = sizeof (tmpbuf);
     }
   result = initial_result;
+
+  /* Test whether a direct conversion is possible at all.  */
+  if (cd == (iconv_t)(-1))
+    goto indirectly;
+
+  if (offsets != NULL)
+    {
+      size_t i;
+
+      for (i = 0; i < srclen; i++)
+       offsets[i] = (size_t)(-1);
+
+      last_length = (size_t)(-1);
+    }
   length = 0;
 
   /* First, try a direct conversion, and see whether a conversion error
@@ -176,16 +319,29 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
        size_t res;
        bool grow;
 
-       /* Use iconv_carefully instead of iconv here, because:
-          - If TO_CODESET is UTF-8, we can do the error handling in this loop,
-            no need for a second loop,
-          - With iconv() implementations other than GNU libiconv and GNU libc,
-            if we use iconv() in a big swoop, checking for an E2BIG return,
-            we lose the number of irreversible conversions.  */
-       res = iconv_carefully (cd,
-                              &inptr, &insize,
-                              &outptr, &outsize,
-                              &incremented);
+       if (offsets != NULL)
+         {
+           if (length != last_length) /* ensure that offset[] be increasing */
+             {
+               offsets[inptr - src] = length;
+               last_length = length;
+             }
+           res = iconv_carefully_1 (cd,
+                                    &inptr, &insize,
+                                    &outptr, &outsize,
+                                    &incremented);
+         }
+       else
+         /* Use iconv_carefully instead of iconv here, because:
+            - If TO_CODESET is UTF-8, we can do the error handling in this
+              loop, no need for a second loop,
+            - With iconv() implementations other than GNU libiconv and GNU
+              libc, if we use iconv() in a big swoop, checking for an E2BIG
+              return, we lose the number of irreversible conversions.  */
+         res = iconv_carefully (cd,
+                                &inptr, &insize,
+                                &outptr, &outsize,
+                                &incremented);
 
        length = outptr - result;
        grow = (length + extra_alloc > allocated / 2);
@@ -329,11 +485,20 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
   goto done;
 
  indirectly:
-  /* The direct conversion failed, handler != iconveh_error,
-     and cd2 != (iconv_t)(-1).
+  /* The direct conversion failed.
      Use a conversion through UTF-8.  */
+  if (offsets != NULL)
+    {
+      size_t i;
+
+      for (i = 0; i < srclen; i++)
+       offsets[i] = (size_t)(-1);
+
+      last_length = (size_t)(-1);
+    }
   length = 0;
   {
+    const bool slowly = (offsets != NULL || handler == iconveh_error);
 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
     char utf8buf[utf8bufsize + 1];
     size_t utf8len = 0;
@@ -348,7 +513,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
     /* Set to the initial state.  */
     if (cd1 != (iconv_t)(-1))
       iconv (cd1, NULL, NULL, NULL, NULL);
-    iconv (cd2, NULL, NULL, NULL, NULL);
+    if (cd2 != (iconv_t)(-1))
+      iconv (cd2, NULL, NULL, NULL, NULL);
 # endif
 
     while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
@@ -362,63 +528,32 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        if (in1size > 0)
          {
+           if (offsets != NULL
+               && length != last_length) /* ensure that offset[] be increasing */
+             {
+               offsets[in1ptr - src] = length;
+               last_length = length;
+             }
            if (cd1 != (iconv_t)(-1))
-             res1 = iconv_carefully (cd1,
-                                     (ICONV_CONST char **) &in1ptr, &in1size,
-                                     &out1ptr, &out1size,
-                                     &incremented1);
+             {
+               if (slowly)
+                 res1 = iconv_carefully_1 (cd1,
+                                           &in1ptr, &in1size,
+                                           &out1ptr, &out1size,
+                                           &incremented1);
+               else
+                 res1 = iconv_carefully (cd1,
+                                         &in1ptr, &in1size,
+                                         &out1ptr, &out1size,
+                                         &incremented1);
+             }
            else
              {
                /* FROM_CODESET is UTF-8.  */
-               res1 = 0;
-               do
-                 {
-                   ucs4_t uc;
-                   int n;
-                   int m;
-
-                   n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
-                   if (uc == 0xfffd
-                       && !(n >= 3
-                            && (uint8_t)in1ptr[0] == 0xEF
-                            && (uint8_t)in1ptr[1] == 0xBF
-                            && (uint8_t)in1ptr[2] == 0xBD))
-                     {
-                       in1ptr += n;
-                       in1size -= n;
-                       errno = EILSEQ;
-                       res1 = (size_t)(-1);
-                       incremented1 = true;
-                       break;
-                     }
-                   if (out1size == 0)
-                     {
-                       errno = E2BIG;
-                       res1 = (size_t)(-1);
-                       incremented1 = false;
-                       break;
-                     }
-                   m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
-                   if (m == -2)
-                     {
-                       errno = E2BIG;
-                       res1 = (size_t)(-1);
-                       incremented1 = false;
-                       break;
-                     }
-                   in1ptr += n;
-                   in1size -= n;
-                   if (m == -1)
-                     {
-                       errno = EILSEQ;
-                       res1 = (size_t)(-1);
-                       incremented1 = true;
-                       break;
-                     }
-                   out1ptr += m;
-                   out1size -= m;
-                 }
-               while (in1size > 0);
+               res1 = utf8conv_carefully (slowly,
+                                          &in1ptr, &in1size,
+                                          &out1ptr, &out1size,
+                                          &incremented1);
              }
          }
        else if (do_final_flush1)
@@ -469,7 +604,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
        errno1 = errno;
        utf8len = out1ptr - utf8buf;
 
-       if (in1size == 0
+       if (offsets != NULL
+           || in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
@@ -487,10 +623,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                bool grow;
 
                if (in2size > 0)
-                 res2 = iconv_carefully (cd2,
-                                         &in2ptr, &in2size,
-                                         &out2ptr, &out2size,
-                                         &incremented2);
+                 {
+                   if (cd2 != (iconv_t)(-1))
+                     res2 = iconv_carefully (cd2,
+                                             &in2ptr, &in2size,
+                                             &out2ptr, &out2size,
+                                             &incremented2);
+                   else
+                     /* TO_CODESET is UTF-8.  */
+                     res2 = utf8conv_carefully (false,
+                                                &in2ptr, &in2size,
+                                                &out2ptr, &out2size,
+                                                &incremented2);
+                 }
                else /* in1size == 0 && !do_final_flush1
                        && in2size == 0 && do_final_flush2 */
                  {
@@ -498,10 +643,11 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 # if defined _LIBICONV_VERSION \
      || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
-                   res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
-# else
-                   res2 = 0;
+                   if (cd2 != (iconv_t)(-1))
+                     res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
+                   else
 # endif
+                     res2 = 0;
                    do_final_flush2 = false;
                    incremented2 = true;
                  }
@@ -538,8 +684,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                            int n;
                            if (in2size == 0)
                              abort ();
-                           n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
-                                          in2size);
+                           n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
+                                                 in2size);
                            in2ptr += n;
                            in2size -= n;
                          }
@@ -572,9 +718,28 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
 
                        inptr = scratchbuf;
                        insize = scratchlen;
-                       res = iconv (cd2,
-                                    (ICONV_CONST char **) &inptr, &insize,
-                                    &out2ptr, &out2size);
+                       if (cd2 != (iconv_t)(-1))
+                         res = iconv (cd2,
+                                      (ICONV_CONST char **) &inptr, &insize,
+                                      &out2ptr, &out2size);
+                       else
+                         {
+                           /* TO_CODESET is UTF-8.  */
+                           if (out2size >= insize)
+                             {
+                               memcpy (out2ptr, inptr, insize);
+                               out2ptr += insize;
+                               out2size -= insize;
+                               inptr += insize;
+                               insize = 0;
+                               res = 0;
+                             }
+                           else
+                             {
+                               errno = E2BIG;
+                               res = (size_t)(-1);
+                             }
+                         }
                        length = out2ptr - result;
                        if (res == (size_t)(-1) && errno == E2BIG)
                          {
@@ -601,9 +766,23 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
 
                            out2ptr = result + length;
                            out2size = allocated - extra_alloc - length;
-                           res = iconv (cd2,
-                                        (ICONV_CONST char **) &inptr, &insize,
-                                        &out2ptr, &out2size);
+                           if (cd2 != (iconv_t)(-1))
+                             res = iconv (cd2,
+                                          (ICONV_CONST char **) &inptr,
+                                          &insize,
+                                          &out2ptr, &out2size);
+                           else
+                             {
+                               /* TO_CODESET is UTF-8.  */
+                               if (!(out2size >= insize))
+                                 abort ();
+                               memcpy (out2ptr, inptr, insize);
+                               out2ptr += insize;
+                               out2size -= insize;
+                               inptr += insize;
+                               insize = 0;
+                               res = 0;
+                             }
                            length = out2ptr - result;
                          }
 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
@@ -726,10 +905,11 @@ int
 mem_cd_iconveh (const char *src, size_t srclen,
                iconv_t cd, iconv_t cd1, iconv_t cd2,
                enum iconv_ilseq_handler handler,
+               size_t *offsets,
                char **resultp, size_t *lengthp)
 {
   return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
-                                 resultp, lengthp);
+                                 offsets, resultp, lengthp);
 }
 
 char *
@@ -744,7 +924,7 @@ str_cd_iconveh (const char *src,
   char *result = NULL;
   size_t length = 0;
   int retval = mem_cd_iconveh_internal (src, strlen (src),
-                                       cd, cd1, cd2, handler, 1,
+                                       cd, cd1, cd2, handler, 1, NULL,
                                        &result, &length);
 
   if (retval < 0)
@@ -770,6 +950,7 @@ int
 mem_iconveh (const char *src, size_t srclen,
             const char *from_codeset, const char *to_codeset,
             enum iconv_ilseq_handler handler,
+            size_t *offsets,
             char **resultp, size_t *lengthp)
 {
   if (srclen == 0)
@@ -778,7 +959,7 @@ mem_iconveh (const char *src, size_t srclen,
       *lengthp = 0;
       return 0;
     }
-  else if (c_strcasecmp (from_codeset, to_codeset) == 0)
+  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
     {
       char *result;
 
@@ -819,10 +1000,8 @@ mem_iconveh (const char *src, size_t srclen,
 # endif
 
       cd = iconv_open (to_codeset, from_codeset);
-      if (cd == (iconv_t)(-1))
-       return -1;
 
-      if (c_strcasecmp (from_codeset, "UTF-8") == 0)
+      if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
        cd1 = (iconv_t)(-1);
       else
        {
@@ -830,13 +1009,14 @@ mem_iconveh (const char *src, size_t srclen,
          if (cd1 == (iconv_t)(-1))
            {
              int saved_errno = errno;
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              errno = saved_errno;
              return -1;
            }
        }
 
-      if (c_strcasecmp (to_codeset, "UTF-8") == 0)
+      if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
        cd2 = (iconv_t)(-1);
       else
        {
@@ -846,7 +1026,8 @@ mem_iconveh (const char *src, size_t srclen,
              int saved_errno = errno;
              if (cd1 != (iconv_t)(-1))
                iconv_close (cd1);
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              errno = saved_errno;
              return -1;
            }
@@ -854,8 +1035,8 @@ mem_iconveh (const char *src, size_t srclen,
 
       result = *resultp;
       length = *lengthp;
-      retval =
-       mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, &result, &length);
+      retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
+                              &result, &length);
 
       if (retval < 0)
        {
@@ -865,7 +1046,8 @@ mem_iconveh (const char *src, size_t srclen,
            iconv_close (cd2);
          if (cd1 != (iconv_t)(-1))
            iconv_close (cd1);
-         iconv_close (cd);
+         if (cd != (iconv_t)(-1))
+           iconv_close (cd);
          errno = saved_errno;
        }
       else
@@ -877,7 +1059,8 @@ mem_iconveh (const char *src, size_t srclen,
              int saved_errno = errno;
              if (cd1 != (iconv_t)(-1))
                iconv_close (cd1);
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              if (result != *resultp && result != NULL)
                free (result);
              errno = saved_errno;
@@ -888,13 +1071,14 @@ mem_iconveh (const char *src, size_t srclen,
              /* Return -1, but free the allocated memory, and while doing
                 that, preserve the errno from iconv_close.  */
              int saved_errno = errno;
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              if (result != *resultp && result != NULL)
                free (result);
              errno = saved_errno;
              return -1;
            }
-         if (iconv_close (cd) < 0)
+         if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
            {
              /* Return -1, but free the allocated memory, and while doing
                 that, preserve the errno from iconv_close.  */
@@ -952,10 +1136,8 @@ str_iconveh (const char *src,
 # endif
 
       cd = iconv_open (to_codeset, from_codeset);
-      if (cd == (iconv_t)(-1))
-       return NULL;
 
-      if (c_strcasecmp (from_codeset, "UTF-8") == 0)
+      if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
        cd1 = (iconv_t)(-1);
       else
        {
@@ -963,13 +1145,14 @@ str_iconveh (const char *src,
          if (cd1 == (iconv_t)(-1))
            {
              int saved_errno = errno;
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              errno = saved_errno;
              return NULL;
            }
        }
 
-      if (c_strcasecmp (to_codeset, "UTF-8") == 0)
+      if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
        cd2 = (iconv_t)(-1);
       else
        {
@@ -979,7 +1162,8 @@ str_iconveh (const char *src,
              int saved_errno = errno;
              if (cd1 != (iconv_t)(-1))
                iconv_close (cd1);
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              errno = saved_errno;
              return NULL;
            }
@@ -995,7 +1179,8 @@ str_iconveh (const char *src,
            iconv_close (cd2);
          if (cd1 != (iconv_t)(-1))
            iconv_close (cd1);
-         iconv_close (cd);
+         if (cd != (iconv_t)(-1))
+           iconv_close (cd);
          errno = saved_errno;
        }
       else
@@ -1007,7 +1192,8 @@ str_iconveh (const char *src,
              int saved_errno = errno;
              if (cd1 != (iconv_t)(-1))
                iconv_close (cd1);
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              free (result);
              errno = saved_errno;
              return NULL;
@@ -1017,12 +1203,13 @@ str_iconveh (const char *src,
              /* Return NULL, but free the allocated memory, and while doing
                 that, preserve the errno from iconv_close.  */
              int saved_errno = errno;
-             iconv_close (cd);
+             if (cd != (iconv_t)(-1))
+               iconv_close (cd);
              free (result);
              errno = saved_errno;
              return NULL;
            }
-         if (iconv_close (cd) < 0)
+         if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
            {
              /* Return NULL, but free the allocated memory, and while doing
                 that, preserve the errno from iconv_close.  */