1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "striconveh.h"
31 # include "utf8-ucs4-safe.h"
32 # include "ucs4-utf8.h"
37 #include "c-strcase.h"
38 #include "c-strcaseeq.h"
41 # define SIZE_MAX ((size_t) -1)
47 /* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
48 error occurs, we may have to determine the Unicode representation of the
49 inconvertible character. */
51 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
52 a conversion error, and it returns in *INCREMENTED a boolean telling whether
53 it has incremented the input pointers past the error location. */
54 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
55 /* Irix iconv() inserts a NUL byte if it cannot convert.
56 NetBSD iconv() inserts a question mark if it cannot convert.
57 Only GNU libiconv and GNU libc are known to prefer to fail rather
58 than doing a lossy conversion. */
60 iconv_carefully (iconv_t cd,
61 const char **inbuf, size_t *inbytesleft,
62 char **outbuf, size_t *outbytesleft,
65 const char *inptr = *inbuf;
66 const char *inptr_end = inptr + *inbytesleft;
67 char *outptr = *outbuf;
68 size_t outsize = *outbytesleft;
69 const char *inptr_before;
79 for (insize = 1; inptr + insize <= inptr_end; insize++)
82 (ICONV_CONST char **) &inptr, &insize,
84 if (!(res == (size_t)(-1) && errno == EINVAL))
86 /* We expect that no input bytes have been consumed so far. */
87 if (inptr != inptr_before)
94 *outbytesleft = outsize;
97 while (res == 0 && inptr < inptr_end);
100 *inbytesleft = inptr_end - inptr;
101 if (res != (size_t)(-1) && res > 0)
103 /* iconv() has already incremented INPTR. We cannot go back to a
104 previous INPTR, otherwise the state inside CD would become invalid,
105 if FROM_CODESET is a stateful encoding. So, tell the caller that
106 *INBUF has already been incremented. */
107 *incremented = (inptr > inptr_before);
113 *incremented = false;
118 # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
119 (*(incremented) = false, \
120 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
123 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
124 converting one character. */
126 iconv_carefully_1 (iconv_t cd,
127 const char **inbuf, size_t *inbytesleft,
128 char **outbuf, size_t *outbytesleft,
131 const char *inptr = *inbuf;
132 const char *inptr_end = inptr + *inbytesleft;
133 char *outptr = *outbuf;
134 size_t outsize = *outbytesleft;
135 const char *inptr_before = inptr;
136 size_t res = (size_t)(-1);
139 for (insize = 1; inptr + insize <= inptr_end; insize++)
142 (ICONV_CONST char **) &inptr, &insize,
144 if (!(res == (size_t)(-1) && errno == EINVAL))
146 /* We expect that no input bytes have been consumed so far. */
147 if (inptr != inptr_before)
152 *inbytesleft = inptr_end - inptr;
153 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
154 /* Irix iconv() inserts a NUL byte if it cannot convert.
155 NetBSD iconv() inserts a question mark if it cannot convert.
156 Only GNU libiconv and GNU libc are known to prefer to fail rather
157 than doing a lossy conversion. */
158 if (res != (size_t)(-1) && res > 0)
160 /* iconv() has already incremented INPTR. We cannot go back to a
161 previous INPTR, otherwise the state inside CD would become invalid,
162 if FROM_CODESET is a stateful encoding. So, tell the caller that
163 *INBUF has already been incremented. */
164 *incremented = (inptr > inptr_before);
170 if (res != (size_t)(-1))
173 *outbytesleft = outsize;
175 *incremented = false;
180 mem_cd_iconveh_internal (const char *src, size_t srclen,
181 iconv_t cd, iconv_t cd1, iconv_t cd2,
182 enum iconv_ilseq_handler handler,
185 char **resultp, size_t *lengthp)
187 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
188 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
189 Instead, we have to start afresh from the beginning of SRC. */
190 /* Use a temporary buffer, so that for small strings, a single malloc()
191 call will be sufficient. */
192 # define tmpbufsize 4096
193 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
194 libiconv's UCS-4-INTERNAL encoding. */
195 union { unsigned int align; char buf[tmpbufsize]; } tmp;
196 # define tmpbuf tmp.buf
198 char *initial_result;
202 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
204 if (*lengthp >= sizeof (tmpbuf))
206 initial_result = *resultp;
207 allocated = *lengthp;
211 initial_result = tmpbuf;
212 allocated = sizeof (tmpbuf);
214 result = initial_result;
220 for (i = 0; i < srclen; i++)
221 offsets[i] = (size_t)(-1);
223 last_length = (size_t)(-1);
227 /* First, try a direct conversion, and see whether a conversion error occurs at all. */
230 const char *inptr = src;
231 size_t insize = srclen;
233 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
234 # if defined _LIBICONV_VERSION \
235 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
236 /* Set to the initial state. */
237 iconv (cd, NULL, NULL, NULL, NULL);
242 char *outptr = result + length;
243 size_t outsize = allocated - extra_alloc - length;
250 if (length != last_length) /* ensure that offset[] be increasing */
252 offsets[inptr - src] = length;
253 last_length = length;
255 res = iconv_carefully_1 (cd,
261 /* Use iconv_carefully instead of iconv here, because:
262 - If TO_CODESET is UTF-8, we can do the error handling in this
263 loop, no need for a second loop,
264 - With iconv() implementations other than GNU libiconv and GNU
265 libc, if we use iconv() in a big swoop, checking for an E2BIG
266 return, we lose the number of irreversible conversions. */
267 res = iconv_carefully (cd,
272 length = outptr - result;
273 grow = (length + extra_alloc > allocated / 2);
274 if (res == (size_t)(-1))
278 else if (errno == EINVAL)
280 else if (errno == EILSEQ && handler != iconveh_error)
282 if (cd2 == (iconv_t)(-1))
284 /* TO_CODESET is UTF-8. */
285 /* Error handling can produce up to 1 byte of output. */
286 if (length + 1 + extra_alloc > allocated)
290 allocated = 2 * allocated;
291 if (length + 1 + extra_alloc > allocated)
293 if (result == initial_result)
294 memory = (char *) malloc (allocated);
296 memory = (char *) realloc (result, allocated);
299 if (result != initial_result)
304 if (result == initial_result)
305 memcpy (memory, initial_result, length);
309 /* The input is invalid in FROM_CODESET. Eat up one byte
310 and emit a question mark. */
318 result[length] = '?';
326 if (result != initial_result)
328 int saved_errno = errno;
341 allocated = 2 * allocated;
342 if (result == initial_result)
343 memory = (char *) malloc (allocated);
345 memory = (char *) realloc (result, allocated);
348 if (result != initial_result)
353 if (result == initial_result)
354 memcpy (memory, initial_result, length);
360 /* Now get the conversion state back to the initial state.
361 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
362 #if defined _LIBICONV_VERSION \
363 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
366 char *outptr = result + length;
367 size_t outsize = allocated - extra_alloc - length;
370 res = iconv (cd, NULL, NULL, &outptr, &outsize);
371 length = outptr - result;
372 if (res == (size_t)(-1))
378 allocated = 2 * allocated;
379 if (result == initial_result)
380 memory = (char *) malloc (allocated);
382 memory = (char *) realloc (result, allocated);
385 if (result != initial_result)
390 if (result == initial_result)
391 memcpy (memory, initial_result, length);
396 if (result != initial_result)
398 int saved_errno = errno;
410 /* The direct conversion succeeded. */
414 /* The direct conversion failed, handler != iconveh_error,
415 and cd2 != (iconv_t)(-1).
416 Use a conversion through UTF-8. */
421 for (i = 0; i < srclen; i++)
422 offsets[i] = (size_t)(-1);
424 last_length = (size_t)(-1);
428 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
429 char utf8buf[utf8bufsize + 1];
431 const char *in1ptr = src;
432 size_t in1size = srclen;
433 bool do_final_flush1 = true;
434 bool do_final_flush2 = true;
436 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
437 # if defined _LIBICONV_VERSION \
438 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
439 /* Set to the initial state. */
440 if (cd1 != (iconv_t)(-1))
441 iconv (cd1, NULL, NULL, NULL, NULL);
442 iconv (cd2, NULL, NULL, NULL, NULL);
445 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
447 char *out1ptr = utf8buf + utf8len;
448 size_t out1size = utf8bufsize - utf8len;
453 /* Conversion step 1: from FROM_CODESET to UTF-8. */
457 && length != last_length) /* ensure that offset[] be increasing */
459 offsets[in1ptr - src] = length;
460 last_length = length;
462 if (cd1 != (iconv_t)(-1))
465 res1 = iconv_carefully_1 (cd1,
470 res1 = iconv_carefully (cd1,
477 /* FROM_CODESET is UTF-8. */
485 n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
488 && (uint8_t)in1ptr[0] == 0xEF
489 && (uint8_t)in1ptr[1] == 0xBF
490 && (uint8_t)in1ptr[2] == 0xBD))
503 incremented1 = false;
506 m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
511 incremented1 = false;
526 while (offsets == NULL && in1size > 0);
529 else if (do_final_flush1)
531 /* Now get the conversion state of CD1 back to the initial state.
532 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
533 # if defined _LIBICONV_VERSION \
534 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
535 if (cd1 != (iconv_t)(-1))
536 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
540 do_final_flush1 = false;
548 if (res1 == (size_t)(-1)
549 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
551 if (result != initial_result)
553 int saved_errno = errno;
559 if (res1 == (size_t)(-1)
560 && errno == EILSEQ && handler != iconveh_error)
562 /* The input is invalid in FROM_CODESET. Eat up one byte and
563 emit a question mark. Room for the question mark was allocated
564 at the end of utf8buf. */
572 utf8buf[utf8len++] = '?';
575 utf8len = out1ptr - utf8buf;
579 || utf8len > utf8bufsize / 2
580 || (res1 == (size_t)(-1) && errno1 == E2BIG))
582 /* Conversion step 2: from UTF-8 to TO_CODESET. */
583 const char *in2ptr = utf8buf;
584 size_t in2size = utf8len;
587 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
589 char *out2ptr = result + length;
590 size_t out2size = allocated - extra_alloc - length;
596 res2 = iconv_carefully (cd2,
600 else /* in1size == 0 && !do_final_flush1
601 && in2size == 0 && do_final_flush2 */
603 /* Now get the conversion state of CD2 back to the initial
604 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
605 # if defined _LIBICONV_VERSION \
606 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
607 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
611 do_final_flush2 = false;
615 length = out2ptr - result;
616 grow = (length + extra_alloc > allocated / 2);
617 if (res2 == (size_t)(-1))
621 else if (errno == EINVAL)
623 else if (errno == EILSEQ && handler != iconveh_error)
625 /* Error handling can produce up to 10 bytes of ASCII
626 output. But TO_CODESET may be UCS-2, UTF-16 or
627 UCS-4, so use CD2 here as well. */
637 if (u8_prev (&uc, (const uint8_t *) in2ptr,
638 (const uint8_t *) utf8buf)
647 n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
653 if (handler == iconveh_escape_sequence)
655 static char hex[16] = "0123456789ABCDEF";
657 scratchbuf[scratchlen++] = '\\';
659 scratchbuf[scratchlen++] = 'u';
662 scratchbuf[scratchlen++] = 'U';
663 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
664 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
665 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
666 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
668 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
669 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
670 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
671 scratchbuf[scratchlen++] = hex[uc & 15];
682 (ICONV_CONST char **) &inptr, &insize,
683 &out2ptr, &out2size);
684 length = out2ptr - result;
685 if (res == (size_t)(-1) && errno == E2BIG)
689 allocated = 2 * allocated;
690 if (length + 1 + extra_alloc > allocated)
692 if (result == initial_result)
693 memory = (char *) malloc (allocated);
695 memory = (char *) realloc (result, allocated);
698 if (result != initial_result)
703 if (result == initial_result)
704 memcpy (memory, initial_result, length);
708 out2ptr = result + length;
709 out2size = allocated - extra_alloc - length;
711 (ICONV_CONST char **) &inptr, &insize,
712 &out2ptr, &out2size);
713 length = out2ptr - result;
715 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
716 /* Irix iconv() inserts a NUL byte if it cannot convert.
717 NetBSD iconv() inserts a question mark if it cannot convert.
719 Only GNU libiconv and GNU libc are known to prefer
720 to fail rather than doing a lossy conversion. */
721 if (res != (size_t)(-1) && res > 0)
727 if (res == (size_t)(-1))
729 /* Failure converting the ASCII replacement. */
730 if (result != initial_result)
732 int saved_errno = errno;
741 if (result != initial_result)
743 int saved_errno = errno;
751 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
757 allocated = 2 * allocated;
758 if (result == initial_result)
759 memory = (char *) malloc (allocated);
761 memory = (char *) realloc (result, allocated);
764 if (result != initial_result)
769 if (result == initial_result)
770 memcpy (memory, initial_result, length);
775 /* Move the remaining bytes to the beginning of utf8buf. */
777 memmove (utf8buf, in2ptr, in2size);
781 if (res1 == (size_t)(-1))
783 if (errno1 == EINVAL)
785 else if (errno1 == EILSEQ)
787 if (result != initial_result)
798 /* Now the final memory allocation. */
799 if (result == tmpbuf)
803 memory = (char *) malloc (length + extra_alloc);
806 memcpy (memory, tmpbuf, length);
815 else if (result != *resultp && length + extra_alloc < allocated)
817 /* Shrink the allocated memory if possible. */
820 memory = (char *) realloc (result, length + extra_alloc);
832 mem_cd_iconveh (const char *src, size_t srclen,
833 iconv_t cd, iconv_t cd1, iconv_t cd2,
834 enum iconv_ilseq_handler handler,
836 char **resultp, size_t *lengthp)
838 return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
839 offsets, resultp, lengthp);
843 str_cd_iconveh (const char *src,
844 iconv_t cd, iconv_t cd1, iconv_t cd2,
845 enum iconv_ilseq_handler handler)
847 /* For most encodings, a trailing NUL byte in the input will be converted
848 to a trailing NUL byte in the output. But not for UTF-7. So that this
849 function is usable for UTF-7, we have to exclude the NUL byte from the
850 conversion and add it by hand afterwards. */
853 int retval = mem_cd_iconveh_internal (src, strlen (src),
854 cd, cd1, cd2, handler, 1, NULL,
861 int saved_errno = errno;
868 /* Add the terminating NUL byte. */
869 result[length] = '\0';
877 mem_iconveh (const char *src, size_t srclen,
878 const char *from_codeset, const char *to_codeset,
879 enum iconv_ilseq_handler handler,
881 char **resultp, size_t *lengthp)
885 /* Nothing to convert. */
889 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
893 if (*resultp != NULL && *lengthp >= srclen)
897 result = (char *) malloc (srclen);
904 memcpy (result, src, srclen);
919 /* Avoid glibc-2.1 bug with EUC-KR. */
920 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
921 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
922 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
929 cd = iconv_open (to_codeset, from_codeset);
930 if (cd == (iconv_t)(-1))
933 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
937 cd1 = iconv_open ("UTF-8", from_codeset);
938 if (cd1 == (iconv_t)(-1))
940 int saved_errno = errno;
947 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
951 cd2 = iconv_open (to_codeset, "UTF-8");
952 if (cd2 == (iconv_t)(-1))
954 int saved_errno = errno;
955 if (cd1 != (iconv_t)(-1))
965 retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
970 /* Close cd, cd1, cd2, but preserve the errno from mem_cd_iconveh. */
971 int saved_errno = errno;
972 if (cd2 != (iconv_t)(-1))
974 if (cd1 != (iconv_t)(-1))
981 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
983 /* Return -1, but free the allocated memory, and while doing
984 that, preserve the errno from iconv_close. */
985 int saved_errno = errno;
986 if (cd1 != (iconv_t)(-1))
989 if (result != *resultp && result != NULL)
994 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
996 /* Return -1, but free the allocated memory, and while doing
997 that, preserve the errno from iconv_close. */
998 int saved_errno = errno;
1000 if (result != *resultp && result != NULL)
1002 errno = saved_errno;
1005 if (iconv_close (cd) < 0)
1007 /* Return -1, but free the allocated memory, and while doing
1008 that, preserve the errno from iconv_close. */
1009 int saved_errno = errno;
1010 if (result != *resultp && result != NULL)
1012 errno = saved_errno;
1020 /* This is a different error code than if iconv_open existed but didn't
1021 support from_codeset and to_codeset, so that the caller can emit
1022 an error message such as
1023 "iconv() is not supported. Installing GNU libiconv and
1024 then reinstalling this package would fix this." */
1032 str_iconveh (const char *src,
1033 const char *from_codeset, const char *to_codeset,
1034 enum iconv_ilseq_handler handler)
1036 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1038 char *result = strdup (src);
1052 /* Avoid glibc-2.1 bug with EUC-KR. */
1053 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1054 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
1055 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
1062 cd = iconv_open (to_codeset, from_codeset);
1063 if (cd == (iconv_t)(-1))
1066 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1067 cd1 = (iconv_t)(-1);
1070 cd1 = iconv_open ("UTF-8", from_codeset);
1071 if (cd1 == (iconv_t)(-1))
1073 int saved_errno = errno;
1075 errno = saved_errno;
1080 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1081 cd2 = (iconv_t)(-1);
1084 cd2 = iconv_open (to_codeset, "UTF-8");
1085 if (cd2 == (iconv_t)(-1))
1087 int saved_errno = errno;
1088 if (cd1 != (iconv_t)(-1))
1091 errno = saved_errno;
1096 result = str_cd_iconveh (src, cd, cd1, cd2, handler);
1100 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh. */
1101 int saved_errno = errno;
1102 if (cd2 != (iconv_t)(-1))
1104 if (cd1 != (iconv_t)(-1))
1107 errno = saved_errno;
1111 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
1113 /* Return NULL, but free the allocated memory, and while doing
1114 that, preserve the errno from iconv_close. */
1115 int saved_errno = errno;
1116 if (cd1 != (iconv_t)(-1))
1120 errno = saved_errno;
1123 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
1125 /* Return NULL, but free the allocated memory, and while doing
1126 that, preserve the errno from iconv_close. */
1127 int saved_errno = errno;
1130 errno = saved_errno;
1133 if (iconv_close (cd) < 0)
1135 /* Return NULL, but free the allocated memory, and while doing
1136 that, preserve the errno from iconv_close. */
1137 int saved_errno = errno;
1139 errno = saved_errno;
1145 /* This is a different error code than if iconv_open existed but didn't
1146 support from_codeset and to_codeset, so that the caller can emit
1147 an error message such as
1148 "iconv() is not supported. Installing GNU libiconv and
1149 then reinstalling this package would fix this." */