1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "striconveh.h"
31 # include "utf8-ucs4.h"
32 # include "ucs4-utf8.h"
36 #include "c-strcase.h"
37 #include "c-strcaseeq.h"
40 # define SIZE_MAX ((size_t) -1)
46 /* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
47 error occurs, we may have to determine the Unicode representation of the
48 inconvertible character. */
50 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
51 a conversion error, and it returns in *INCREMENTED a boolean telling whether
52 it has incremented the input pointers past the error location. */
53 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
54 /* Irix iconv() inserts a NUL byte if it cannot convert.
55 NetBSD iconv() inserts a question mark if it cannot convert.
56 Only GNU libiconv and GNU libc are known to prefer to fail rather
57 than doing a lossy conversion. */
59 iconv_carefully (iconv_t cd,
60 const char **inbuf, size_t *inbytesleft,
61 char **outbuf, size_t *outbytesleft,
64 const char *inptr = *inbuf;
65 const char *inptr_end = inptr + *inbytesleft;
66 char *outptr = *outbuf;
67 size_t outsize = *outbytesleft;
68 const char *inptr_before;
78 for (insize = 1; inptr + insize <= inptr_end; insize++)
81 (ICONV_CONST char **) &inptr, &insize,
83 if (!(res == (size_t)(-1) && errno == EINVAL))
85 /* We expect that no input bytes have been consumed so far. */
86 if (inptr != inptr_before)
93 *outbytesleft = outsize;
96 while (res == 0 && inptr < inptr_end);
99 *inbytesleft = inptr_end - inptr;
100 if (res != (size_t)(-1) && res > 0)
102 /* iconv() has already incremented INPTR. We cannot go back to a
103 previous INPTR, otherwise the state inside CD would become invalid,
104 if FROM_CODESET is a stateful encoding. So, tell the caller that
105 *INBUF has already been incremented. */
106 *incremented = (inptr > inptr_before);
112 *incremented = false;
117 # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
118 (*(incremented) = false, \
119 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
122 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
123 converting one character. */
125 iconv_carefully_1 (iconv_t cd,
126 const char **inbuf, size_t *inbytesleft,
127 char **outbuf, size_t *outbytesleft,
130 const char *inptr = *inbuf;
131 const char *inptr_end = inptr + *inbytesleft;
132 char *outptr = *outbuf;
133 size_t outsize = *outbytesleft;
134 const char *inptr_before = inptr;
135 size_t res = (size_t)(-1);
138 for (insize = 1; inptr + insize <= inptr_end; insize++)
141 (ICONV_CONST char **) &inptr, &insize,
143 if (!(res == (size_t)(-1) && errno == EINVAL))
145 /* We expect that no input bytes have been consumed so far. */
146 if (inptr != inptr_before)
151 *inbytesleft = inptr_end - inptr;
152 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
153 /* Irix iconv() inserts a NUL byte if it cannot convert.
154 NetBSD iconv() inserts a question mark if it cannot convert.
155 Only GNU libiconv and GNU libc are known to prefer to fail rather
156 than doing a lossy conversion. */
157 if (res != (size_t)(-1) && res > 0)
159 /* iconv() has already incremented INPTR. We cannot go back to a
160 previous INPTR, otherwise the state inside CD would become invalid,
161 if FROM_CODESET is a stateful encoding. So, tell the caller that
162 *INBUF has already been incremented. */
163 *incremented = (inptr > inptr_before);
169 if (res != (size_t)(-1))
172 *outbytesleft = outsize;
174 *incremented = false;
179 mem_cd_iconveh_internal (const char *src, size_t srclen,
180 iconv_t cd, iconv_t cd1, iconv_t cd2,
181 enum iconv_ilseq_handler handler,
184 char **resultp, size_t *lengthp)
186 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
187 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
188 Instead, we have to start afresh from the beginning of SRC. */
189 /* Use a temporary buffer, so that for small strings, a single malloc()
190 call will be sufficient. */
191 # define tmpbufsize 4096
192 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
193 libiconv's UCS-4-INTERNAL encoding. */
194 union { unsigned int align; char buf[tmpbufsize]; } tmp;
195 # define tmpbuf tmp.buf
197 char *initial_result;
201 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
203 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
205 initial_result = *resultp;
206 allocated = *lengthp;
210 initial_result = tmpbuf;
211 allocated = sizeof (tmpbuf);
213 result = initial_result;
219 for (i = 0; i < srclen; i++)
220 offsets[i] = (size_t)(-1);
222 last_length = (size_t)(-1);
226 /* First, try a direct conversion, and see whether a conversion error occurs at all. */
229 const char *inptr = src;
230 size_t insize = srclen;
232 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
233 # if defined _LIBICONV_VERSION \
234 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
235 /* Set to the initial state. */
236 iconv (cd, NULL, NULL, NULL, NULL);
241 char *outptr = result + length;
242 size_t outsize = allocated - extra_alloc - length;
249 if (length != last_length) /* ensure that offset[] be increasing */
251 offsets[inptr - src] = length;
252 last_length = length;
254 res = iconv_carefully_1 (cd,
260 /* Use iconv_carefully instead of iconv here, because:
261 - If TO_CODESET is UTF-8, we can do the error handling in this
262 loop, no need for a second loop,
263 - With iconv() implementations other than GNU libiconv and GNU
264 libc, if we use iconv() in a big swoop, checking for an E2BIG
265 return, we lose the number of irreversible conversions. */
266 res = iconv_carefully (cd,
271 length = outptr - result;
272 grow = (length + extra_alloc > allocated / 2);
273 if (res == (size_t)(-1))
277 else if (errno == EINVAL)
279 else if (errno == EILSEQ && handler != iconveh_error)
281 if (cd2 == (iconv_t)(-1))
283 /* TO_CODESET is UTF-8. */
284 /* Error handling can produce up to 1 byte of output. */
285 if (length + 1 + extra_alloc > allocated)
289 allocated = 2 * allocated;
290 if (length + 1 + extra_alloc > allocated)
292 if (result == initial_result)
293 memory = (char *) malloc (allocated);
295 memory = (char *) realloc (result, allocated);
298 if (result != initial_result)
303 if (result == initial_result)
304 memcpy (memory, initial_result, length);
308 /* The input is invalid in FROM_CODESET. Eat up one byte
309 and emit a question mark. */
317 result[length] = '?';
325 if (result != initial_result)
327 int saved_errno = errno;
340 allocated = 2 * allocated;
341 if (result == initial_result)
342 memory = (char *) malloc (allocated);
344 memory = (char *) realloc (result, allocated);
347 if (result != initial_result)
352 if (result == initial_result)
353 memcpy (memory, initial_result, length);
359 /* Now get the conversion state back to the initial state.
360 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
361 #if defined _LIBICONV_VERSION \
362 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
365 char *outptr = result + length;
366 size_t outsize = allocated - extra_alloc - length;
369 res = iconv (cd, NULL, NULL, &outptr, &outsize);
370 length = outptr - result;
371 if (res == (size_t)(-1))
377 allocated = 2 * allocated;
378 if (result == initial_result)
379 memory = (char *) malloc (allocated);
381 memory = (char *) realloc (result, allocated);
384 if (result != initial_result)
389 if (result == initial_result)
390 memcpy (memory, initial_result, length);
395 if (result != initial_result)
397 int saved_errno = errno;
409 /* The direct conversion succeeded. */
413 /* The direct conversion failed, handler != iconveh_error,
414 and cd2 != (iconv_t)(-1).
415 Use a conversion through UTF-8. */
420 for (i = 0; i < srclen; i++)
421 offsets[i] = (size_t)(-1);
423 last_length = (size_t)(-1);
427 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
428 char utf8buf[utf8bufsize + 1];
430 const char *in1ptr = src;
431 size_t in1size = srclen;
432 bool do_final_flush1 = true;
433 bool do_final_flush2 = true;
435 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
436 # if defined _LIBICONV_VERSION \
437 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
438 /* Set to the initial state. */
439 if (cd1 != (iconv_t)(-1))
440 iconv (cd1, NULL, NULL, NULL, NULL);
441 iconv (cd2, NULL, NULL, NULL, NULL);
444 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
446 char *out1ptr = utf8buf + utf8len;
447 size_t out1size = utf8bufsize - utf8len;
452 /* Conversion step 1: from FROM_CODESET to UTF-8. */
456 && length != last_length) /* ensure that offset[] be increasing */
458 offsets[in1ptr - src] = length;
459 last_length = length;
461 if (cd1 != (iconv_t)(-1))
464 res1 = iconv_carefully_1 (cd1,
469 res1 = iconv_carefully (cd1,
476 /* FROM_CODESET is UTF-8. */
484 n = u8_mbtouc (&uc, (const uint8_t *) in1ptr, in1size);
487 && (uint8_t)in1ptr[0] == 0xEF
488 && (uint8_t)in1ptr[1] == 0xBF
489 && (uint8_t)in1ptr[2] == 0xBD))
502 incremented1 = false;
505 m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
510 incremented1 = false;
525 while (offsets == NULL && in1size > 0);
528 else if (do_final_flush1)
530 /* Now get the conversion state of CD1 back to the initial state.
531 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
532 # if defined _LIBICONV_VERSION \
533 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
534 if (cd1 != (iconv_t)(-1))
535 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
539 do_final_flush1 = false;
547 if (res1 == (size_t)(-1)
548 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
550 if (result != initial_result)
552 int saved_errno = errno;
558 if (res1 == (size_t)(-1)
559 && errno == EILSEQ && handler != iconveh_error)
561 /* The input is invalid in FROM_CODESET. Eat up one byte and
562 emit a question mark. Room for the question mark was allocated
563 at the end of utf8buf. */
571 utf8buf[utf8len++] = '?';
574 utf8len = out1ptr - utf8buf;
578 || utf8len > utf8bufsize / 2
579 || (res1 == (size_t)(-1) && errno1 == E2BIG))
581 /* Conversion step 2: from UTF-8 to TO_CODESET. */
582 const char *in2ptr = utf8buf;
583 size_t in2size = utf8len;
586 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
588 char *out2ptr = result + length;
589 size_t out2size = allocated - extra_alloc - length;
595 res2 = iconv_carefully (cd2,
599 else /* in1size == 0 && !do_final_flush1
600 && in2size == 0 && do_final_flush2 */
602 /* Now get the conversion state of CD2 back to the initial
603 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
604 # if defined _LIBICONV_VERSION \
605 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
606 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
610 do_final_flush2 = false;
614 length = out2ptr - result;
615 grow = (length + extra_alloc > allocated / 2);
616 if (res2 == (size_t)(-1))
620 else if (errno == EINVAL)
622 else if (errno == EILSEQ && handler != iconveh_error)
624 /* Error handling can produce up to 10 bytes of ASCII
625 output. But TO_CODESET may be UCS-2, UTF-16 or
626 UCS-4, so use CD2 here as well. */
636 if (u8_prev (&uc, (const uint8_t *) in2ptr,
637 (const uint8_t *) utf8buf)
646 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
652 if (handler == iconveh_escape_sequence)
654 static char hex[16] = "0123456789ABCDEF";
656 scratchbuf[scratchlen++] = '\\';
658 scratchbuf[scratchlen++] = 'u';
661 scratchbuf[scratchlen++] = 'U';
662 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
663 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
664 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
665 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
667 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
668 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
669 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
670 scratchbuf[scratchlen++] = hex[uc & 15];
681 (ICONV_CONST char **) &inptr, &insize,
682 &out2ptr, &out2size);
683 length = out2ptr - result;
684 if (res == (size_t)(-1) && errno == E2BIG)
688 allocated = 2 * allocated;
689 if (length + 1 + extra_alloc > allocated)
691 if (result == initial_result)
692 memory = (char *) malloc (allocated);
694 memory = (char *) realloc (result, allocated);
697 if (result != initial_result)
702 if (result == initial_result)
703 memcpy (memory, initial_result, length);
707 out2ptr = result + length;
708 out2size = allocated - extra_alloc - length;
710 (ICONV_CONST char **) &inptr, &insize,
711 &out2ptr, &out2size);
712 length = out2ptr - result;
714 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
715 /* Irix iconv() inserts a NUL byte if it cannot convert.
716 NetBSD iconv() inserts a question mark if it cannot convert.
718 Only GNU libiconv and GNU libc are known to prefer
719 to fail rather than doing a lossy conversion. */
720 if (res != (size_t)(-1) && res > 0)
726 if (res == (size_t)(-1))
728 /* Failure converting the ASCII replacement. */
729 if (result != initial_result)
731 int saved_errno = errno;
740 if (result != initial_result)
742 int saved_errno = errno;
750 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
756 allocated = 2 * allocated;
757 if (result == initial_result)
758 memory = (char *) malloc (allocated);
760 memory = (char *) realloc (result, allocated);
763 if (result != initial_result)
768 if (result == initial_result)
769 memcpy (memory, initial_result, length);
774 /* Move the remaining bytes to the beginning of utf8buf. */
776 memmove (utf8buf, in2ptr, in2size);
780 if (res1 == (size_t)(-1))
782 if (errno1 == EINVAL)
784 else if (errno1 == EILSEQ)
786 if (result != initial_result)
797 /* Now the final memory allocation. */
798 if (result == tmpbuf)
802 memory = (char *) malloc (length + extra_alloc);
805 memcpy (memory, tmpbuf, length);
814 else if (result != *resultp && length + extra_alloc < allocated)
816 /* Shrink the allocated memory if possible. */
819 memory = (char *) realloc (result, length + extra_alloc);
831 mem_cd_iconveh (const char *src, size_t srclen,
832 iconv_t cd, iconv_t cd1, iconv_t cd2,
833 enum iconv_ilseq_handler handler,
835 char **resultp, size_t *lengthp)
837 return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
838 offsets, resultp, lengthp);
842 str_cd_iconveh (const char *src,
843 iconv_t cd, iconv_t cd1, iconv_t cd2,
844 enum iconv_ilseq_handler handler)
846 /* For most encodings, a trailing NUL byte in the input will be converted
847 to a trailing NUL byte in the output. But not for UTF-7. So that this
848 function is usable for UTF-7, we have to exclude the NUL byte from the
849 conversion and add it by hand afterwards. */
852 int retval = mem_cd_iconveh_internal (src, strlen (src),
853 cd, cd1, cd2, handler, 1, NULL,
860 int saved_errno = errno;
867 /* Add the terminating NUL byte. */
868 result[length] = '\0';
876 mem_iconveh (const char *src, size_t srclen,
877 const char *from_codeset, const char *to_codeset,
878 enum iconv_ilseq_handler handler,
880 char **resultp, size_t *lengthp)
884 /* Nothing to convert. */
888 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
892 if (*resultp != NULL && *lengthp >= srclen)
896 result = (char *) malloc (srclen);
903 memcpy (result, src, srclen);
918 /* Avoid glibc-2.1 bug with EUC-KR. */
919 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
920 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
921 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
928 cd = iconv_open (to_codeset, from_codeset);
929 if (cd == (iconv_t)(-1))
932 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
936 cd1 = iconv_open ("UTF-8", from_codeset);
937 if (cd1 == (iconv_t)(-1))
939 int saved_errno = errno;
946 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
950 cd2 = iconv_open (to_codeset, "UTF-8");
951 if (cd2 == (iconv_t)(-1))
953 int saved_errno = errno;
954 if (cd1 != (iconv_t)(-1))
964 retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
969 /* Close cd, cd1, cd2, but preserve the errno from mem_cd_iconveh. */
970 int saved_errno = errno;
971 if (cd2 != (iconv_t)(-1))
973 if (cd1 != (iconv_t)(-1))
980 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
982 /* Return -1, but free the allocated memory, and while doing
983 that, preserve the errno from iconv_close. */
984 int saved_errno = errno;
985 if (cd1 != (iconv_t)(-1))
988 if (result != *resultp && result != NULL)
993 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
995 /* Return -1, but free the allocated memory, and while doing
996 that, preserve the errno from iconv_close. */
997 int saved_errno = errno;
999 if (result != *resultp && result != NULL)
1001 errno = saved_errno;
1004 if (iconv_close (cd) < 0)
1006 /* Return -1, but free the allocated memory, and while doing
1007 that, preserve the errno from iconv_close. */
1008 int saved_errno = errno;
1009 if (result != *resultp && result != NULL)
1011 errno = saved_errno;
1019 /* This is a different error code than if iconv_open existed but didn't
1020 support from_codeset and to_codeset, so that the caller can emit
1021 an error message such as
1022 "iconv() is not supported. Installing GNU libiconv and
1023 then reinstalling this package would fix this." */
1031 str_iconveh (const char *src,
1032 const char *from_codeset, const char *to_codeset,
1033 enum iconv_ilseq_handler handler)
1035 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1037 char *result = strdup (src);
1051 /* Avoid glibc-2.1 bug with EUC-KR. */
1052 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1053 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
1054 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
1061 cd = iconv_open (to_codeset, from_codeset);
1062 if (cd == (iconv_t)(-1))
1065 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1066 cd1 = (iconv_t)(-1);
1069 cd1 = iconv_open ("UTF-8", from_codeset);
1070 if (cd1 == (iconv_t)(-1))
1072 int saved_errno = errno;
1074 errno = saved_errno;
1079 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1080 cd2 = (iconv_t)(-1);
1083 cd2 = iconv_open (to_codeset, "UTF-8");
1084 if (cd2 == (iconv_t)(-1))
1086 int saved_errno = errno;
1087 if (cd1 != (iconv_t)(-1))
1090 errno = saved_errno;
1095 result = str_cd_iconveh (src, cd, cd1, cd2, handler);
1099 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh. */
1100 int saved_errno = errno;
1101 if (cd2 != (iconv_t)(-1))
1103 if (cd1 != (iconv_t)(-1))
1106 errno = saved_errno;
1110 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
1112 /* Return NULL, but free the allocated memory, and while doing
1113 that, preserve the errno from iconv_close. */
1114 int saved_errno = errno;
1115 if (cd1 != (iconv_t)(-1))
1119 errno = saved_errno;
1122 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
1124 /* Return NULL, but free the allocated memory, and while doing
1125 that, preserve the errno from iconv_close. */
1126 int saved_errno = errno;
1129 errno = saved_errno;
1132 if (iconv_close (cd) < 0)
1134 /* Return NULL, but free the allocated memory, and while doing
1135 that, preserve the errno from iconv_close. */
1136 int saved_errno = errno;
1138 errno = saved_errno;
1144 /* This is a different error code than if iconv_open existed but didn't
1145 support from_codeset and to_codeset, so that the caller can emit
1146 an error message such as
1147 "iconv() is not supported. Installing GNU libiconv and
1148 then reinstalling this package would fix this." */