1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "striconveh.h"
34 #include "c-strcase.h"
35 #include "c-strcaseeq.h"
38 # define SIZE_MAX ((size_t) -1)
44 /* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
45 error occurs, we may have to determine the Unicode representation of the
46 inconvertible character. */
48 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
49 a conversion error, and it returns in *INCREMENTED a boolean telling whether
50 it has incremented the input pointers past the error location. */
/* Two variants are provided.  The function variant below is compiled only
   when neither _LIBICONV_VERSION nor __GLIBC__ is defined.  The macro
   variant further down forwards to one iconv() call on the whole input and
   always stores false in *INCREMENTED.
   NOTE(review): parts of the function variant (return type, the INCREMENTED
   parameter, braces, the head of the iconv() call and the statements guarded
   by the checks) are not visible in this excerpt.  */
51 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
52 /* Irix iconv() inserts a NUL byte if it cannot convert.
53 NetBSD iconv() inserts a question mark if it cannot convert.
54 Only GNU libiconv and GNU libc are known to prefer to fail rather
55 than doing a lossy conversion. */
57 iconv_carefully (iconv_t cd,
58 const char **inbuf, size_t *inbytesleft,
59 char **outbuf, size_t *outbytesleft,
/* Work on local copies of the pointers and byte counts passed in.  */
62 const char *inptr = *inbuf;
63 const char *inptr_end = inptr + *inbytesleft;
64 char *outptr = *outbuf;
65 size_t outsize = *outbytesleft;
66 const char *inptr_before;
/* Offer the input in growing pieces: INSIZE starts at one byte and is
   incremented per iteration, never reaching past INPTR_END.  The test
   below singles out every outcome other than a -1 return with
   errno == EINVAL (the code POSIX iconv() uses for an incomplete input
   sequence).  */
76 for (insize = 1; inptr + insize <= inptr_end; insize++)
79 (ICONV_CONST char **) &inptr, &insize,
81 if (!(res == (size_t)(-1) && errno == EINVAL))
83 /* We expect that no input bytes have been consumed so far. */
84 if (inptr != inptr_before)
/* Report the remaining output space back to the caller.  */
91 *outbytesleft = outsize;
/* Keep going while the last step returned 0 and input is left.  */
94 while (res == 0 && inptr < inptr_end);
/* Report the number of unconsumed input bytes back to the caller.  */
97 *inbytesleft = inptr_end - inptr;
/* A successful return with a positive count: per the note at the top of
   this variant, such implementations substitute a character instead of
   failing.  */
98 if (res != (size_t)(-1) && res > 0)
100 /* iconv() has already incremented INPTR. We cannot go back to a
101 previous INPTR, otherwise the state inside CD would become invalid,
102 if FROM_CODESET is a stateful encoding. So, tell the caller that
103 *INBUF has already been incremented. */
104 *incremented = (inptr > inptr_before);
110 *incremented = false;
/* Macro variant: a plain iconv() call; *INCREMENTED is always false.  */
115 # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
116 (*(incremented) = false, \
117 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
120 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
121 converting one character. */
/* NOTE(review): the return type, the INCREMENTED parameter, braces, the
   head of the iconv() call and the statements guarded by the checks are
   not visible in this excerpt.  */
123 iconv_carefully_1 (iconv_t cd,
124 const char **inbuf, size_t *inbytesleft,
125 char **outbuf, size_t *outbytesleft,
/* Work on local copies of the pointers and byte counts passed in.
   INPTR_BEFORE remembers where the input started so that consumption can
   be detected afterwards; RES starts out as the error value.  */
128 const char *inptr = *inbuf;
129 const char *inptr_end = inptr + *inbytesleft;
130 char *outptr = *outbuf;
131 size_t outsize = *outbytesleft;
132 const char *inptr_before = inptr;
133 size_t res = (size_t)(-1);
/* Offer the input in growing pieces, starting with a single byte.  The
   test below singles out every outcome other than a -1 return with
   errno == EINVAL (the code POSIX iconv() uses for an incomplete input
   sequence).  */
136 for (insize = 1; inptr + insize <= inptr_end; insize++)
139 (ICONV_CONST char **) &inptr, &insize,
141 if (!(res == (size_t)(-1) && errno == EINVAL))
143 /* We expect that no input bytes have been consumed so far. */
144 if (inptr != inptr_before)
/* Report the number of unconsumed input bytes back to the caller.  */
149 *inbytesleft = inptr_end - inptr;
150 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
151 /* Irix iconv() inserts a NUL byte if it cannot convert.
152 NetBSD iconv() inserts a question mark if it cannot convert.
153 Only GNU libiconv and GNU libc are known to prefer to fail rather
154 than doing a lossy conversion. */
155 if (res != (size_t)(-1) && res > 0)
157 /* iconv() has already incremented INPTR. We cannot go back to a
158 previous INPTR, otherwise the state inside CD would become invalid,
159 if FROM_CODESET is a stateful encoding. So, tell the caller that
160 *INBUF has already been incremented. */
161 *incremented = (inptr > inptr_before);
/* The remaining output space is written back under the non-error check
   below.  */
167 if (res != (size_t)(-1))
170 *outbytesleft = outsize;
172 *incremented = false;
/* Convert the SRCLEN bytes at SRC with already opened conversion
   descriptors, applying HANDLER when a character cannot be converted.
   CD converts directly from FROM_CODESET to TO_CODESET.  CD1 converts
   FROM_CODESET to UTF-8, or is (iconv_t)(-1) when FROM_CODESET is UTF-8.
   CD2 converts UTF-8 to TO_CODESET, or is (iconv_t)(-1) when TO_CODESET
   is UTF-8.
   The callers in this file pass two more arguments between HANDLER and
   RESULTP: EXTRA_ALLOC, a number of bytes kept in reserve at the end of
   the result (0 from mem_cd_iconveh, 1 from str_cd_iconveh), and OFFSETS,
   an array with one entry per input byte, or NULL.
   On entry *RESULTP and *LENGTHP may describe a caller-supplied buffer.
   NOTE(review): the declarations of EXTRA_ALLOC and OFFSETS, several local
   declarations, braces, else branches and all return statements are not
   visible in this excerpt.  */
177 mem_cd_iconveh_internal (const char *src, size_t srclen,
178 iconv_t cd, iconv_t cd1, iconv_t cd2,
179 enum iconv_ilseq_handler handler,
182 char **resultp, size_t *lengthp)
184 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
185 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
186 Instead, we have to start afresh from the beginning of SRC. */
187 /* Use a temporary buffer, so that for small strings, a single malloc()
188 call will be sufficient. */
189 # define tmpbufsize 4096
190 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
191 libiconv's UCS-4-INTERNAL encoding. */
192 union { unsigned int align; char buf[tmpbufsize]; } tmp;
193 # define tmpbuf tmp.buf
195 char *initial_result;
199 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
/* Start in the caller-supplied buffer only if it exists and is at least
   as large as tmpbuf; otherwise start in the local tmpbuf.  */
201 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
203 initial_result = *resultp;
204 allocated = *lengthp;
208 initial_result = tmpbuf;
209 allocated = sizeof (tmpbuf);
211 result = initial_result;
/* Initialize every offsets[] entry to (size_t)(-1).  */
217 for (i = 0; i < srclen; i++)
218 offsets[i] = (size_t)(-1);
220 last_length = (size_t)(-1);
/* NOTE(review): in this excerpt the comment that starts on the next line
   has lost its closing line, so it runs on through the two declarations
   that follow it.  */
224 /* First, try a direct conversion, and see whether a conversion error
227 const char *inptr = src;
228 size_t insize = srclen;
230 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
231 # if defined _LIBICONV_VERSION \
232 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
233 /* Set to the initial state. */
234 iconv (cd, NULL, NULL, NULL, NULL);
/* Convert into the unused tail of RESULT, keeping EXTRA_ALLOC bytes in
   reserve.  */
239 char *outptr = result + length;
240 size_t outsize = allocated - extra_alloc - length;
247 if (length != last_length) /* ensure that offsets[] be increasing */
/* Record at which output length the input byte at INPTR starts.  */
249 offsets[inptr - src] = length;
250 last_length = length;
252 res = iconv_carefully_1 (cd,
258 /* Use iconv_carefully instead of iconv here, because:
259 - If TO_CODESET is UTF-8, we can do the error handling in this
260 loop, no need for a second loop,
261 - With iconv() implementations other than GNU libiconv and GNU
262 libc, if we use iconv() in a big swoop, checking for an E2BIG
263 return, we lose the number of irreversible conversions. */
264 res = iconv_carefully (cd,
269 length = outptr - result;
/* Remember whether more than half of the buffer is now in use.  */
270 grow = (length + extra_alloc > allocated / 2);
271 if (res == (size_t)(-1))
275 else if (errno == EINVAL)
277 else if (errno == EILSEQ && handler != iconveh_error)
279 if (cd2 == (iconv_t)(-1))
281 /* TO_CODESET is UTF-8. */
282 /* Error handling can produce up to 1 byte of output. */
283 if (length + 1 + extra_alloc > allocated)
/* Double the buffer.  The initial buffer is never passed to realloc:
   the first growth takes a fresh block from malloc and copies LENGTH
   bytes into it.  The same pattern recurs at every growth site below.  */
287 allocated = 2 * allocated;
288 if (length + 1 + extra_alloc > allocated)
290 if (result == initial_result)
291 memory = (char *) malloc (allocated);
293 memory = (char *) realloc (result, allocated);
296 if (result != initial_result)
301 if (result == initial_result)
302 memcpy (memory, initial_result, length);
306 /* The input is invalid in FROM_CODESET. Eat up one byte
307 and emit a question mark. */
315 result[length] = '?';
/* Error exit.  errno is saved in a local here; presumably the hidden
   statements release RESULT when it is not the initial buffer and then
   restore errno -- TODO confirm against the full source.  */
323 if (result != initial_result)
325 int saved_errno = errno;
338 allocated = 2 * allocated;
339 if (result == initial_result)
340 memory = (char *) malloc (allocated);
342 memory = (char *) realloc (result, allocated);
345 if (result != initial_result)
350 if (result == initial_result)
351 memcpy (memory, initial_result, length);
357 /* Now get the conversion state back to the initial state.
358 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
359 #if defined _LIBICONV_VERSION \
360 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
363 char *outptr = result + length;
364 size_t outsize = allocated - extra_alloc - length;
/* A call with NULL input and a non-NULL output buffer: any bytes the
   converter emits for the reset land at OUTPTR.  */
367 res = iconv (cd, NULL, NULL, &outptr, &outsize);
368 length = outptr - result;
369 if (res == (size_t)(-1))
375 allocated = 2 * allocated;
376 if (result == initial_result)
377 memory = (char *) malloc (allocated);
379 memory = (char *) realloc (result, allocated);
382 if (result != initial_result)
387 if (result == initial_result)
388 memcpy (memory, initial_result, length);
393 if (result != initial_result)
395 int saved_errno = errno;
407 /* The direct conversion succeeded. */
411 /* The direct conversion failed, handler != iconveh_error,
412 and cd2 != (iconv_t)(-1).
413 Use a conversion through UTF-8. */
/* Start over: reset every offsets[] entry to (size_t)(-1).  */
418 for (i = 0; i < srclen; i++)
419 offsets[i] = (size_t)(-1);
421 last_length = (size_t)(-1);
425 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
/* One spare byte beyond utf8bufsize: room for the question mark that is
   stored on EILSEQ further down.  */
426 char utf8buf[utf8bufsize + 1];
428 const char *in1ptr = src;
429 size_t in1size = srclen;
430 bool do_final_flush1 = true;
431 bool do_final_flush2 = true;
433 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
434 # if defined _LIBICONV_VERSION \
435 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
436 /* Set to the initial state. */
437 if (cd1 != (iconv_t)(-1))
438 iconv (cd1, NULL, NULL, NULL, NULL);
439 iconv (cd2, NULL, NULL, NULL, NULL);
/* Loop until the input is consumed, CD1 has been flushed, the UTF-8
   buffer has been drained and CD2 has been flushed.  */
442 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
/* Step 1 appends to the unused tail of utf8buf.  */
444 char *out1ptr = utf8buf + utf8len;
445 size_t out1size = utf8bufsize - utf8len;
450 /* Conversion step 1: from FROM_CODESET to UTF-8. */
454 && length != last_length) /* ensure that offsets[] be increasing */
456 offsets[in1ptr - src] = length;
457 last_length = length;
459 if (cd1 != (iconv_t)(-1))
462 res1 = iconv_carefully_1 (cd1,
467 res1 = iconv_carefully (cd1,
474 /* FROM_CODESET is UTF-8. */
/* Decode one character from the input directly.  */
482 n = u8_mbtouc (&uc, (const uint8_t *) in1ptr, in1size);
/* The bytes 0xEF 0xBF 0xBD are the UTF-8 encoding of U+FFFD.  */
485 && (uint8_t)in1ptr[0] == 0xEF
486 && (uint8_t)in1ptr[1] == 0xBF
487 && (uint8_t)in1ptr[2] == 0xBD))
500 incremented1 = false;
/* Re-encode the character into the UTF-8 buffer.  */
503 m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
508 incremented1 = false;
/* Without an OFFSETS array, keep decoding while input remains.  */
523 while (offsets == NULL && in1size > 0);
526 else if (do_final_flush1)
528 /* Now get the conversion state of CD1 back to the initial state.
529 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
530 # if defined _LIBICONV_VERSION \
531 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
532 if (cd1 != (iconv_t)(-1))
533 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
537 do_final_flush1 = false;
/* Any step 1 failure other than E2BIG, EINVAL or EILSEQ is selected
   here.  */
545 if (res1 == (size_t)(-1)
546 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
548 if (result != initial_result)
550 int saved_errno = errno;
556 if (res1 == (size_t)(-1)
557 && errno == EILSEQ && handler != iconveh_error)
559 /* The input is invalid in FROM_CODESET. Eat up one byte and
560 emit a question mark. Room for the question mark was allocated
561 at the end of utf8buf. */
569 utf8buf[utf8len++] = '?';
572 utf8len = out1ptr - utf8buf;
/* Step 2 is entered, among other conditions, when the UTF-8 buffer is
   more than half full or step 1 ran out of output space.  */
576 || utf8len > utf8bufsize / 2
577 || (res1 == (size_t)(-1) && errno1 == E2BIG))
579 /* Conversion step 2: from UTF-8 to TO_CODESET. */
580 const char *in2ptr = utf8buf;
581 size_t in2size = utf8len;
584 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
/* Step 2 writes to the unused tail of RESULT, keeping EXTRA_ALLOC bytes
   in reserve.  */
586 char *out2ptr = result + length;
587 size_t out2size = allocated - extra_alloc - length;
593 res2 = iconv_carefully (cd2,
597 else /* in1size == 0 && !do_final_flush1
598 && in2size == 0 && do_final_flush2 */
600 /* Now get the conversion state of CD2 back to the initial
601 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
602 # if defined _LIBICONV_VERSION \
603 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
604 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
608 do_final_flush2 = false;
612 length = out2ptr - result;
613 grow = (length + extra_alloc > allocated / 2);
614 if (res2 == (size_t)(-1))
618 else if (errno == EINVAL)
620 else if (errno == EILSEQ && handler != iconveh_error)
622 /* Error handling can produce up to 10 bytes of ASCII
623 output. But TO_CODESET may be UCS-2, UTF-16 or
624 UCS-4, so use CD2 here as well. */
/* Determine the character UC that could not be converted: either the
   one just before IN2PTR or the one at IN2PTR.
   NOTE(review): the condition choosing between the two lookups is not
   visible in this excerpt.  */
634 if (u8_prev (&uc, (const uint8_t *) in2ptr,
635 (const uint8_t *) utf8buf)
644 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
650 if (handler == iconveh_escape_sequence)
/* Build the replacement in SCRATCHBUF: a backslash, then u or U, then
   hexadecimal digits of UC taken four bits at a time, most significant
   first.  The digits for bits 28..16 come right after the U.
   NOTE(review): the test selecting the short or the long form is not
   visible in this excerpt.  */
652 static char hex[16] = "0123456789ABCDEF";
654 scratchbuf[scratchlen++] = '\\';
656 scratchbuf[scratchlen++] = 'u';
659 scratchbuf[scratchlen++] = 'U';
660 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
661 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
662 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
663 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
665 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
666 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
667 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
668 scratchbuf[scratchlen++] = hex[uc & 15];
679 (ICONV_CONST char **) &inptr, &insize,
680 &out2ptr, &out2size);
681 length = out2ptr - result;
/* Not enough room for the replacement: enlarge RESULT and convert the
   replacement again.  */
682 if (res == (size_t)(-1) && errno == E2BIG)
686 allocated = 2 * allocated;
687 if (length + 1 + extra_alloc > allocated)
689 if (result == initial_result)
690 memory = (char *) malloc (allocated);
692 memory = (char *) realloc (result, allocated);
695 if (result != initial_result)
700 if (result == initial_result)
701 memcpy (memory, initial_result, length);
705 out2ptr = result + length;
706 out2size = allocated - extra_alloc - length;
708 (ICONV_CONST char **) &inptr, &insize,
709 &out2ptr, &out2size);
710 length = out2ptr - result;
712 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
713 /* Irix iconv() inserts a NUL byte if it cannot convert.
714 NetBSD iconv() inserts a question mark if it cannot convert.
716 Only GNU libiconv and GNU libc are known to prefer
717 to fail rather than doing a lossy conversion. */
718 if (res != (size_t)(-1) && res > 0)
724 if (res == (size_t)(-1))
726 /* Failure converting the ASCII replacement. */
727 if (result != initial_result)
729 int saved_errno = errno;
738 if (result != initial_result)
740 int saved_errno = errno;
748 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
754 allocated = 2 * allocated;
755 if (result == initial_result)
756 memory = (char *) malloc (allocated);
758 memory = (char *) realloc (result, allocated);
761 if (result != initial_result)
766 if (result == initial_result)
767 memcpy (memory, initial_result, length);
772 /* Move the remaining bytes to the beginning of utf8buf. */
774 memmove (utf8buf, in2ptr, in2size);
/* Now act on a failure that step 1 reported earlier in this iteration
   (its errno was kept in ERRNO1).  */
778 if (res1 == (size_t)(-1))
780 if (errno1 == EINVAL)
782 else if (errno1 == EILSEQ)
784 if (result != initial_result)
795 /* Now the final memory allocation. */
/* The result still lives in the local tmpbuf: copy LENGTH bytes into a
   block from malloc sized LENGTH + EXTRA_ALLOC.  */
796 if (result == tmpbuf)
800 memory = (char *) malloc (length + extra_alloc);
803 memcpy (memory, tmpbuf, length);
812 else if (result != *resultp && length + extra_alloc < allocated)
814 /* Shrink the allocated memory if possible. */
817 memory = (char *) realloc (result, length + extra_alloc);
/* Convert the SRCLEN bytes at SRC using the already opened conversion
   descriptors CD, CD1 and CD2.  This simply forwards to
   mem_cd_iconveh_internal with an extra allocation count of 0 and returns
   its result.
   NOTE(review): the return type and the declaration of the OFFSETS
   parameter, which is forwarded below, are not visible in this excerpt.  */
829 mem_cd_iconveh (const char *src, size_t srclen,
830 iconv_t cd, iconv_t cd1, iconv_t cd2,
831 enum iconv_ilseq_handler handler,
833 char **resultp, size_t *lengthp)
835 return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
836 offsets, resultp, lengthp);
/* Convert the NUL-terminated string SRC using the already opened
   conversion descriptors CD, CD1 and CD2.  The strlen (src) bytes before
   the NUL are passed to mem_cd_iconveh_internal with one byte of extra
   allocation and no offsets array; that spare byte then receives the
   terminating NUL.
   NOTE(review): the return type, the declarations of RESULT and LENGTH,
   the tail of the internal call and the return statements are not visible
   in this excerpt.  */
840 str_cd_iconveh (const char *src,
841 iconv_t cd, iconv_t cd1, iconv_t cd2,
842 enum iconv_ilseq_handler handler)
844 /* For most encodings, a trailing NUL byte in the input will be converted
845 to a trailing NUL byte in the output. But not for UTF-7. So that this
846 function is usable for UTF-7, we have to exclude the NUL byte from the
847 conversion and add it by hand afterwards. */
850 int retval = mem_cd_iconveh_internal (src, strlen (src),
851 cd, cd1, cd2, handler, 1, NULL,
/* errno is saved in a local on this path; the surrounding condition and
   the cleanup it protects are not visible in this excerpt.  */
858 int saved_errno = errno;
865 /* Add the terminating NUL byte. */
866 result[length] = '\0';
/* Convert the SRCLEN bytes at SRC from the encoding named FROM_CODESET to
   the encoding named TO_CODESET.  Up to three conversion descriptors are
   opened (direct, FROM_CODESET to UTF-8, UTF-8 to TO_CODESET), the work is
   done by mem_cd_iconveh, and the descriptors are closed again.
   NOTE(review): the return type, the OFFSETS parameter declaration, local
   declarations, braces, else branches, the bodies of most checks and the
   return statements are not visible in this excerpt.  */
874 mem_iconveh (const char *src, size_t srclen,
875 const char *from_codeset, const char *to_codeset,
876 enum iconv_ilseq_handler handler,
878 char **resultp, size_t *lengthp)
882 /* Nothing to convert. */
/* Identity conversion: taken only when no offsets are requested and the
   two codeset names are equal ignoring case.  */
886 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
/* Test whether the caller-supplied buffer is large enough for the copy;
   a buffer of SRCLEN bytes is taken from malloc below.  */
890 if (*resultp != NULL && *lengthp >= srclen)
894 result = (char *) malloc (srclen);
901 memcpy (result, src, srclen);
916 /* Avoid glibc-2.1 bug with EUC-KR. */
917 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
918 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
919 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
/* Descriptor for the direct conversion.  */
926 cd = iconv_open (to_codeset, from_codeset);
927 if (cd == (iconv_t)(-1))
/* Descriptor from FROM_CODESET to UTF-8; the check below tests whether
   FROM_CODESET already is UTF-8.  */
930 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
934 cd1 = iconv_open ("UTF-8", from_codeset);
935 if (cd1 == (iconv_t)(-1))
937 int saved_errno = errno;
/* Descriptor from UTF-8 to TO_CODESET; the check below tests whether
   TO_CODESET already is UTF-8.  */
944 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
948 cd2 = iconv_open (to_codeset, "UTF-8");
949 if (cd2 == (iconv_t)(-1))
951 int saved_errno = errno;
952 if (cd1 != (iconv_t)(-1))
962 retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
967 /* Close cd, cd1, cd2, but preserve the errno from mem_cd_iconveh. */
968 int saved_errno = errno;
969 if (cd2 != (iconv_t)(-1))
971 if (cd1 != (iconv_t)(-1))
/* Close the descriptors one by one, skipping CD1 and CD2 when they are
   (iconv_t)(-1); a failing iconv_close is handled as described in the
   comments below.  */
978 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
980 /* Return -1, but free the allocated memory, and while doing
981 that, preserve the errno from iconv_close. */
982 int saved_errno = errno;
983 if (cd1 != (iconv_t)(-1))
986 if (result != *resultp && result != NULL)
991 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
993 /* Return -1, but free the allocated memory, and while doing
994 that, preserve the errno from iconv_close. */
995 int saved_errno = errno;
997 if (result != *resultp && result != NULL)
1002 if (iconv_close (cd) < 0)
1004 /* Return -1, but free the allocated memory, and while doing
1005 that, preserve the errno from iconv_close. */
1006 int saved_errno = errno;
1007 if (result != *resultp && result != NULL)
1009 errno = saved_errno;
1017 /* This is a different error code than if iconv_open existed but didn't
1018 support from_codeset and to_codeset, so that the caller can emit
1019 an error message such as
1020 "iconv() is not supported. Installing GNU libiconv and
1021 then reinstalling this package would fix this." */
/* Convert the NUL-terminated string SRC from the encoding named
   FROM_CODESET to the encoding named TO_CODESET.  Up to three conversion
   descriptors are opened (direct, FROM_CODESET to UTF-8, UTF-8 to
   TO_CODESET), the work is done by str_cd_iconveh, and the descriptors are
   closed again.
   NOTE(review): the return type, local declarations, braces, else
   branches, the bodies of most checks and the return statements are not
   visible in this excerpt, which also ends before the end of this
   definition.  */
1029 str_iconveh (const char *src,
1030 const char *from_codeset, const char *to_codeset,
1031 enum iconv_ilseq_handler handler)
/* Shortcut for an empty string or for two codeset names that are equal
   ignoring case: the input is duplicated with strdup.  */
1033 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1035 char *result = strdup (src);
1049 /* Avoid glibc-2.1 bug with EUC-KR. */
1050 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1051 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
1052 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
/* Descriptor for the direct conversion.  */
1059 cd = iconv_open (to_codeset, from_codeset);
1060 if (cd == (iconv_t)(-1))
/* No FROM_CODESET to UTF-8 descriptor is needed when FROM_CODESET is
   UTF-8 itself; CD1 is then (iconv_t)(-1).  */
1063 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1064 cd1 = (iconv_t)(-1);
1067 cd1 = iconv_open ("UTF-8", from_codeset);
1068 if (cd1 == (iconv_t)(-1))
1070 int saved_errno = errno;
1072 errno = saved_errno;
/* No UTF-8 to TO_CODESET descriptor is needed when TO_CODESET is UTF-8
   itself; CD2 is then (iconv_t)(-1).  */
1077 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1078 cd2 = (iconv_t)(-1);
1081 cd2 = iconv_open (to_codeset, "UTF-8");
1082 if (cd2 == (iconv_t)(-1))
1084 int saved_errno = errno;
1085 if (cd1 != (iconv_t)(-1))
1088 errno = saved_errno;
1093 result = str_cd_iconveh (src, cd, cd1, cd2, handler);
1097 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh. */
1098 int saved_errno = errno;
1099 if (cd2 != (iconv_t)(-1))
1101 if (cd1 != (iconv_t)(-1))
1104 errno = saved_errno;
/* Close the descriptors one by one, skipping CD1 and CD2 when they are
   (iconv_t)(-1); a failing iconv_close is handled as described in the
   comments below.  */
1108 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
1110 /* Return NULL, but free the allocated memory, and while doing
1111 that, preserve the errno from iconv_close. */
1112 int saved_errno = errno;
1113 if (cd1 != (iconv_t)(-1))
1117 errno = saved_errno;
1120 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
1122 /* Return NULL, but free the allocated memory, and while doing
1123 that, preserve the errno from iconv_close. */
1124 int saved_errno = errno;
1127 errno = saved_errno;
1130 if (iconv_close (cd) < 0)
1132 /* Return NULL, but free the allocated memory, and while doing
1133 that, preserve the errno from iconv_close. */
1134 int saved_errno = errno;
1136 errno = saved_errno;
1142 /* This is a different error code than if iconv_open existed but didn't
1143 support from_codeset and to_codeset, so that the caller can emit
1144 an error message such as
1145 "iconv() is not supported. Installing GNU libiconv and
1146 then reinstalling this package would fix this." */