1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "striconveh.h"
31 # include "utf8-ucs4-safe.h"
32 # include "ucs4-utf8.h"
37 #include "c-strcase.h"
40 # define SIZE_MAX ((size_t) -1)
46 /* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
47 error occurs, we may have to determine the Unicode representation of the
48 inconvertible character. */
50 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
51 a conversion error, and it returns in *INCREMENTED a boolean telling whether
52 it has incremented the input pointers past the error location. */
53 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
54 /* Irix iconv() inserts a NUL byte if it cannot convert.
55 NetBSD iconv() inserts a question mark if it cannot convert.
56 Only GNU libiconv and GNU libc are known to prefer to fail rather
57 than doing a lossy conversion. */
/* iconv_carefully (function variant): drives iconv() on CD in small steps
   and reports through *INCREMENTED whether the input position moved past
   the point at which it stopped.
   NOTE(review): this is an excerpt; the return type, the INCREMENTED
   parameter declaration and several statements are not visible here, so the
   notes below describe only the visible lines.  */
59 iconv_carefully (iconv_t cd,
60 const char **inbuf, size_t *inbytesleft,
61 char **outbuf, size_t *outbytesleft,
/* Local cursors, initialized from the caller's input/output pointers and
   remaining sizes.  */
64 const char *inptr = *inbuf;
65 const char *inptr_end = inptr + *inbytesleft;
66 char *outptr = *outbuf;
67 size_t outsize = *outbytesleft;
68 const char *inptr_before;
/* Offer a window of 1, 2, 3, ... bytes starting at INPTR, never extending
   beyond INPTR_END.  */
78 for (insize = 1; inptr + insize <= inptr_end; insize++)
81 (ICONV_CONST char **) &inptr, &insize,
/* True for every outcome except "failed with EINVAL"; the statement this
   guards is not visible in the excerpt.  */
83 if (!(res == (size_t)(-1) && errno == EINVAL))
85 /* We expect that no input bytes have been consumed so far. */
86 if (inptr != inptr_before)
93 *outbytesleft = outsize;
/* Tail of an outer loop: repeat while the last step returned 0 and input
   remains.  */
96 while (res == 0 && inptr < inptr_end);
/* Report how many input bytes are still unconsumed.  */
99 *inbytesleft = inptr_end - inptr;
/* A successful call with a positive return value is treated specially
   here.  */
100 if (res != (size_t)(-1) && res > 0)
102 /* iconv() has already incremented INPTR. We cannot go back to a
103 previous INPTR, otherwise the state inside CD would become invalid,
104 if FROM_CODESET is a stateful encoding. So, tell the caller that
105 *INBUF has already been incremented. */
106 *incremented = (inptr > inptr_before);
112 *incremented = false;
/* iconv_carefully (macro variant): a comma expression that first stores
   false into *INCREMENTED and then evaluates to the result of a plain
   iconv() call with the same arguments (INBUF cast to ICONV_CONST char **).
   NOTE(review): presumably selected when the function variant above is
   compiled out; the preprocessor line choosing between the two is not
   visible in this excerpt.  */
117 # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
118 (*(incremented) = false, \
119 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
/* mem_cd_iconveh_internal: converts the SRCLEN bytes at SRC.
   Strategy, as far as the visible lines show:
     1. Try a direct conversion with CD.
     2. If that fails, HANDLER != iconveh_error and CD2 is valid, restart
        from the beginning of SRC and convert in two steps through UTF-8
        (CD1: FROM_CODESET -> UTF-8, CD2: UTF-8 -> TO_CODESET).
   CD1 == (iconv_t)(-1) is treated as "FROM_CODESET is UTF-8" and
   CD2 == (iconv_t)(-1) as "TO_CODESET is UTF-8".
   Output is built in the caller's buffer (*RESULTP with size *LENGTHP) when
   that is at least as large as the on-stack buffer, otherwise in the
   on-stack buffer; it is moved to and grown on the heap by doubling.
   NOTE(review): this is an excerpt.  The return type, the declaration of
   extra_alloc (used in the size computations below), the return statements
   and the final stores to *RESULTP / *LENGTHP are not visible here.  */
123 mem_cd_iconveh_internal (const char *src, size_t srclen,
124 iconv_t cd, iconv_t cd1, iconv_t cd2,
125 enum iconv_ilseq_handler handler,
127 char **resultp, size_t *lengthp)
129 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
130 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
131 Instead, we have to start afresh from the beginning of SRC. */
132 /* Use a temporary buffer, so that for small strings, a single malloc()
133 call will be sufficient. */
134 # define tmpbufsize 4096
135 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
136 libiconv's UCS-4-INTERNAL encoding. */
137 union { unsigned int align; char buf[tmpbufsize]; } tmp;
138 # define tmpbuf tmp.buf
140 char *initial_result;
/* Choose the initial output buffer: the caller's buffer when it is at least
   sizeof (tmpbuf) bytes, otherwise the on-stack tmpbuf.  */
145 if (*lengthp >= sizeof (tmpbuf))
147 initial_result = *resultp;
148 allocated = *lengthp;
152 initial_result = tmpbuf;
153 allocated = sizeof (tmpbuf);
155 result = initial_result;
158 /* First, try a direct conversion, and see whether a conversion error
161 const char *inptr = src;
162 size_t insize = srclen;
164 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
165 # if defined _LIBICONV_VERSION \
166 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
167 /* Set to the initial state. */
168 iconv (cd, NULL, NULL, NULL, NULL);
/* Output window: the free space after the LENGTH bytes already produced,
   keeping extra_alloc bytes in reserve.  */
173 char *outptr = result + length;
174 size_t outsize = allocated - extra_alloc - length;
179 /* Use iconv_carefully instead of iconv here, because:
180 - If TO_CODESET is UTF-8, we can do the error handling in this loop,
181 no need for a second loop,
182 - With iconv() implementations other than GNU libiconv and GNU libc,
183 if we use iconv() in a big swoop, checking for an E2BIG return,
184 we lose the number of irreversible conversions. */
185 res = iconv_carefully (cd,
190 length = outptr - result;
/* Request growth once more than half of the buffer is in use.  */
191 grow = (length + extra_alloc > allocated / 2);
192 if (res == (size_t)(-1))
196 else if (errno == EINVAL)
198 else if (errno == EILSEQ && handler != iconveh_error)
200 if (cd2 == (iconv_t)(-1))
202 /* TO_CODESET is UTF-8. */
203 /* Error handling can produce up to 1 byte of output. */
204 if (length + 1 + extra_alloc > allocated)
/* Grow by doubling.  The first growth copies out of the initial buffer into
   fresh malloc'ed memory; later growths realloc the heap block.  */
208 allocated = 2 * allocated;
209 if (length + 1 + extra_alloc > allocated)
211 if (result == initial_result)
212 memory = (char *) malloc (allocated);
214 memory = (char *) realloc (result, allocated);
217 if (result != initial_result)
222 if (result == initial_result)
223 memcpy (memory, initial_result, length);
227 /* The input is invalid in FROM_CODESET. Eat up one byte
228 and emit a question mark. */
236 result[length] = '?';
244 if (result != initial_result)
246 int saved_errno = errno;
/* Same doubling scheme as above.  */
259 allocated = 2 * allocated;
260 if (result == initial_result)
261 memory = (char *) malloc (allocated);
263 memory = (char *) realloc (result, allocated);
266 if (result != initial_result)
271 if (result == initial_result)
272 memcpy (memory, initial_result, length);
278 /* Now get the conversion state back to the initial state.
279 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
280 #if defined _LIBICONV_VERSION \
281 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
284 char *outptr = result + length;
285 size_t outsize = allocated - extra_alloc - length;
/* A NULL input buffer with a non-NULL output buffer asks iconv to write
   whatever is needed to return CD to its initial shift state.  */
288 res = iconv (cd, NULL, NULL, &outptr, &outsize);
289 length = outptr - result;
290 if (res == (size_t)(-1))
296 allocated = 2 * allocated;
297 if (result == initial_result)
298 memory = (char *) malloc (allocated);
300 memory = (char *) realloc (result, allocated);
303 if (result != initial_result)
308 if (result == initial_result)
309 memcpy (memory, initial_result, length);
314 if (result != initial_result)
316 int saved_errno = errno;
328 /* The direct conversion succeeded. */
332 /* The direct conversion failed, handler != iconveh_error,
333 and cd2 != (iconv_t)(-1).
334 Use a conversion through UTF-8. */
337 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
/* One byte beyond utf8bufsize is reserved; see the question-mark handling
   further down.  */
338 char utf8buf[utf8bufsize + 1];
/* The two-step conversion restarts from the beginning of SRC.  */
340 const char *in1ptr = src;
341 size_t in1size = srclen;
342 bool do_final_flush1 = true;
343 bool do_final_flush2 = true;
345 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
346 # if defined _LIBICONV_VERSION \
347 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
348 /* Set to the initial state. */
349 if (cd1 != (iconv_t)(-1))
350 iconv (cd1, NULL, NULL, NULL, NULL);
351 iconv (cd2, NULL, NULL, NULL, NULL);
/* Run until all input is consumed, CD1 has been flushed, the UTF-8 staging
   buffer is empty and CD2 has been flushed.  */
354 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
/* Step 1 appends to the staging buffer after the UTF8LEN bytes that are
   still pending for step 2.  */
356 char *out1ptr = utf8buf + utf8len;
357 size_t out1size = utf8bufsize - utf8len;
362 /* Conversion step 1: from FROM_CODESET to UTF-8. */
365 if (cd1 != (iconv_t)(-1))
366 res1 = iconv_carefully (cd1,
367 (ICONV_CONST char **) &in1ptr, &in1size,
372 /* FROM_CODESET is UTF-8. */
/* No converter is used in this case: one character is decoded from the
   input and re-encoded into the staging buffer.  */
380 n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
/* EF BF BD is the UTF-8 encoding of U+FFFD.  NOTE(review): the first part
   of this condition is not visible in the excerpt.  */
383 && (uint8_t)in1ptr[0] == 0xEF
384 && (uint8_t)in1ptr[1] == 0xBF
385 && (uint8_t)in1ptr[2] == 0xBD))
398 incremented1 = false;
401 m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
406 incremented1 = false;
424 else if (do_final_flush1)
426 /* Now get the conversion state of CD1 back to the initial state.
427 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
428 # if defined _LIBICONV_VERSION \
429 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
430 if (cd1 != (iconv_t)(-1))
431 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
435 do_final_flush1 = false;
/* A step-1 failure with any errno other than E2BIG, EINVAL or EILSEQ.  */
443 if (res1 == (size_t)(-1)
444 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
446 if (result != initial_result)
448 int saved_errno = errno;
454 if (res1 == (size_t)(-1)
455 && errno == EILSEQ && handler != iconveh_error)
457 /* The input is invalid in FROM_CODESET. Eat up one byte and
458 emit a question mark. Room for the question mark was allocated
459 at the end of utf8buf. */
467 utf8buf[utf8len++] = '?';
470 utf8len = out1ptr - utf8buf;
/* Conditions that trigger step 2 include: the staging buffer is more than
   half full, or step 1 stopped with E2BIG.  NOTE(review): the first part of
   this condition is not visible in the excerpt.  */
473 || utf8len > utf8bufsize / 2
474 || (res1 == (size_t)(-1) && errno1 == E2BIG))
476 /* Conversion step 2: from UTF-8 to TO_CODESET. */
477 const char *in2ptr = utf8buf;
478 size_t in2size = utf8len;
481 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
483 char *out2ptr = result + length;
484 size_t out2size = allocated - extra_alloc - length;
490 res2 = iconv_carefully (cd2,
494 else /* in1size == 0 && !do_final_flush1
495 && in2size == 0 && do_final_flush2 */
497 /* Now get the conversion state of CD2 back to the initial
498 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
499 # if defined _LIBICONV_VERSION \
500 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
501 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
505 do_final_flush2 = false;
509 length = out2ptr - result;
/* Same half-full growth trigger as in the direct conversion.  */
510 grow = (length + extra_alloc > allocated / 2);
511 if (res2 == (size_t)(-1))
515 else if (errno == EINVAL)
517 else if (errno == EILSEQ && handler != iconveh_error)
519 /* Error handling can produce up to 10 bytes of ASCII
520 output. But TO_CODESET may be UCS-2, UTF-16 or
521 UCS-4, so use CD2 here as well. */
/* Determine the offending character UC from the staging buffer, either by
   stepping back from IN2PTR or by decoding at IN2PTR.  NOTE(review): the
   condition choosing between the two is not visible in the excerpt.  */
531 if (u8_prev (&uc, (const uint8_t *) in2ptr,
532 (const uint8_t *) utf8buf)
541 n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
547 if (handler == iconveh_escape_sequence)
/* Build an ASCII escape in scratchbuf: a backslash, then 'u' or 'U', then
   upper-case hex digits of UC.  The four digits for bits 28..16 follow the
   'U'; the four digits for bits 12..0 come last.  NOTE(review): the test
   selecting between 'u' and 'U' is not visible in the excerpt.  */
549 static char hex[16] = "0123456789ABCDEF";
551 scratchbuf[scratchlen++] = '\\';
553 scratchbuf[scratchlen++] = 'u';
556 scratchbuf[scratchlen++] = 'U';
557 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
558 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
559 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
560 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
562 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
563 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
564 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
565 scratchbuf[scratchlen++] = hex[uc & 15];
576 (ICONV_CONST char **) &inptr, &insize,
577 &out2ptr, &out2size);
578 length = out2ptr - result;
/* The replacement did not fit: enlarge the output buffer and convert it
   again.  */
579 if (res == (size_t)(-1) && errno == E2BIG)
583 allocated = 2 * allocated;
584 if (length + 1 + extra_alloc > allocated)
586 if (result == initial_result)
587 memory = (char *) malloc (allocated);
589 memory = (char *) realloc (result, allocated);
592 if (result != initial_result)
597 if (result == initial_result)
598 memcpy (memory, initial_result, length);
/* Recompute the output window inside the enlarged buffer.  */
602 out2ptr = result + length;
603 out2size = allocated - extra_alloc - length;
605 (ICONV_CONST char **) &inptr, &insize,
606 &out2ptr, &out2size);
607 length = out2ptr - result;
609 # if !defined _LIBICONV_VERSION && !defined __GLIBC__
610 /* Irix iconv() inserts a NUL byte if it cannot convert.
611 NetBSD iconv() inserts a question mark if it cannot
613 Only GNU libiconv and GNU libc are known to prefer
614 to fail rather than doing a lossy conversion. */
615 if (res != (size_t)(-1) && res > 0)
621 if (res == (size_t)(-1))
623 /* Failure converting the ASCII replacement. */
624 if (result != initial_result)
626 int saved_errno = errno;
635 if (result != initial_result)
637 int saved_errno = errno;
645 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
/* Same doubling scheme as in the direct conversion.  */
651 allocated = 2 * allocated;
652 if (result == initial_result)
653 memory = (char *) malloc (allocated);
655 memory = (char *) realloc (result, allocated);
658 if (result != initial_result)
663 if (result == initial_result)
664 memcpy (memory, initial_result, length);
669 /* Move the remaining bytes to the beginning of utf8buf. */
/* memmove is used; source and destination are both inside utf8buf.  */
671 memmove (utf8buf, in2ptr, in2size);
/* Step-1 errors are examined through errno1 here, after step 2 has run.  */
675 if (res1 == (size_t)(-1))
677 if (errno1 == EINVAL)
679 else if (errno1 == EILSEQ)
681 if (result != initial_result)
692 /* Now the final memory allocation. */
/* If the output is still in the on-stack buffer, copy it into a heap block
   of exactly LENGTH + extra_alloc bytes.  */
693 if (result == tmpbuf)
697 memory = (char *) malloc (length + extra_alloc);
700 memcpy (memory, tmpbuf, length);
/* The shrink applies only to a block that is not the caller's own
   buffer.  */
709 else if (result != *resultp && length + extra_alloc < allocated)
711 /* Shrink the allocated memory if possible. */
714 memory = (char *) realloc (result, length + extra_alloc);
/* mem_cd_iconveh: forwards SRC, SRCLEN, CD, CD1, CD2 and HANDLER to
   mem_cd_iconveh_internal, passing 0 as the seventh argument, and returns
   its result.
   NOTE(review): the seventh argument is presumably the number of extra
   bytes to reserve after the output (extra_alloc); confirm against the full
   prototype.  The remaining arguments of the call are not visible in this
   excerpt.  */
726 mem_cd_iconveh (const char *src, size_t srclen,
727 iconv_t cd, iconv_t cd1, iconv_t cd2,
728 enum iconv_ilseq_handler handler,
729 char **resultp, size_t *lengthp)
731 return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
/* str_cd_iconveh: converts the NUL-terminated string SRC using the
   descriptors CD, CD1, CD2 and the error HANDLER.  The terminating NUL is
   not fed to the converter; it is stored by hand after the conversion.
   NOTE(review): the return type and return statements are not visible in
   this excerpt.  */
736 str_cd_iconveh (const char *src,
737 iconv_t cd, iconv_t cd1, iconv_t cd2,
738 enum iconv_ilseq_handler handler)
740 /* For most encodings, a trailing NUL byte in the input will be converted
741 to a trailing NUL byte in the output. But not for UTF-7. So that this
742 function is usable for UTF-7, we have to exclude the NUL byte from the
743 conversion and add it by hand afterwards. */
/* Convert strlen (src) bytes; the seventh argument is 1.
   NOTE(review): presumably this reserves one extra output byte for the NUL
   stored below; confirm against mem_cd_iconveh_internal's prototype.  */
746 int retval = mem_cd_iconveh_internal (src, strlen (src),
747 cd, cd1, cd2, handler, 1,
754 int saved_errno = errno;
761 /* Add the terminating NUL byte. */
762 result[length] = '\0';
/* str_iconveh: converts the NUL-terminated string SRC from FROM_CODESET to
   TO_CODESET with the error HANDLER.  It opens the needed conversion
   descriptors, delegates to str_cd_iconveh and closes the descriptors
   again.
   NOTE(review): this is an excerpt; the return type, the bodies of several
   branches and the return statements are not visible here.  */
770 str_iconveh (const char *src,
771 const char *from_codeset, const char *to_codeset,
772 enum iconv_ilseq_handler handler)
/* Identical codeset names (compared with c_strcasecmp, i.e. ignoring case)
   are handled separately; that branch body is not visible.  */
774 if (c_strcasecmp (from_codeset, to_codeset) == 0)
784 /* Avoid glibc-2.1 bug with EUC-KR. */
785 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
786 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
787 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
/* CD: direct converter from FROM_CODESET to TO_CODESET.  */
794 cd = iconv_open (to_codeset, from_codeset);
795 if (cd == (iconv_t)(-1))
/* CD1: converter from FROM_CODESET to UTF-8.  The case where FROM_CODESET
   is already UTF-8 is tested first; that branch body is not visible.  */
798 if (c_strcasecmp (from_codeset, "UTF-8") == 0)
802 cd1 = iconv_open ("UTF-8", from_codeset);
803 if (cd1 == (iconv_t)(-1))
805 int saved_errno = errno;
/* CD2: converter from UTF-8 to TO_CODESET, with the analogous test for
   TO_CODESET being UTF-8.  */
812 if (c_strcasecmp (to_codeset, "UTF-8") == 0)
816 cd2 = iconv_open (to_codeset, "UTF-8");
817 if (cd2 == (iconv_t)(-1))
819 int saved_errno = errno;
820 if (cd1 != (iconv_t)(-1))
828 result = str_cd_iconveh (src, cd, cd1, cd2, handler);
832 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh. */
833 int saved_errno = errno;
/* CD1 and CD2 are acted on only when valid, i.e. not (iconv_t)(-1).  */
834 if (cd2 != (iconv_t)(-1))
836 if (cd1 != (iconv_t)(-1))
/* Close in the order CD2, CD1, CD.  A failing iconv_close is detected at
   each step.  */
843 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
845 /* Return NULL, but free the allocated memory, and while doing
846 that, preserve the errno from iconv_close. */
847 int saved_errno = errno;
848 if (cd1 != (iconv_t)(-1))
855 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
857 /* Return NULL, but free the allocated memory, and while doing
858 that, preserve the errno from iconv_close. */
859 int saved_errno = errno;
865 if (iconv_close (cd) < 0)
867 /* Return NULL, but free the allocated memory, and while doing
868 that, preserve the errno from iconv_close. */
869 int saved_errno = errno;
877 /* This is a different error code than if iconv_open existed but didn't
878 support from_codeset and to_codeset, so that the caller can emit
879 an error message such as
880 "iconv() is not supported. Installing GNU libiconv and
881 then reinstalling this package would fix this." */