1 /* Character set conversion with error handling and autodetection.
2 Copyright (C) 2002, 2005, 2007 Free Software Foundation, Inc.
3 Written by Bruno Haible.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "striconveha.h"
30 #include "c-strcase.h"
/* Number of elements of A, computed as the size of the whole object divided
   by the size of its first element.  Only meaningful when A is a true array;
   applied to a pointer it divides the pointer size instead.
   NOTE(review): the parameter is not parenthesised inside a[0], so A should
   be a plain identifier.  */
32 #define SIZEOF(a) (sizeof(a)/sizeof(a[0]))
35 /* Autodetection list. */
/* Node type of a singly linked list of autodetection aliases.
   NOTE(review): this listing is incomplete -- the braces and the declaration
   of the `name' member (read through alias->name and written through
   new_alias->name further down in this file) are not visible here.  */
37 struct autodetect_alias
/* Following node of the list, or NULL for the last node.  */
39 struct autodetect_alias *next;
/* Encodings to attempt for this alias, in order.  The code that reads this
   array walks it until it reaches a NULL entry.  */
41 const char * const *encodings_to_try;
/* Encodings tried, in this order, for the "autodetect_utf8" alias.
   NOTE(review): the loops that consume such lists stop at a NULL entry; the
   terminating NULL and the braces are not visible in this listing -- confirm
   they are present in the real file.  */
44 static const char * const autodetect_utf8_try[] =
46 /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
47 be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */
48 "UTF-8", "ISO-8859-1",
/* Encodings tried, in this order, for the "autodetect_jp" alias.
   NOTE(review): the explanatory comment inside the initializer appears to
   have lost lines in this listing (sentences break off mid-way), and the
   terminating NULL entry that the consuming loops rely on is not visible --
   confirm both against the real file.  */
51 static const char * const autodetect_jp_try[] =
53 /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
55 Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
56 is unavoidable. People will condemn SHIFT_JIS.
57 If we tried SHIFT_JIS first, then some short EUC-JP inputs would
58 come out wrong, and people would condemn EUC-JP and Unix, which
60 Finally try SHIFT_JIS. */
61 "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
/* Encodings tried, in this order, for the "autodetect_kr" alias.
   NOTE(review): the comment inside the initializer appears truncated in this
   listing, and the terminating NULL entry that the consuming loops rely on
   is not visible -- confirm both against the real file.  */
64 static const char * const autodetect_kr_try[] =
66 /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
68 Finally try EUC-KR. */
69 "ISO-2022-KR", "EUC-KR",
/* The built-in aliases.  Each initializer starts with the link to the next
   node: every entry points at the array element after it and the last entry
   holds NULL, so this array is at the same time the initial linked list.
   The second and third initializer values are the alias name and the list
   of encodings to try for it.  */
73 static struct autodetect_alias autodetect_predefined[] =
75 { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
76 { &autodetect_predefined[2], "autodetect_jp", autodetect_jp_try },
77 { NULL, "autodetect_kr", autodetect_kr_try }
/* Head of the alias list; initially the first built-in entry.  */
80 static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
/* Address of the `next' member of the current last node (initially the last
   built-in entry).  uniconv_register_autodetect stores a new node through
   this pointer and then moves it to the new node's `next' member, so that
   appending does not require walking the list.  */
81 static struct autodetect_alias **autodetect_list_end =
82 &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
/* Register NAME as an autodetection alias standing for the encodings listed
   in TRY_IN_ORDER, a NULL-terminated array that must have at least one
   entry.  NAME and all strings of TRY_IN_ORDER are copied into one malloc'd
   chunk together with the list node, so the caller's storage does not have
   to outlive the call.  The new alias is appended at the end of the alias
   list.  Not multithread-safe (see the FIXME below).
   NOTE(review): this listing has gaps -- the return type, the braces, several
   local declarations, the return statements, the body of the empty-list
   check and the handling of a failed malloc are not visible here; confirm
   the return/errno contract against striconveha.h.  */
85 uniconv_register_autodetect (const char *name,
86 const char * const *try_in_order)
93 struct autodetect_alias *new_alias;
95 const char **new_try_in_order;
97 /* The TRY_IN_ORDER list must not be empty. */
98 if (try_in_order[0] == NULL)
104 /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
105 with dynamic extent. */
/* namelen includes the terminating NUL of NAME.  */
106 namelen = strlen (name) + 1;
/* Total size of the single allocation: the node, the copy of NAME, and one
   pointer slot for the NULL that terminates the copied list ...  */
107 memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
/* ... plus, for each list entry, one pointer slot and the string including
   its NUL.  */
108 for (i = 0; try_in_order[i] != NULL; i++)
109 memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
112 memory = (char *) malloc (memneed);
/* Carve the chunk up front to back: first the node, then the pointer array
   of listlen + 1 slots, then the string bytes (NAME first).
   NOTE(review): the assignment of listlen and the statements that advance
   `memory' past NAME and past each copied string are not visible in this
   listing; presumably listlen is the entry count from the loop above.  */
115 new_alias = (struct autodetect_alias *) memory;
116 memory += sizeof (struct autodetect_alias);
118 new_try_in_order = (const char **) memory;
119 memory += (listlen + 1) * sizeof (char *);
121 new_name = (char *) memory;
122 memcpy (new_name, name, namelen);
/* Copy each encoding name into the chunk and point the matching slot of the
   new pointer array at the copy.  */
125 for (i = 0; i < listlen; i++)
127 size_t len = strlen (try_in_order[i]) + 1;
128 memcpy (memory, try_in_order[i], len);
129 new_try_in_order[i] = (const char *) memory;
/* After the loop i equals listlen, so this fills the extra slot with the
   list terminator.  */
132 new_try_in_order[i] = NULL;
134 /* Now insert the new alias. */
135 new_alias->name = new_name;
136 new_alias->encodings_to_try = new_try_in_order;
137 new_alias->next = NULL;
138 /* FIXME: Not multithread-safe. */
/* Hook the node into the previous tail's `next' member, then make the new
   node's `next' member the place where the following registration goes.  */
139 *autodetect_list_end = new_alias;
140 autodetect_list_end = &new_alias->next;
150 /* Like mem_iconveha, except no handling of transliteration. */
/* Converts the SRCLEN bytes at SRC from FROM_CODESET to TO_CODESET by calling
   mem_iconveh.  When that call fails with errno == EINVAL, FROM_CODESET is
   looked up among the registered autodetection aliases and, if it is one,
   the alias's encodings are tried one after the other through recursive
   calls.
   NOTE(review): this listing has gaps -- the return type, the `offsets'
   parameter (it is passed on below), the braces, the `do' keywords of the
   do-while loops, the statement advancing `encodings', parts of the
   recursive calls' argument lists and all return statements are not visible
   here.  */
152 mem_iconveha_notranslit (const char *src, size_t srclen,
153 const char *from_codeset, const char *to_codeset,
154 enum iconv_ilseq_handler handler,
156 char **resultp, size_t *lengthp)
/* First treat FROM_CODESET as an ordinary encoding name.  */
158 int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
159 offsets, resultp, lengthp);
/* Success, or a failure for a reason other than EINVAL: autodetection does
   not apply.  (The branch body is not visible here; presumably it returns
   retval.)  */
160 if (retval >= 0 || errno != EINVAL)
164 struct autodetect_alias *alias;
166 /* Unsupported from_codeset or to_codeset. Check whether the caller
167 requested autodetection. */
/* Alias names are compared with strcmp, i.e. exactly and case-sensitively.  */
168 for (alias = autodetect_list; alias != NULL; alias = alias->next)
169 if (strcmp (from_codeset, alias->name) == 0)
171 const char * const *encodings;
/* If the caller's handler is not iconveh_error, make a first pass over all
   candidates with iconveh_error, so that a candidate which converts the
   input without any invalid sequence is found before the caller's own
   handler is used in the second pass below.  */
173 if (handler != iconveh_error)
175 /* First try all encodings without any forgiving. */
176 encodings = alias->encodings_to_try;
179 retval = mem_iconveha_notranslit (src, srclen,
180 *encodings, to_codeset,
181 iconveh_error, offsets,
/* Only an EILSEQ failure means "this candidate did not fit, try the next";
   success or any other error ends the search.  (The branch body is not
   visible here; presumably it returns retval.)  */
183 if (!(retval < 0 && errno == EILSEQ))
187 while (*encodings != NULL);
/* Second pass (the only one when handler is iconveh_error): start again from
   the first candidate.  NOTE(review): the trailing arguments of this call
   are not visible in this listing; presumably it passes the caller's
   handler -- confirm.  */
190 encodings = alias->encodings_to_try;
193 retval = mem_iconveha_notranslit (src, srclen,
194 *encodings, to_codeset,
197 if (!(retval < 0 && errno == EILSEQ))
201 while (*encodings != NULL);
203 /* Return the last call's result. */
207 /* It wasn't an autodetection name. */
/* Converts the SRCLEN bytes at SRC from FROM_CODESET to TO_CODESET, with
   autodetection aliases accepted for FROM_CODESET (the work is delegated to
   mem_iconveha_notranslit).  On platforms selected by the #if below, one
   code path first appends "//TRANSLIT" to TO_CODESET.
   NOTE(review): this listing has gaps -- the return type, at least one
   parameter line, the `offsets' parameter (it is passed on below), the
   braces, the "nothing to convert" test, the condition that selects the
   transliterating path, the matching #endif and several return statements
   are not visible here.  */
214 mem_iconveha (const char *src, size_t srclen,
215 const char *from_codeset, const char *to_codeset,
217 enum iconv_ilseq_handler handler,
219 char **resultp, size_t *lengthp)
223 /* Nothing to convert. */
228 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
229 we want to use transliteration. */
230 #if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
/* Build "<to_codeset>//TRANSLIT" in temporary storage.  The literal 10 is the
   length of "//TRANSLIT"; the extra 1 is its terminating NUL, which the
   second memcpy copies along with the suffix.
   NOTE(review): the result of allocsa is used by the very next statement
   without a NULL check -- confirm whether allocsa can return NULL on this
   platform and, if so, whether an ENOMEM path is needed.  */
234 size_t len = strlen (to_codeset);
235 char *to_codeset_suffixed = (char *) allocsa (len + 10 + 1);
236 memcpy (to_codeset_suffixed, to_codeset, len);
237 memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
239 retval = mem_iconveha_notranslit (src, srclen,
240 from_codeset, to_codeset_suffixed,
241 handler, offsets, resultp, lengthp);
/* The temporary codeset name is released as soon as the conversion call has
   returned.  */
243 freesa (to_codeset_suffixed);
/* Path without the "//TRANSLIT" suffix: TO_CODESET is passed on unchanged.  */
249 return mem_iconveha_notranslit (src, srclen,
250 from_codeset, to_codeset,
251 handler, offsets, resultp, lengthp);
254 /* Like str_iconveha, except no handling of transliteration. */
/* Converts the NUL-terminated string SRC from FROM_CODESET to TO_CODESET by
   calling str_iconveh.  When that call returns NULL with errno == EINVAL,
   FROM_CODESET is looked up among the registered autodetection aliases and,
   if it is one, the alias's encodings are tried one after the other through
   recursive calls.
   NOTE(review): this listing has gaps -- the return type, the braces, the
   `do' keywords of the do-while loops, the statement advancing `encodings',
   the trailing arguments of the recursive calls and all return statements
   are not visible here.  */
256 str_iconveha_notranslit (const char *src,
257 const char *from_codeset, const char *to_codeset,
258 enum iconv_ilseq_handler handler)
/* First treat FROM_CODESET as an ordinary encoding name.  */
260 char *result = str_iconveh (src, from_codeset, to_codeset, handler);
/* A non-NULL result, or a failure for a reason other than EINVAL:
   autodetection does not apply.  (The branch body is not visible here;
   presumably it returns result.)  */
262 if (result != NULL || errno != EINVAL)
266 struct autodetect_alias *alias;
268 /* Unsupported from_codeset or to_codeset. Check whether the caller
269 requested autodetection. */
/* Alias names are compared with strcmp, i.e. exactly and case-sensitively.  */
270 for (alias = autodetect_list; alias != NULL; alias = alias->next)
271 if (strcmp (from_codeset, alias->name) == 0)
273 const char * const *encodings;
/* If the caller's handler is not iconveh_error, a first pass over all
   candidates is made before the second pass below.  NOTE(review): the
   handler argument of the first-pass call is not visible in this listing;
   per the comment below it is presumably iconveh_error -- confirm.  */
275 if (handler != iconveh_error)
277 /* First try all encodings without any forgiving. */
278 encodings = alias->encodings_to_try;
281 result = str_iconveha_notranslit (src,
282 *encodings, to_codeset,
/* Only a NULL result with errno == EILSEQ means "this candidate did not fit,
   try the next"; a non-NULL result or any other error ends the search.  (The
   branch body is not visible here; presumably it returns result.)  */
284 if (!(result == NULL && errno == EILSEQ))
288 while (*encodings != NULL);
/* Second pass (the only one when handler is iconveh_error): start again from
   the first candidate.  NOTE(review): the trailing argument of this call is
   not visible in this listing; presumably it is the caller's handler --
   confirm.  */
291 encodings = alias->encodings_to_try;
294 result = str_iconveha_notranslit (src,
295 *encodings, to_codeset,
297 if (!(result == NULL && errno == EILSEQ))
301 while (*encodings != NULL);
303 /* Return the last call's result. */
307 /* It wasn't an autodetection name. */
/* Converts the NUL-terminated string SRC from FROM_CODESET to TO_CODESET,
   with autodetection aliases accepted for FROM_CODESET (the work is
   delegated to str_iconveha_notranslit).  On platforms selected by the #if
   below, one code path first appends "//TRANSLIT" to TO_CODESET.
   NOTE(review): this listing has gaps -- the return type, at least one
   parameter line, the braces, what happens after strdup (including a NULL
   result), the condition that selects the transliterating path, the matching
   #endif and several return statements are not visible here.  */
314 str_iconveha (const char *src,
315 const char *from_codeset, const char *to_codeset,
317 enum iconv_ilseq_handler handler)
/* Shortcut: an empty SRC, or codeset names that c_strcasecmp reports as
   equal, need no conversion; SRC is simply duplicated.  */
319 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
321 char *result = strdup (src);
328 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
329 we want to use transliteration. */
330 #if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
/* Build "<to_codeset>//TRANSLIT" in temporary storage.  The literal 10 is the
   length of "//TRANSLIT"; the extra 1 is its terminating NUL, which the
   second memcpy copies along with the suffix.
   NOTE(review): the result of allocsa is used by the very next statement
   without a NULL check -- confirm whether allocsa can return NULL on this
   platform and, if so, whether an ENOMEM path is needed.  */
334 size_t len = strlen (to_codeset);
335 char *to_codeset_suffixed = (char *) allocsa (len + 10 + 1);
336 memcpy (to_codeset_suffixed, to_codeset, len);
337 memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
339 result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
/* The temporary codeset name is released as soon as the conversion call has
   returned.  */
342 freesa (to_codeset_suffixed);
/* Path without the "//TRANSLIT" suffix: TO_CODESET is passed on unchanged.  */
348 return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);