From: Bruno Haible
Date: Sun, 21 Jan 2007 22:59:19 +0000 (+0000)
Subject: New module 'striconveha'.
X-Git-Tag: cvs-readonly~1299
X-Git-Url: http://erislabs.net/gitweb/?a=commitdiff_plain;h=36a6f6825953d52a32710a6c38d3ef3a5870d3ac;p=gnulib.git

New module 'striconveha'.
---

diff --git a/ChangeLog b/ChangeLog
index 8996bd0b0..11c6a50bc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2007-01-21  Bruno Haible
 
+	* modules/striconveha: New file.
+	* lib/striconveha.h: New file.
+	* lib/striconveha.c: New file.
+	* MODULES.html.sh (Internationalization functions): Add striconveha.
+	* lib/striconv.c (str_iconv): Optimize the case of an empty input
+	string.
+	* lib/striconveh.c (mem_iconveh, str_iconveh): Likewise.
+
+2007-01-21  Bruno Haible
+
 	* lib/striconv.c (str_iconv): Guarantee errno is set when strdup fails.
 	* lib/striconveh.c (str_iconveh): Likewise.
 
diff --git a/MODULES.html.sh b/MODULES.html.sh
index eff713caa..b0b99b5b9 100755
--- a/MODULES.html.sh
+++ b/MODULES.html.sh
@@ -2144,6 +2144,7 @@ func_all_modules ()
   func_module striconv
   func_module xstriconv
   func_module striconveh
+  func_module striconveha
   func_module iconvme
   func_module localcharset
   func_module hard-locale
diff --git a/lib/striconv.c b/lib/striconv.c
index 7c4f549cf..3703e9c58 100644
--- a/lib/striconv.c
+++ b/lib/striconv.c
@@ -397,7 +397,7 @@ str_cd_iconv (const char *src, iconv_t cd)
 char *
 str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
 {
-  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
     {
       char *result = strdup (src);
 
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 235e60635..b02a182bb 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -772,7 +772,13 @@ mem_iconveh (const char *src, size_t srclen,
 	     enum iconv_ilseq_handler handler,
 	     char **resultp, size_t *lengthp)
 {
-  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+  if (srclen == 0)
+    {
+      /* Nothing to convert.  */
+      *lengthp = 0;
+      return 0;
+    }
+  else if (c_strcasecmp (from_codeset, to_codeset) == 0)
     {
       char *result;
 
@@ -919,7 +925,7 @@ str_iconveh (const char *src,
 	     const char *from_codeset, const char *to_codeset,
 	     enum iconv_ilseq_handler handler)
 {
-  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
     {
       char *result = strdup (src);
 
diff --git a/lib/striconveha.c b/lib/striconveha.c
new file mode 100644
index 000000000..9da18c93e
--- /dev/null
+++ b/lib/striconveha.c
@@ -0,0 +1,237 @@
+/* Character set conversion with error handling and autodetection.
+   Copyright (C) 2002, 2005, 2007 Free Software Foundation, Inc.
+   Written by Bruno Haible.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "striconveha.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))
+
+
+/* Autodetection list.  */
+
+struct autodetect_alias
+{
+  struct autodetect_alias *next;
+  const char *name;
+  const char * const *encodings_to_try;
+};
+
+static const char * const autodetect_utf8_try[] =
+{
+  /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
+     be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
+  "UTF-8", "ISO-8859-1",
+  NULL
+};
+static const char * const autodetect_jp_try[] =
+{
+  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
+     it will fail.
+     Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
+     is unavoidable. People will condemn SHIFT_JIS.
+     If we tried SHIFT_JIS first, then some short EUC-JP inputs would
+     come out wrong, and people would condemn EUC-JP and Unix, which
+     would not be good.
+     Finally try SHIFT_JIS.  */
+  "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
+  NULL
+};
+static const char * const autodetect_kr_try[] =
+{
+  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
+     it will fail.
+     Finally try EUC-KR.  */
+  "ISO-2022-KR", "EUC-KR",
+  NULL
+};
+
+static struct autodetect_alias autodetect_predefined[] =
+{
+  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
+  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
+  { NULL,                      "autodetect_kr",   autodetect_kr_try }
+};
+
+static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
+static struct autodetect_alias **autodetect_list_end =
+  &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
+
+/* Registers NAME as an autodetection alias for the NULL terminated list of
+   encodings TRY_IN_ORDER.  Both are copied, so the caller may free them.
+   Returns 0 upon success.  Returns -1 with errno = EINVAL if the list is
+   empty, or with errno = ENOMEM if memory allocation fails.  */
+int
+uniconv_register_autodetect (const char *name,
+			     const char * const *try_in_order)
+{
+  size_t namelen;
+  size_t listlen;
+  size_t memneed;
+  size_t i;
+  char *memory;
+  struct autodetect_alias *new_alias;
+  char *new_name;
+  const char **new_try_in_order;
+
+  /* The TRY_IN_ORDER list must not be empty.  */
+  if (try_in_order[0] == NULL)
+    {
+      errno = EINVAL;
+      return -1;
+    }
+
+  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
+     with dynamic extent.  */
+  namelen = strlen (name) + 1;
+  memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
+  for (i = 0; try_in_order[i] != NULL; i++)
+    memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
+  listlen = i;
+
+  /* A single malloc'ed block holds, in this order: the alias struct, the
+     pointer array with its NULL terminator, the name, the encoding names.  */
+  memory = (char *) malloc (memneed);
+  if (memory != NULL)
+    {
+      new_alias = (struct autodetect_alias *) memory;
+      memory += sizeof (struct autodetect_alias);
+
+      new_try_in_order = (const char **) memory;
+      memory += (listlen + 1) * sizeof (char *);
+
+      new_name = (char *) memory;
+      memcpy (new_name, name, namelen);
+      memory += namelen;
+
+      for (i = 0; i < listlen; i++)
+	{
+	  size_t len = strlen (try_in_order[i]) + 1;
+	  memcpy (memory, try_in_order[i], len);
+	  new_try_in_order[i] = (const char *) memory;
+	  memory += len;
+	}
+      new_try_in_order[i] = NULL;
+
+      /* Now insert the new alias.  */
+      new_alias->name = new_name;
+      new_alias->encodings_to_try = new_try_in_order;
+      new_alias->next = NULL;
+      /* FIXME: Not multithread-safe.  */
+      *autodetect_list_end = new_alias;
+      autodetect_list_end = &new_alias->next;
+      return 0;
+    }
+  else
+    {
+      errno = ENOMEM;
+      return -1;
+    }
+}
+
+/* Like mem_iconveh.  If that fails with EINVAL and FROM_CODESET is a
+   registered autodetection name, the encodings of that alias are tried in
+   order; the first result that is not an EILSEQ failure is returned.  */
+int
+mem_iconveha (const char *src, size_t srclen,
+	      const char *from_codeset, const char *to_codeset,
+	      enum iconv_ilseq_handler handler,
+	      char **resultp, size_t *lengthp)
+{
+  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
+			    resultp, lengthp);
+  if (retval >= 0 || errno != EINVAL)
+    return retval;
+  else
+    {
+      struct autodetect_alias *alias;
+
+      /* Unsupported from_codeset or to_codeset. Check whether the caller
+	 requested autodetection.  */
+      for (alias = autodetect_list; alias != NULL; alias = alias->next)
+	if (strcmp (from_codeset, alias->name) == 0)
+	  {
+	    const char * const *encodings = alias->encodings_to_try;
+
+	    do
+	      {
+		retval = mem_iconveha (src, srclen,
+				       *encodings, to_codeset, handler,
+				       resultp, lengthp);
+		if (!(retval < 0 && errno == EILSEQ))
+		  return retval;
+		encodings++;
+	      }
+	    while (*encodings != NULL);
+
+	    /* Return the last call's result.  */
+	    return -1;
+	  }
+
+      /* It wasn't an autodetection name.  */
+      errno = EINVAL;
+      return -1;
+    }
+}
+
+/* Like str_iconveh.  If that fails with EINVAL and FROM_CODESET is a
+   registered autodetection name, the encodings of that alias are tried in
+   order; the first result that is not an EILSEQ failure is returned.  */
+char *
+str_iconveha (const char *src,
+	      const char *from_codeset, const char *to_codeset,
+	      enum iconv_ilseq_handler handler)
+{
+  char *result = str_iconveh (src, from_codeset, to_codeset, handler);
+
+  if (result != NULL || errno != EINVAL)
+    return result;
+  else
+    {
+      struct autodetect_alias *alias;
+
+      /* Unsupported from_codeset or to_codeset. Check whether the caller
+	 requested autodetection.  */
+      for (alias = autodetect_list; alias != NULL; alias = alias->next)
+	if (strcmp (from_codeset, alias->name) == 0)
+	  {
+	    const char * const *encodings = alias->encodings_to_try;
+
+	    do
+	      {
+		result = str_iconveha (src, *encodings, to_codeset, handler);
+		if (!(result == NULL && errno == EILSEQ))
+		  return result;
+		encodings++;
+	      }
+	    while (*encodings != NULL);
+
+	    /* Return the last call's result.  */
+	    return NULL;
+	  }
+
+      /* It wasn't an autodetection name.  */
+      errno = EINVAL;
+      return NULL;
+    }
+}
diff --git a/lib/striconveha.h b/lib/striconveha.h
new file mode 100644
index 000000000..28fc7e6b1
--- /dev/null
+++ b/lib/striconveha.h
@@ -0,0 +1,80 @@
+/* Character set conversion with error handling and autodetection.
+   Copyright (C) 2002, 2005, 2007 Free Software Foundation, Inc.
+   Written by Bruno Haible.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef _STRICONVEHA_H
+#define _STRICONVEHA_H
+
+#include "striconveh.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is at [SRC,...,SRC+SRCLEN-1].
+   The "from" encoding can also be a name defined for autodetection.
+   *RESULTP and *LENGTHP should initially be a scratch buffer and its size,
+   or *RESULTP can initially be NULL.
+   May erase the contents of the memory at *RESULTP.
+   Return value: 0 if successful, otherwise -1 and errno set.
+   If successful: The resulting string is stored in *RESULTP and its length
+   in *LENGTHP.  *RESULTP is set to a freshly allocated memory block, or is
+   unchanged if no dynamic memory allocation was necessary.  */
+extern int
+       mem_iconveha (const char *src, size_t srclen,
+		     const char *from_codeset, const char *to_codeset,
+		     enum iconv_ilseq_handler handler,
+		     char **resultp, size_t *lengthp);
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   Both the "from" and the "to" encoding must use a single NUL byte at the
+   end of the string (i.e. not UCS-2, UCS-4, UTF-16, UTF-32).
+   The "from" encoding can also be a name defined for autodetection.
+   Allocate a malloced memory block for the result.
+   Return value: the freshly allocated resulting NUL-terminated string if
+   successful, otherwise NULL and errno set.  */
+extern char *
+       str_iconveha (const char *src,
+		     const char *from_codeset, const char *to_codeset,
+		     enum iconv_ilseq_handler handler);
+
+
+/* In the above, FROM_CODESET can also be one of the following values:
+     "autodetect_utf8"  supports ISO-8859-1 and UTF-8
+     "autodetect_jp"    supports EUC-JP, ISO-2022-JP-2 and SHIFT_JIS
+     "autodetect_kr"    supports EUC-KR and ISO-2022-KR
+   More names can be defined for autodetection.  */
+
+/* Registers an encoding name for autodetection.
+   TRY_IN_ORDER is a NULL terminated list of encodings to be tried.
+   Returns 0 upon success, or -1 (with errno set) in case of error.
+   Particular errno values: EINVAL, ENOMEM.  */
+extern int
+       uniconv_register_autodetect (const char *name,
+				    const char * const *try_in_order);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _STRICONVEHA_H */
diff --git a/modules/striconveha b/modules/striconveha
new file mode 100644
index 000000000..9b44e24d6
--- /dev/null
+++ b/modules/striconveha
@@ -0,0 +1,25 @@
+Description:
+Character set conversion of strings with error handling and autodetection,
+uses iconv.
+
+Files:
+lib/striconveha.h
+lib/striconveha.c
+
+Depends-on:
+striconveh
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += striconveha.h striconveha.c
+
+Include:
+"striconveha.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+