From 2c11b49f00662697a6b27da0beb86adb4b6fae97 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sun, 21 Dec 2008 02:55:00 +0100 Subject: [PATCH] Work around mbrtowc bugs on AIX, HP-UX, OSF/1, Solaris. --- ChangeLog | 28 ++++ doc/posix-functions/mbrtowc.texi | 16 +++ lib/mbrtowc.c | 89 ++++++++++++- lib/wchar.in.h | 15 ++- m4/mbrtowc.m4 | 275 ++++++++++++++++++++++++++++++++++++++- m4/mbsinit.m4 | 8 +- m4/wchar.m4 | 5 +- modules/mbrtowc | 4 + modules/mbsinit | 1 + modules/wchar | 3 + 10 files changed, 431 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index b5ee9e7be..972a94071 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,33 @@ 2008-12-20 Bruno Haible + Work around mbrtowc bugs on AIX, HP-UX, OSF/1, Solaris. + * lib/wchar.in.h (mbstate_t): Redefine also if REPLACE_MBSTATE_T is + set. + (GNULIB_defined_mbstate_t): New macro. + (mbsinit): Redefine if REPLACE_MBSINIT is set. + (mbrtowc): Redefine if REPLACE_MBRTOWC is set. + * lib/mbrtowc.c (rpl_mbrtowc): Add an alternative implementation that + reuses the system's mbrtowc function but works around the bugs. + * m4/mbrtowc.m4 (gl_MBSTATE_T_BROKEN, gl_MBRTOWC_INCOMPLETE_STATE, + gl_MBRTOWC_NULL_ARG, gl_MBRTOWC_RETVAL, gl_MBRTOWC_NUL_RETVAL): New + macros. + (gl_FUNC_MBRTOWC): Invoke them. Set REPLACE_MBRTOWC if mbrtowc needs to + be overridden. Optionally define MBRTOWC_NULL_ARG_BUG, + MBRTOWC_RETVAL_BUG, MBRTOWC_NUL_RETVAL_BUG. + * m4/mbsinit.m4 (gl_FUNC_MBSINIT): Invoke gl_MBSTATE_T_BROKEN. Set + REPLACE_MBSINIT if mbsinit needs to be overridden. + * m4/wchar.m4 (gl_WCHAR_H_DEFAULTS): Initialize REPLACE_MBSTATE_T, + REPLACE_MBSINIT, REPLACE_MBRTOWC. + * modules/wchar (Makefile.am): Substitute REPLACE_MBSTATE_T, + REPLACE_MBSINIT, REPLACE_MBRTOWC. + * modules/mbrtowc (Files): Add m4/locale-fr.m4, m4/locale-ja.m4, + m4/locale-zh.m4. + (Depends): Add mbsinit. + * modules/mbsinit (Depends): Add mbrtowc. + * doc/posix-functions/mbrtowc.texi: Mention the various bugs. 
+ +2008-12-20 Bruno Haible + * tests/test-mbrtowc.c (main): Change sample string in EUC-JP encoding so that there are no conversion errors on AIX. * tests/test-mbsrtowcs.c (main): LIkewise. diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi index 53126aea5..d75888804 100644 --- a/doc/posix-functions/mbrtowc.texi +++ b/doc/posix-functions/mbrtowc.texi @@ -11,6 +11,22 @@ Portability problems fixed by Gnulib: @item This function is missing on some platforms: HP-UX 11, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5. +@item +This function does not put the state into non-initial state when parsing an +incomplete multibyte character on some platforms: +AIX 5.1, OSF/1 5.1. +@item +This function does not ignore the @code{pwc} argument if the string argument is +NULL on some platforms: +OSF/1 5.1. +@item +This function returns the total number of bytes that make up the multibyte +character, not the number of bytes that were needed to complete the multibyte +character, on some platforms: +HP-UX 11.11, Solaris 10. +@item +This function may not return 0 when parsing the NUL character on some platforms: +Solaris 9. @end itemize Portability problems not fixed by Gnulib: diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c index e5ae0bcc3..603f00609 100644 --- a/lib/mbrtowc.c +++ b/lib/mbrtowc.c @@ -20,12 +20,15 @@ /* Specification. */ #include -#include -#include +#if GNULIB_defined_mbstate_t +/* Implement mbrtowc() on top of mbtowc(). */ -#include "localcharset.h" -#include "streq.h" -#include "verify.h" +# include +# include + +# include "localcharset.h" +# include "streq.h" +# include "verify.h" verify (sizeof (mbstate_t) >= 4); @@ -88,10 +91,10 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) /* Here 0 < m ≤ 4. 
*/ -#if __GLIBC__ +# if __GLIBC__ /* Work around bug */ mbtowc (NULL, NULL, 0); -#endif +# endif { int res = mbtowc (pwc, p, m); @@ -272,3 +275,75 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } } } + +#else +/* Override the system's mbrtowc() function. */ + +# undef mbrtowc + +size_t +rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) +{ +# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG + if (s == NULL) + { + pwc = NULL; + s = ""; + n = 1; + } +# endif + +# if MBRTOWC_RETVAL_BUG + { + static mbstate_t internal_state; + + /* Override mbrtowc's internal state. We can not call mbsinit() on the + hidden internal state, but we can call it on our variable. */ + if (ps == NULL) + ps = &internal_state; + + if (!mbsinit (ps)) + { + /* Parse the rest of the multibyte character byte for byte. */ + size_t count = 0; + for (; n > 0; s++, n--) + { + wchar_t wc; + size_t ret = mbrtowc (&wc, s, 1, ps); + + if (ret == (size_t)(-1)) + return (size_t)(-1); + count++; + if (ret != (size_t)(-2)) + { + /* The multibyte character has been completed. */ + if (pwc != NULL) + *pwc = wc; + return (wc == 0 ? 0 : count); + } + } + return (size_t)(-2); + } + } +# endif + +# if MBRTOWC_NUL_RETVAL_BUG + { + wchar_t wc; + size_t ret = mbrtowc (&wc, s, n, ps); + + if (ret != (size_t)(-1) && ret != (size_t)(-2)) + { + if (pwc != NULL) + *pwc = wc; + if (wc == 0) + ret = 0; + } + return ret; + } +# else + return mbrtowc (pwc, s, n, ps); +# endif +} + +#endif diff --git a/lib/wchar.in.h b/lib/wchar.in.h index e4647331e..62844af72 100644 --- a/lib/wchar.in.h +++ b/lib/wchar.in.h @@ -74,10 +74,11 @@ extern "C" { /* Override mbstate_t if it is too small. On IRIX 6.5, sizeof (mbstate_t) == 1, which is not sufficient for implementing mbrtowc for encodings like UTF-8. 
*/ -#if !(@HAVE_MBSINIT@ && @HAVE_MBRTOWC@) +#if !(@HAVE_MBSINIT@ && @HAVE_MBRTOWC@) || @REPLACE_MBSTATE_T@ typedef int rpl_mbstate_t; # undef mbstate_t # define mbstate_t rpl_mbstate_t +# define GNULIB_defined_mbstate_t 1 #endif @@ -116,7 +117,11 @@ extern int wctob (wint_t wc); /* Test whether *PS is in the initial state. */ #if @GNULIB_MBSINIT@ -# if !@HAVE_MBSINIT@ +# if @REPLACE_MBSINIT@ +# undef mbsinit +# define mbsinit rpl_mbsinit +# endif +# if !@HAVE_MBSINIT@ || @REPLACE_MBSINIT@ extern int mbsinit (const mbstate_t *ps); # endif #elif defined GNULIB_POSIXCHECK @@ -130,7 +135,11 @@ extern int mbsinit (const mbstate_t *ps); /* Convert a multibyte character to a wide character. */ #if @GNULIB_MBRTOWC@ -# if !@HAVE_MBRTOWC@ +# if @REPLACE_MBRTOWC@ +# undef mbrtowc +# define mbrtowc rpl_mbrtowc +# endif +# if !@HAVE_MBRTOWC@ || @REPLACE_MBRTOWC@ extern size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps); # endif #elif defined GNULIB_POSIXCHECK diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4 index 099b748fd..6eeacc4e2 100644 --- a/m4/mbrtowc.m4 +++ b/m4/mbrtowc.m4 @@ -1,4 +1,4 @@ -# mbrtowc.m4 serial 10 +# mbrtowc.m4 serial 11 dnl Copyright (C) 2001-2002, 2004-2005, 2008 Free Software Foundation, Inc. 
dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -9,15 +9,288 @@ AC_DEFUN([gl_FUNC_MBRTOWC], AC_REQUIRE([gl_WCHAR_H_DEFAULTS]) AC_REQUIRE([AC_TYPE_MBSTATE_T]) + gl_MBSTATE_T_BROKEN + if test $REPLACE_MBSTATE_T = 1; then + REPLACE_MBRTOWC=1 + fi AC_CHECK_FUNCS_ONCE([mbrtowc]) if test $ac_cv_func_mbrtowc = no; then HAVE_MBRTOWC=0 + fi + if test $HAVE_MBRTOWC != 0 && test $REPLACE_MBRTOWC != 1; then + gl_MBRTOWC_NULL_ARG + gl_MBRTOWC_RETVAL + gl_MBRTOWC_NUL_RETVAL + case "$gl_cv_func_mbrtowc_null_arg" in + *yes) ;; + *) AC_DEFINE([MBRTOWC_NULL_ARG_BUG], [1], + [Define if the mbrtowc function has the NULL string argument bug.]) + REPLACE_MBRTOWC=1 + ;; + esac + case "$gl_cv_func_mbrtowc_retval" in + *yes) ;; + *) AC_DEFINE([MBRTOWC_RETVAL_BUG], [1], + [Define if the mbrtowc function returns a wrong return value.]) + REPLACE_MBRTOWC=1 + ;; + esac + case "$gl_cv_func_mbrtowc_nul_retval" in + *yes) ;; + *) AC_DEFINE([MBRTOWC_NUL_RETVAL_BUG], [1], + [Define if the mbrtowc function does not return 0 for a NUL character.]) + REPLACE_MBRTOWC=1 + ;; + esac + fi + if test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1; then gl_REPLACE_WCHAR_H AC_LIBOBJ([mbrtowc]) gl_PREREQ_MBRTOWC fi ]) +dnl Test whether mbsinit() and mbrtowc() need to be overridden in a way that +dnl redefines the semantics of the given mbstate_t type. +dnl Result is REPLACE_MBSTATE_T. +dnl When this is set to 1, we replace both mbsinit() and mbrtowc(), in order to +dnl avoid inconsistencies. 
+ +AC_DEFUN([gl_MBSTATE_T_BROKEN], +[ + AC_REQUIRE([gl_WCHAR_H_DEFAULTS]) + + AC_REQUIRE([AC_TYPE_MBSTATE_T]) + AC_CHECK_FUNCS_ONCE([mbsinit]) + AC_CHECK_FUNCS_ONCE([mbrtowc]) + if test $ac_cv_func_mbsinit = yes && test $ac_cv_func_mbrtowc = yes; then + gl_MBRTOWC_INCOMPLETE_STATE + case "$gl_cv_func_mbrtowc_incomplete_state" in + *yes) REPLACE_MBSTATE_T=0 ;; + *) REPLACE_MBSTATE_T=1 ;; + esac + else + REPLACE_MBSTATE_T=1 + fi + if test $REPLACE_MBSTATE_T = 1; then + gl_REPLACE_WCHAR_H + fi +]) + +dnl Test whether mbrtowc puts the state into non-initial state when parsing an +dnl incomplete multibyte character. +dnl Result is gl_cv_func_mbrtowc_incomplete_state. + +AC_DEFUN([gl_MBRTOWC_INCOMPLETE_STATE], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([gt_LOCALE_JA]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtowc handles incomplete characters], + [gl_cv_func_mbrtowc_incomplete_state], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. +changequote(,)dnl + case "$host_os" in + # Guess no on AIX and OSF/1. + aix* | osf*) gl_cv_func_mbrtowc_incomplete_state="guessing no" ;; + # Guess yes otherwise. + *) gl_cv_func_mbrtowc_incomplete_state="guessing yes" ;; + esac +changequote([,])dnl + if test $LOCALE_JA != none; then + AC_TRY_RUN([ +#include <locale.h> +#include <string.h> +#include <wchar.h> +int main () +{ + if (setlocale (LC_ALL, "$LOCALE_JA") != NULL) + { + const char input[] = "B\217\253\344\217\251\316er"; /* "Büßer" */ + mbstate_t state; + wchar_t wc; + int ret; + + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, input + 1, 1, &state) == (size_t)(-2)) + if (mbsinit (&state)) + return 1; + } + return 0; +}], + [gl_cv_func_mbrtowc_incomplete_state=yes], + [gl_cv_func_mbrtowc_incomplete_state=no], + []) + fi + ]) +]) + +dnl Test whether mbrtowc supports a NULL string argument correctly. +dnl Result is gl_cv_func_mbrtowc_null_arg. 
+ +AC_DEFUN([gl_MBRTOWC_NULL_ARG], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([gt_LOCALE_FR_UTF8]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtowc handles a NULL string argument], + [gl_cv_func_mbrtowc_null_arg], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. +changequote(,)dnl + case "$host_os" in + # Guess no on OSF/1. + osf*) gl_cv_func_mbrtowc_null_arg="guessing no" ;; + # Guess yes otherwise. + *) gl_cv_func_mbrtowc_null_arg="guessing yes" ;; + esac +changequote([,])dnl + if test $LOCALE_FR_UTF8 != none; then + AC_TRY_RUN([ +#include <locale.h> +#include <string.h> +#include <wchar.h> +int main () +{ + if (setlocale (LC_ALL, "$LOCALE_FR_UTF8") != NULL) + { + mbstate_t state; + wchar_t wc; + int ret; + + memset (&state, '\0', sizeof (mbstate_t)); + wc = (wchar_t) 0xBADFACE; + mbrtowc (&wc, NULL, 5, &state); + /* Check that wc was not modified. */ + if (wc != (wchar_t) 0xBADFACE) + return 1; + } + return 0; +}], [gl_cv_func_mbrtowc_null_arg=yes], [gl_cv_func_mbrtowc_null_arg=no], []) + fi + ]) +]) + +dnl Test whether mbrtowc, when parsing the end of a multibyte character, +dnl correctly returns the number of bytes that were needed to complete the +dnl character (not the total number of bytes of the multibyte character). +dnl Result is gl_cv_func_mbrtowc_retval. + +AC_DEFUN([gl_MBRTOWC_RETVAL], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([gt_LOCALE_FR_UTF8]) + AC_REQUIRE([gt_LOCALE_JA]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtowc has a correct return value], + [gl_cv_func_mbrtowc_retval], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. +changequote(,)dnl + case "$host_os" in + # Guess no on HP-UX and Solaris. + hpux* | solaris*) gl_cv_func_mbrtowc_retval="guessing no" ;; + # Guess yes otherwise. 
+ *) gl_cv_func_mbrtowc_retval="guessing yes" ;; + esac +changequote([,])dnl + if test $LOCALE_FR_UTF8 != none || test $LOCALE_JA != none; then + AC_TRY_RUN([ +#include <locale.h> +#include <string.h> +#include <wchar.h> +int main () +{ + /* This fails on Solaris. */ + if (setlocale (LC_ALL, "$LOCALE_FR_UTF8") != NULL) + { + char input[] = "B\303\274\303\237er"; /* "Büßer" */ + mbstate_t state; + wchar_t wc; + + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, input + 1, 1, &state) == (size_t)(-2)) + { + input[1] = '\0'; + if (mbrtowc (&wc, input + 2, 5, &state) != 1) + return 1; + } + } + /* This fails on HP-UX 11.11. */ + if (setlocale (LC_ALL, "$LOCALE_JA") != NULL) + { + char input[] = "B\217\253\344\217\251\316er"; /* "Büßer" */ + mbstate_t state; + wchar_t wc; + + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, input + 1, 1, &state) == (size_t)(-2)) + { + input[1] = '\0'; + if (mbrtowc (&wc, input + 2, 5, &state) != 2) + return 1; + } + } + return 0; +}], + [gl_cv_func_mbrtowc_retval=yes], + [gl_cv_func_mbrtowc_retval=no], + []) + fi + ]) +]) + +dnl Test whether mbrtowc, when parsing a NUL character, correctly returns 0. +dnl Result is gl_cv_func_mbrtowc_nul_retval. + +AC_DEFUN([gl_MBRTOWC_NUL_RETVAL], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([gt_LOCALE_ZH_CN]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtowc returns 0 when parsing a NUL character], + [gl_cv_func_mbrtowc_nul_retval], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. +changequote(,)dnl + case "$host_os" in + # Guess no on Solaris 9. + solaris2.9) gl_cv_func_mbrtowc_nul_retval="guessing no" ;; + # Guess yes otherwise. + *) gl_cv_func_mbrtowc_nul_retval="guessing yes" ;; + esac +changequote([,])dnl + if test $LOCALE_ZH_CN != none; then + AC_TRY_RUN([ +#include <locale.h> +#include <string.h> +#include <wchar.h> +int main () +{ + /* This fails on Solaris 9. 
*/ + if (setlocale (LC_ALL, "$LOCALE_ZH_CN") != NULL) + { + mbstate_t state; + wchar_t wc; + + memset (&state, '\0', sizeof (mbstate_t)); + if (mbrtowc (&wc, "", 1, &state) != 0) + return 1; + } + return 0; +}], + [gl_cv_func_mbrtowc_nul_retval=yes], + [gl_cv_func_mbrtowc_nul_retval=no], + []) + fi + ]) +]) + # Prerequisites of lib/mbrtowc.c. AC_DEFUN([gl_PREREQ_MBRTOWC], [ : diff --git a/m4/mbsinit.m4 b/m4/mbsinit.m4 index d7d547541..03b055cd8 100644 --- a/m4/mbsinit.m4 +++ b/m4/mbsinit.m4 @@ -1,4 +1,4 @@ -# mbsinit.m4 serial 2 +# mbsinit.m4 serial 3 dnl Copyright (C) 2008 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -9,9 +9,15 @@ AC_DEFUN([gl_FUNC_MBSINIT], AC_REQUIRE([gl_WCHAR_H_DEFAULTS]) AC_REQUIRE([AC_TYPE_MBSTATE_T]) + gl_MBSTATE_T_BROKEN + if test $REPLACE_MBSTATE_T = 1; then + REPLACE_MBSINIT=1 + fi AC_CHECK_FUNCS_ONCE([mbsinit]) if test $ac_cv_func_mbsinit = no; then HAVE_MBSINIT=0 + fi + if test $HAVE_MBSINIT = 0 || test $REPLACE_MBSINIT = 1; then gl_REPLACE_WCHAR_H AC_LIBOBJ([mbsinit]) gl_PREREQ_MBSINIT diff --git a/m4/wchar.m4 b/m4/wchar.m4 index 02e7f188c..0809b3c79 100644 --- a/m4/wchar.m4 +++ b/m4/wchar.m4 @@ -7,7 +7,7 @@ dnl with or without modifications, as long as this notice is preserved. dnl Written by Eric Blake. 
-# wchar.m4 serial 13 +# wchar.m4 serial 14 AC_DEFUN([gl_WCHAR_H], [ @@ -76,7 +76,10 @@ AC_DEFUN([gl_WCHAR_H_DEFAULTS], HAVE_MBSRTOWCS=1; AC_SUBST([HAVE_MBSRTOWCS]) HAVE_DECL_WCTOB=1; AC_SUBST([HAVE_DECL_WCTOB]) HAVE_DECL_WCWIDTH=1; AC_SUBST([HAVE_DECL_WCWIDTH]) + REPLACE_MBSTATE_T=0; AC_SUBST([REPLACE_MBSTATE_T]) REPLACE_WCTOB=0; AC_SUBST([REPLACE_WCTOB]) + REPLACE_MBSINIT=0; AC_SUBST([REPLACE_MBSINIT]) + REPLACE_MBRTOWC=0; AC_SUBST([REPLACE_MBRTOWC]) REPLACE_WCWIDTH=0; AC_SUBST([REPLACE_WCWIDTH]) WCHAR_H=''; AC_SUBST([WCHAR_H]) ]) diff --git a/modules/mbrtowc b/modules/mbrtowc index c17fbd584..b4f4ce1f9 100644 --- a/modules/mbrtowc +++ b/modules/mbrtowc @@ -5,9 +5,13 @@ Files: lib/mbrtowc.c m4/mbrtowc.m4 m4/mbstate_t.m4 +m4/locale-fr.m4 +m4/locale-ja.m4 +m4/locale-zh.m4 Depends-on: wchar +mbsinit localcharset streq verify diff --git a/modules/mbsinit b/modules/mbsinit index 3f7cc3731..11c476cab 100644 --- a/modules/mbsinit +++ b/modules/mbsinit @@ -8,6 +8,7 @@ m4/mbstate_t.m4 Depends-on: wchar +mbrtowc verify extensions diff --git a/modules/wchar b/modules/wchar index f0bbe9277..716e7f280 100644 --- a/modules/wchar +++ b/modules/wchar @@ -40,7 +40,10 @@ wchar.h: wchar.in.h -e 's|@''HAVE_MBSRTOWCS''@|$(HAVE_MBSRTOWCS)|g' \ -e 's|@''HAVE_DECL_WCTOB''@|$(HAVE_DECL_WCTOB)|g' \ -e 's|@''HAVE_DECL_WCWIDTH''@|$(HAVE_DECL_WCWIDTH)|g' \ + -e 's|@''REPLACE_MBSTATE_T''@|$(REPLACE_MBSTATE_T)|g' \ -e 's|@''REPLACE_WCTOB''@|$(REPLACE_WCTOB)|g' \ + -e 's|@''REPLACE_MBSINIT''@|$(REPLACE_MBSINIT)|g' \ + -e 's|@''REPLACE_MBRTOWC''@|$(REPLACE_MBRTOWC)|g' \ -e 's|@''REPLACE_WCWIDTH''@|$(REPLACE_WCWIDTH)|g' \ -e '/definition of GL_LINK_WARNING/r $(LINK_WARNING_H)' \ < $(srcdir)/wchar.in.h; \ -- 2.11.0