1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 #include "localcharset.h"
/* Requires sizeof (mbstate_t) to be at least 4, the size of internal_state
   below; mbrtowc accesses the caller's mbstate_t through a char pointer.
   NOTE(review): `verify' is presumably a compile-time assertion macro; its
   definition is not in this file -- confirm.  */
31 verify (sizeof (mbstate_t) >= 4);
/* Four bytes of file-local conversion state.  mbrtowc points its state
   pointer at this buffer (`pstate = internal_state').
   NOTE(review): presumably only when the caller passes ps == NULL -- the
   guarding condition should be confirmed.  */
33 static char internal_state[4];
/* Replacement for the standard mbrtowc: convert a multibyte character read
   from S (N bytes available) into a wide character stored through PWC,
   with conversion state kept in *PS.  The conversion itself is delegated to
   mbtowc; the encoding-specific tests further down exist because mbtowc
   cannot tell an incomplete sequence from an invalid one.  */
36 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* View the caller's state object as raw bytes.  */
38 char *pstate = (char *)ps;
/* Switch to the file-local state buffer.
   NOTE(review): presumably guarded by a ps == NULL test -- confirm.  */
41 pstate = internal_state;
/* The first state byte is read as a size.
   NOTE(review): looks like the count of bytes carried over from an earlier
   call -- confirm.  */
55 size_t nstate = pstate[0];
92 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
/* Per ISO C, calling mbtowc with a null string pointer puts mbtowc back into
   its initial conversion state.  */
93 mbtowc (NULL, NULL, 0);
/* Let the C library attempt the conversion of m bytes starting at p.
   NOTE(review): p and m presumably denote the byte sequence being examined
   and its length -- confirm against their declarations.  */
96 int res = mbtowc (pwc, p, m);
/* Consistency check on mbtowc: this condition is true when a wide character
   was stored and "stored character is null" disagrees with "res is 0".  */
100 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
/* Compares nstate with the byte count returned by mbtowc, substituting 1
   when res is not positive.  */
102 if (nstate >= (res > 0 ? res : 1))
109 /* mbtowc does not distinguish between invalid and incomplete multibyte
110 sequences. But mbrtowc needs to make this distinction.
111 There are two possible approaches:
112 - Use iconv() and its return value.
113 - Use built-in knowledge about the possible encodings.
114 Given the low quality of implementation of iconv() on the systems that
115 lack mbrtowc(), we use the second approach.
116 The possible encodings are:
118 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS,
120 Use specialized code for each. */
/* True when m is at least 4 or at least MB_CUR_MAX.
   NOTE(review): the comment right after it implies control leaves this path
   when the test succeeds -- confirm against the branch body.  */
121 if (m >= 4 || m >= MB_CUR_MAX)
123 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
/* Encoding name that the STREQ tests below compare against.
   NOTE(review): presumably the current locale's charset as reported by
   "localcharset.h" -- confirm locale_charset's contract there.  */
125 const char *encoding = locale_charset ();
/* NOTE(review): STREQ receives the name both as a string literal and
   character by character; presumably a string-equality macro -- confirm.  */
127 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
129 /* Cf. unistr/u8-mblen.c. */
/* First byte of the sequence, as an unsigned value.  */
130 unsigned char c = (unsigned char) p[0];
/* Second byte of the sequence.  */
145 unsigned char c2 = (unsigned char) p[1];
/* (c2 ^ 0x80) < 0x40 holds exactly when 0x80 <= c2 <= 0xBF.  The second
   clause requires c2 >= 0xA0 unless c >= 0xE1; the third requires
   c2 < 0xA0 when c == 0xED.
   NOTE(review): these look like the overlong-form and surrogate exclusions
   for a three-byte UTF-8 lead byte -- the test on c selecting this branch
   should be confirmed.  */
147 if ((c2 ^ 0x80) < 0x40
148 && (c >= 0xe1 || c2 >= 0xa0)
149 && (c != 0xed || c2 < 0xa0))
157 else /* m == 2 || m == 3 */
159 unsigned char c2 = (unsigned char) p[1];
/* Second byte must be in 0x80..0xBF; additionally c2 >= 0x90 unless
   c >= 0xF1, and c must be below 0xF4, or equal to 0xF4 with c2 < 0x90.
   NOTE(review): consistent with the four-byte UTF-8 rules (no overlong
   forms, nothing above U+10FFFF) -- confirm the enclosing test on c.  */
161 if ((c2 ^ 0x80) < 0x40
162 && (c >= 0xf1 || c2 >= 0x90)
163 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
/* Third byte of the sequence.  */
169 unsigned char c3 = (unsigned char) p[2];
/* Third byte must also be in 0x80..0xBF.  */
171 if ((c3 ^ 0x80) < 0x40)
181 /* As a reference for this code, you can use the GNU libiconv
182 implementation. Look for uses of the RET_TOOFEW macro. */
184 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188 unsigned char c = (unsigned char) p[0];
/* First-byte test: 0xA1..0xFE, or exactly 0x8E or 0x8F.  */
190 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
195 unsigned char c = (unsigned char) p[0];
199 unsigned char c2 = (unsigned char) p[1];
/* Second-byte test: 0xA1..0xFE.  */
201 if (c2 >= 0xa1 && c2 < 0xff)
/* EUC-KR, GB2312 and BIG5 share one code path.  */
207 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
208 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
209 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213 unsigned char c = (unsigned char) p[0];
/* First-byte test: 0xA1..0xFE.  */
215 if (c >= 0xa1 && c < 0xff)
220 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224 unsigned char c = (unsigned char) p[0];
/* First-byte test: 0xA1..0xFE, or exactly 0x8E.  */
226 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
229 else /* m == 2 || m == 3 */
231 unsigned char c = (unsigned char) p[0];
238 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
242 unsigned char c = (unsigned char) p[0];
/* First-byte test: 0x81..0x9F, 0xE0..0xEA, or 0xF0..0xF9.  */
244 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
245 || (c >= 0xf0 && c <= 0xf9))
251 /* An unknown multibyte encoding. */
258 /* Here 0 < k < m < 4. */
270 /* The conversion state is undefined, says POSIX. */