1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 #include "localcharset.h"
/* Conversion state buffer private to this file.  mbrtowc first points its
   working state at the caller's PS (cast to char *) and can then redirect it
   to this buffer instead -- presumably when PS is NULL; the guarding test is
   not visible here, TODO confirm.  Byte 0 of whichever state is active is
   read into NSTATE; NOTE(review): that looks like the count of bytes buffered
   from earlier calls, with the remaining bytes holding them -- confirm
   against the full function.  */
30 static char internal_state[4];
33 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
35 char *pstate = (char *)ps;
38 pstate = internal_state;
52 size_t nstate = pstate[0];
89 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
90 mbtowc (NULL, NULL, 0);
93 int res = mbtowc (pwc, p, m);
97 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
99 if (nstate >= (res > 0 ? res : 1))
106 /* mbtowc does not distinguish between invalid and incomplete multibyte
107 sequences. But mbrtowc needs to make this distinction.
108 There are two possible approaches:
109 - Use iconv() and its return value.
110 - Use built-in knowledge about the possible encodings.
111 Given the low quality of implementation of iconv() on the systems that
112 lack mbrtowc(), we use the second approach.
113 The possible encodings are:
115 - UTF-8, EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS.
117 Use specialized code for each. */
118 if (m >= 4 || m >= MB_CUR_MAX)
120 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
122 const char *encoding = locale_charset ();
124 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
126 /* Cf. unistr/u8-mblen.c. */
127 unsigned char c = (unsigned char) p[0];
142 unsigned char c2 = (unsigned char) p[1];
144 if ((c2 ^ 0x80) < 0x40
145 && (c >= 0xe1 || c2 >= 0xa0)
146 && (c != 0xed || c2 < 0xa0))
154 else /* m == 2 || m == 3 */
156 unsigned char c2 = (unsigned char) p[1];
158 if ((c2 ^ 0x80) < 0x40
159 && (c >= 0xf1 || c2 >= 0x90)
160 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
166 unsigned char c3 = (unsigned char) p[2];
168 if ((c3 ^ 0x80) < 0x40)
178 /* As a reference for this code, you can use the GNU libiconv
179 implementation. Look for uses of the RET_TOOFEW macro. */
181 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
185 unsigned char c = (unsigned char) p[0];
187 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
192 unsigned char c = (unsigned char) p[0];
196 unsigned char c2 = (unsigned char) p[1];
198 if (c2 >= 0xa1 && c2 < 0xff)
204 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
205 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
206 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
210 unsigned char c = (unsigned char) p[0];
212 if (c >= 0xa1 && c < 0xff)
217 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
221 unsigned char c = (unsigned char) p[0];
223 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
226 else /* m == 2 || m == 3 */
228 unsigned char c = (unsigned char) p[0];
235 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
239 unsigned char c = (unsigned char) p[0];
241 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
242 || (c >= 0xf0 && c <= 0xf9))
248 /* An unknown multibyte encoding. */
255 /* Here 0 < k < m < 4. */
267 /* The conversion state is undefined, says POSIX. */