lib/mbrtowc.c

   1 /* Convert multibyte character to wide character.
   2    Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2008.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include <wchar.h>
  22
  23 #include <errno.h>
  24 #include <stdlib.h>
  25
  26 #include "localcharset.h"
  27 #include "streq.h"
  28
  29
  30 static char internal_state[4];
  31
  32 size_t
  33 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  34 {
  35   char *pstate = (char *)ps;
  36
  37   if (pstate == NULL)
  38     pstate = internal_state;
  39
  40   if (s == NULL)
  41     {
  42       pwc = NULL;
  43       s = "";
  44       n = 1;
  45     }
  46
  47   if (n == 0)
  48     return (size_t)(-2);
  49
  50   /* Here n > 0.  */
  51   {
  52     size_t nstate = pstate[0];
  53     char buf[4];
  54     const char *p;
  55     size_t m;
  56
  57     switch (nstate)
  58       {
  59       case 0:
  60         p = s;
  61         m = n;
  62         break;
  63       case 3:
  64         buf[2] = pstate[3];
  65         /*FALLTHROUGH*/
  66       case 2:
  67         buf[1] = pstate[2];
  68         /*FALLTHROUGH*/
  69       case 1:
  70         buf[0] = pstate[1];
  71         p = buf;
  72         m = nstate;
  73         buf[m++] = s[0];
  74         if (n >= 2 && m < 4)
  75           {
  76             buf[m++] = s[1];
  77             if (n >= 3 && m < 4)
  78               buf[m++] = s[2];
  79           }
  80         break;
  81       default:
  82         errno = EINVAL;
  83         return (size_t)(-1);
  84       }
  85
  86     /* Here 0 < m ≤ 4.  */
  87
  88 #if __GLIBC__
  89     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
  90     mbtowc (NULL, NULL, 0);
  91 #endif
  92     {
  93       int res = mbtowc (pwc, p, m);
  94
  95       if (res >= 0)
  96         {
  97           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
  98             abort ();
  99           if (nstate >= (res > 0 ? res : 1))
 100             abort ();
 101           res -= nstate;
 102           pstate[0] = 0;
 103           return res;
 104         }
 105
 106       /* mbtowc does not distinguish between invalid and incomplete multibyte
 107          sequences.  But mbrtowc needs to make this distinction.
 108          There are two possible approaches:
 109            - Use iconv() and its return value.
 110            - Use built-in knowledge about the possible encodings.
 111          Given the low quality of implementation of iconv() on the systems that
 112          lack mbrtowc(), we use the second approach.
 113          The possible encodings are:
 114            - 8-bit encodings,
 115            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS,
 116            - UTF-8.
 117          Use specialized code for each.  */
 118       if (m >= 4 || m >= MB_CUR_MAX)
 119         goto invalid;
 120       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 121       {
 122         const char *encoding = locale_charset ();
 123
 124         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 125           {
 126             /* Cf. unistr/u8-mblen.c.  */
 127             unsigned char c = (unsigned char) p[0];
 128
 129             if (c >= 0xc2)
 130               {
 131                 if (c < 0xe0)
 132                   {
 133                     if (m == 1)
 134                       goto incomplete;
 135                   }
 136                 else if (c < 0xf0)
 137                   {
 138                     if (m == 1)
 139                       goto incomplete;
 140                     if (m == 2)
 141                       {
 142                         unsigned char c2 = (unsigned char) p[1];
 143
 144                         if ((c2 ^ 0x80) < 0x40
 145                             && (c >= 0xe1 || c2 >= 0xa0)
 146                             && (c != 0xed || c2 < 0xa0))
 147                           goto incomplete;
 148                       }
 149                   }
 150                 else if (c <= 0xf4)
 151                   {
 152                     if (m == 1)
 153                       goto incomplete;
 154                     else /* m == 2 || m == 3 */
 155                       {
 156                         unsigned char c2 = (unsigned char) p[1];
 157
 158                         if ((c2 ^ 0x80) < 0x40
 159                             && (c >= 0xf1 || c2 >= 0x90)
 160                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 161                           {
 162                             if (m == 2)
 163                               goto incomplete;
 164                             else /* m == 3 */
 165                               {
 166                                 unsigned char c3 = (unsigned char) p[2];
 167
 168                                 if ((c3 ^ 0x80) < 0x40)
 169                                   goto incomplete;
 170                               }
 171                           }
 172                       }
 173                   }
 174               }
 175             goto invalid;
 176           }
 177
 178         /* As a reference for this code, you can use the GNU libiconv
 179            implementation.  Look for uses of the RET_TOOFEW macro.  */
 180
 181         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 182           {
 183             if (m == 1)
 184               {
 185                 unsigned char c = (unsigned char) p[0];
 186
 187                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 188                   goto incomplete;
 189               }
 190             if (m == 2)
 191               {
 192                 unsigned char c = (unsigned char) p[0];
 193
 194                 if (c == 0x8f)
 195                   {
 196                     unsigned char c2 = (unsigned char) p[1];
 197
 198                     if (c2 >= 0xa1 && c2 < 0xff)
 199                       goto incomplete;
 200                   }
 201               }
 202             goto invalid;
 203           }
 204         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 205             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 206             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 207           {
 208             if (m == 1)
 209               {
 210                 unsigned char c = (unsigned char) p[0];
 211
 212                 if (c >= 0xa1 && c < 0xff)
 213                   goto incomplete;
 214               }
 215             goto invalid;
 216           }
 217         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 218           {
 219             if (m == 1)
 220               {
 221                 unsigned char c = (unsigned char) p[0];
 222
 223                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 224                   goto incomplete;
 225               }
 226             else /* m == 2 || m == 3 */
 227               {
 228                 unsigned char c = (unsigned char) p[0];
 229
 230                 if (c == 0x8e)
 231                   goto incomplete;
 232               }
 233             goto invalid;
 234           }
 235         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 236           {
 237             if (m == 1)
 238               {
 239                 unsigned char c = (unsigned char) p[0];
 240
 241                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 242                     || (c >= 0xf0 && c <= 0xf9))
 243                   goto incomplete;
 244               }
 245             goto invalid;
 246           }
 247
 248         /* An unknown multibyte encoding.  */
 249         goto incomplete;
 250       }
 251
 252      incomplete:
 253       {
 254         size_t k = nstate;
 255         /* Here 0 < k < m < 4.  */
 256         pstate[++k] = s[0];
 257         if (k < m)
 258           pstate[++k] = s[1];
 259         if (k != m)
 260           abort ();
 261       }
 262       pstate[0] = m;
 263       return (size_t)(-2);
 264
 265      invalid:
 266       errno = EILSEQ;
 267       /* The conversion state is undefined, says POSIX.  */
 268       return (size_t)(-1);
 269     }
 270   }
 271 }