lib/mbrtowc.c

   1 /* Convert multibyte character to wide character.
   2    Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2008.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include <wchar.h>
  22
  23 #include <errno.h>
  24 #include <stdlib.h>
  25
  26 #include "localcharset.h"
  27 #include "streq.h"
  28 #include "verify.h"
  29
  30
  31 verify (sizeof (mbstate_t) >= 4);
  32
  33 static char internal_state[4];
  34
  35 size_t
  36 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  37 {
  38   char *pstate = (char *)ps;
  39
  40   if (pstate == NULL)
  41     pstate = internal_state;
  42
  43   if (s == NULL)
  44     {
  45       pwc = NULL;
  46       s = "";
  47       n = 1;
  48     }
  49
  50   if (n == 0)
  51     return (size_t)(-2);
  52
  53   /* Here n > 0.  */
  54   {
  55     size_t nstate = pstate[0];
  56     char buf[4];
  57     const char *p;
  58     size_t m;
  59
  60     switch (nstate)
  61       {
  62       case 0:
  63         p = s;
  64         m = n;
  65         break;
  66       case 3:
  67         buf[2] = pstate[3];
  68         /*FALLTHROUGH*/
  69       case 2:
  70         buf[1] = pstate[2];
  71         /*FALLTHROUGH*/
  72       case 1:
  73         buf[0] = pstate[1];
  74         p = buf;
  75         m = nstate;
  76         buf[m++] = s[0];
  77         if (n >= 2 && m < 4)
  78           {
  79             buf[m++] = s[1];
  80             if (n >= 3 && m < 4)
  81               buf[m++] = s[2];
  82           }
  83         break;
  84       default:
  85         errno = EINVAL;
  86         return (size_t)(-1);
  87       }
  88
  89     /* Here 0 < m ≤ 4.  */
  90
  91 #if __GLIBC__
  92     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
  93     mbtowc (NULL, NULL, 0);
  94 #endif
  95     {
  96       int res = mbtowc (pwc, p, m);
  97
  98       if (res >= 0)
  99         {
 100           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
 101             abort ();
 102           if (nstate >= (res > 0 ? res : 1))
 103             abort ();
 104           res -= nstate;
 105           pstate[0] = 0;
 106           return res;
 107         }
 108
 109       /* mbtowc does not distinguish between invalid and incomplete multibyte
 110          sequences.  But mbrtowc needs to make this distinction.
 111          There are two possible approaches:
 112            - Use iconv() and its return value.
 113            - Use built-in knowledge about the possible encodings.
 114          Given the low quality of implementation of iconv() on the systems that
 115          lack mbrtowc(), we use the second approach.
 116          The possible encodings are:
 117            - 8-bit encodings,
 118            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS,
 119            - UTF-8.
 120          Use specialized code for each.  */
 121       if (m >= 4 || m >= MB_CUR_MAX)
 122         goto invalid;
 123       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 124       {
 125         const char *encoding = locale_charset ();
 126
 127         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 128           {
 129             /* Cf. unistr/u8-mblen.c.  */
 130             unsigned char c = (unsigned char) p[0];
 131
 132             if (c >= 0xc2)
 133               {
 134                 if (c < 0xe0)
 135                   {
 136                     if (m == 1)
 137                       goto incomplete;
 138                   }
 139                 else if (c < 0xf0)
 140                   {
 141                     if (m == 1)
 142                       goto incomplete;
 143                     if (m == 2)
 144                       {
 145                         unsigned char c2 = (unsigned char) p[1];
 146
 147                         if ((c2 ^ 0x80) < 0x40
 148                             && (c >= 0xe1 || c2 >= 0xa0)
 149                             && (c != 0xed || c2 < 0xa0))
 150                           goto incomplete;
 151                       }
 152                   }
 153                 else if (c <= 0xf4)
 154                   {
 155                     if (m == 1)
 156                       goto incomplete;
 157                     else /* m == 2 || m == 3 */
 158                       {
 159                         unsigned char c2 = (unsigned char) p[1];
 160
 161                         if ((c2 ^ 0x80) < 0x40
 162                             && (c >= 0xf1 || c2 >= 0x90)
 163                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 164                           {
 165                             if (m == 2)
 166                               goto incomplete;
 167                             else /* m == 3 */
 168                               {
 169                                 unsigned char c3 = (unsigned char) p[2];
 170
 171                                 if ((c3 ^ 0x80) < 0x40)
 172                                   goto incomplete;
 173                               }
 174                           }
 175                       }
 176                   }
 177               }
 178             goto invalid;
 179           }
 180
 181         /* As a reference for this code, you can use the GNU libiconv
 182            implementation.  Look for uses of the RET_TOOFEW macro.  */
 183
 184         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 185           {
 186             if (m == 1)
 187               {
 188                 unsigned char c = (unsigned char) p[0];
 189
 190                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 191                   goto incomplete;
 192               }
 193             if (m == 2)
 194               {
 195                 unsigned char c = (unsigned char) p[0];
 196
 197                 if (c == 0x8f)
 198                   {
 199                     unsigned char c2 = (unsigned char) p[1];
 200
 201                     if (c2 >= 0xa1 && c2 < 0xff)
 202                       goto incomplete;
 203                   }
 204               }
 205             goto invalid;
 206           }
 207         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 208             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 209             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 210           {
 211             if (m == 1)
 212               {
 213                 unsigned char c = (unsigned char) p[0];
 214
 215                 if (c >= 0xa1 && c < 0xff)
 216                   goto incomplete;
 217               }
 218             goto invalid;
 219           }
 220         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 221           {
 222             if (m == 1)
 223               {
 224                 unsigned char c = (unsigned char) p[0];
 225
 226                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 227                   goto incomplete;
 228               }
 229             else /* m == 2 || m == 3 */
 230               {
 231                 unsigned char c = (unsigned char) p[0];
 232
 233                 if (c == 0x8e)
 234                   goto incomplete;
 235               }
 236             goto invalid;
 237           }
 238         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 239           {
 240             if (m == 1)
 241               {
 242                 unsigned char c = (unsigned char) p[0];
 243
 244                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 245                     || (c >= 0xf0 && c <= 0xf9))
 246                   goto incomplete;
 247               }
 248             goto invalid;
 249           }
 250
 251         /* An unknown multibyte encoding.  */
 252         goto incomplete;
 253       }
 254
 255      incomplete:
 256       {
 257         size_t k = nstate;
 258         /* Here 0 < k < m < 4.  */
 259         pstate[++k] = s[0];
 260         if (k < m)
 261           pstate[++k] = s[1];
 262         if (k != m)
 263           abort ();
 264       }
 265       pstate[0] = m;
 266       return (size_t)(-2);
 267
 268      invalid:
 269       errno = EILSEQ;
 270       /* The conversion state is undefined, says POSIX.  */
 271       return (size_t)(-1);
 272     }
 273   }
 274 }