d286c27e197687c71edeaa5479699dc41c498443
[gnulib.git] / lib / unistr / u8-mbtouc.c
1 /* Look at first character in UTF-8 string.
2    Copyright (C) 1999-2002, 2006-2007, 2009-2012 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU Lesser General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 #include <config.h>
19
20 #if defined IN_LIBUNISTRING
21 /* Tell unistr.h to declare u8_mbtouc as 'extern', not 'static inline'.  */
22 # include "unistring-notinline.h"
23 #endif
24
25 /* Specification.  */
26 #include "unistr.h"
27
28 #if !HAVE_INLINE
29
/* Decode the first character of the UTF-8 string S, of which N bytes are
   available, store it in *PUC, and return the number of bytes consumed.

   S[0] is read regardless of N.

   On success the return value is the length of the sequence (1 to 4) and
   *PUC is the decoded character.  On failure *PUC is set to U+FFFD and the
   return value says how many bytes to skip:
     - a byte that cannot start a sequence (0x80..0xC1, 0xF8..0xFF): 1;
     - a sequence cut short by N: the lead byte plus the continuation
       bytes that are present;
     - a sequence with a non-continuation byte in it: the bytes before
       the offending one;
     - a sequence of the right shape that encodes an overlong form, a
       surrogate, or a value above 0x10FFFF: the whole sequence.  */
int
u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
{
  const uint8_t lead = s[0];
  size_t len;     /* sequence length announced by the lead byte */
  size_t k;       /* bytes of the sequence accepted so far */
  size_t i;
  ucs4_t uc;      /* payload bits gathered so far */
  ucs4_t lowest;  /* smallest value that needs LEN bytes */

  /* Plain ASCII.  */
  if (lead < 0x80)
    {
      *puc = lead;
      return 1;
    }

  /* 0x80..0xBF are stray continuation bytes, 0xC0 and 0xC1 could only
     start overlong forms, and 0xF8 and above would start the 5- and
     6-byte forms, which are not accepted.  */
  if (lead < 0xc2 || lead >= 0xf8)
    {
      *puc = 0xfffd;
      return 1;
    }

  if (lead < 0xe0)
    {
      len = 2;
      uc = lead & 0x1f;
      lowest = 0x80;
    }
  else if (lead < 0xf0)
    {
      len = 3;
      uc = lead & 0x0f;
      lowest = 0x800;
    }
  else
    {
      len = 4;
      uc = lead & 0x07;
      lowest = 0x10000;
    }

  if (n < len)
    {
      /* Incomplete multibyte character.  Store the replacement first,
         then count the continuation bytes that are present; never look
         at the byte at index N when N is 1 or more.  */
      *puc = 0xfffd;
      for (k = 1; k < len - 1 && k != n && (s[k] & 0xc0) == 0x80; k++)
        continue;
      return (int) k;
    }

  /* All LEN bytes are available: stop at the first byte that is not of
     the form 10xxxxxx.  */
  for (k = 1; k < len && (s[k] & 0xc0) == 0x80; k++)
    continue;

  if (k == len)
    {
      for (i = 1; i < len; i++)
        uc = (uc << 6) | (ucs4_t) (s[i] & 0x3f);

      /* Reject overlong forms, surrogates, and values beyond the last
         code point.  */
      if (uc >= lowest
          && uc < 0x110000
          && !(uc >= 0xd800 && uc < 0xe000))
        {
          *puc = uc;
          return (int) len;
        }
    }

  /* Invalid multibyte character.  */
  *puc = 0xfffd;
  return (int) k;
}
249
250 #endif