1 /* Elementary Unicode string functions.
2 Copyright (C) 2002, 2005-2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU Library General Public License as published
6 by the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU Library General Public
15 License along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
37 All functions prefixed with u8_ operate on UTF-8 encoded strings.
38 Their unit is a uint8_t (1 byte).
40 All functions prefixed with u16_ operate on UTF-16 encoded strings.
41 Their unit is a uint16_t (a 2-byte word).
43 All functions prefixed with u32_ operate on UCS-4 encoded strings.
44 Their unit is a uint32_t (a 4-byte word).
46 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
49 All arguments starting with "str" and the arguments of functions starting
50 with u8_str/u16_str/u32_str denote a NUL terminated string, i.e. a string
51 which terminates at the first NUL unit. This termination unit is
52 considered part of the string for all memory allocation purposes, but
53 is not considered part of the string for all other logical purposes.
55 Functions returning a string result take a (resultbuf, lengthp) argument
56 pair. If resultbuf is not NULL and the result fits into *lengthp units,
57 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
58 allocated string is returned. In both cases, *lengthp is set to the
59 length (number of units) of the returned string. In case of error,
60 NULL is returned and errno is set. */
63 /* Elementary string checks. */
65 /* Check whether the UTF-8 string S[0..N-1] (N units, i.e. bytes) is well-formed.
66 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
67 extern const uint8_t *
68 u8_check (const uint8_t *s, size_t n);
70 /* Check whether the UTF-16 string S[0..N-1] (N 16-bit units) is well-formed.
71 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
72 extern const uint16_t *
73 u16_check (const uint16_t *s, size_t n);
75 /* Check whether the UCS-4 string S[0..N-1] (N 32-bit units) is well-formed.
76 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
77 extern const uint32_t *
78 u32_check (const uint32_t *s, size_t n);
81 /* Elementary string conversions. */
83 /* Convert a UTF-8 string to a UTF-16 string. */
85 u8_to_u16 (const uint8_t *s, size_t n, uint16_t *resultbuf,
88 /* Convert a UTF-8 string to a UCS-4 string. */
90 u8_to_u32 (const uint8_t *s, size_t n, uint32_t *resultbuf,
93 /* Convert a UTF-16 string to a UTF-8 string. */
95 u16_to_u8 (const uint16_t *s, size_t n, uint8_t *resultbuf,
98 /* Convert a UTF-16 string to a UCS-4 string. */
100 u16_to_u32 (const uint16_t *s, size_t n, uint32_t *resultbuf,
103 /* Convert a UCS-4 string to a UTF-8 string. */
105 u32_to_u8 (const uint32_t *s, size_t n, uint8_t *resultbuf,
108 /* Convert a UCS-4 string to a UTF-16 string. */
110 u32_to_u16 (const uint32_t *s, size_t n, uint16_t *resultbuf,
114 /* Elementary string functions. */
116 /* Return the length (number of units) of the first character in S, which is
117 no longer than N. Return 0 if it is the NUL character. Return -1 upon
119 /* Similar to mblen(), except that s must not be NULL. */
121 u8_mblen (const uint8_t *s, size_t n);
123 u16_mblen (const uint16_t *s, size_t n);
125 u32_mblen (const uint32_t *s, size_t n);
127 /* Return the length (number of units) of the first character in S, putting
128 its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd,
129 and an appropriate number of units is returned. */
130 /* Similar to mbtowc(), except that puc and s must not be NULL, and the NUL
131 character is not treated specially. */
132 /* The variants with _safe suffix are safe, even if the library is compiled
133 without --enable-safety. */
135 #ifdef GNULIB_UNISTR_U8_MBTOUC_UNSAFE
138 u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n);
140 # include "utf8-ucs4-unsafe.h"
144 #ifdef GNULIB_UNISTR_U16_MBTOUC_UNSAFE
147 u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n);
149 # include "utf16-ucs4-unsafe.h"
153 #ifdef GNULIB_UNISTR_U32_MBTOUC
156 u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n);
159 u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n)
163 # if CONFIG_UNICODE_SAFETY
164 if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
167 # if CONFIG_UNICODE_SAFETY
169 /* invalid multibyte character */
177 #ifdef GNULIB_UNISTR_U8_MBTOUC
180 u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n);
182 # include "utf8-ucs4.h"
186 #ifdef GNULIB_UNISTR_U16_MBTOUC_SAFE
189 u16_mbtouc_safe (ucs4_t *puc, const uint16_t *s, size_t n);
191 # include "utf16-ucs4-safe.h"
195 #ifdef GNULIB_UNISTR_U32_MBTOUC_SAFE
198 u32_mbtouc_safe (ucs4_t *puc, const uint32_t *s, size_t n);
201 u32_mbtouc_safe (ucs4_t *puc, const uint32_t *s, size_t n)
205 if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
208 /* invalid multibyte character */
215 /* Put the multibyte character represented by UC in S, returning its
216 length. Return -1 upon failure, -2 if the number of available units, N,
217 is too small. The latter case cannot occur if N >= 6, 2, or 1 for u8_uctomb, u16_uctomb, or u32_uctomb, respectively. */
218 /* Similar to wctomb(), except that s must not be NULL, and the argument n
219 must be specified. */
221 #ifdef GNULIB_UNISTR_U8_UCTOMB
224 u8_uctomb (uint8_t *s, ucs4_t uc, int n);
226 # include "ucs4-utf8.h"
230 #ifdef GNULIB_UNISTR_U16_UCTOMB
233 u16_uctomb (uint16_t *s, ucs4_t uc, int n);
235 # include "ucs4-utf16.h"
239 #ifdef GNULIB_UNISTR_U32_UCTOMB
242 u32_uctomb (uint32_t *s, ucs4_t uc, int n);
245 u32_uctomb (uint32_t *s, ucs4_t uc, int n)
247 if (uc < 0xd800 || (uc >= 0xe000 && uc < 0x110000))
263 /* Copy N units from SRC to DEST. */
264 /* Similar to memcpy(). */
266 u8_cpy (uint8_t *dest, const uint8_t *src, size_t n);
268 u16_cpy (uint16_t *dest, const uint16_t *src, size_t n);
270 u32_cpy (uint32_t *dest, const uint32_t *src, size_t n);
272 /* Copy N units from SRC to DEST, guaranteeing correct behavior for
273 overlapping memory areas. */
274 /* Similar to memmove(). */
276 u8_move (uint8_t *dest, const uint8_t *src, size_t n);
278 u16_move (uint16_t *dest, const uint16_t *src, size_t n);
280 u32_move (uint32_t *dest, const uint32_t *src, size_t n);
282 /* Set the first N characters of S to UC. UC should be a character that
283 occupies only 1 unit. */
284 /* Similar to memset(). */
286 u8_set (uint8_t *s, ucs4_t uc, size_t n);
288 u16_set (uint16_t *s, ucs4_t uc, size_t n);
290 u32_set (uint32_t *s, ucs4_t uc, size_t n);
292 /* Compare S1 and S2, each of length N. */
293 /* Similar to memcmp(). */
295 u8_cmp (const uint8_t *s1, const uint8_t *s2, size_t n);
297 u16_cmp (const uint16_t *s1, const uint16_t *s2, size_t n);
299 u32_cmp (const uint32_t *s1, const uint32_t *s2, size_t n);
301 /* Search the string at S for UC. */
302 /* Similar to memchr(). */
304 u8_chr (const uint8_t *s, size_t n, ucs4_t uc);
306 u16_chr (const uint16_t *s, size_t n, ucs4_t uc);
308 u32_chr (const uint32_t *s, size_t n, ucs4_t uc);
310 /* Elementary string functions with memory allocation. */
312 /* Make a freshly allocated copy of S, of length N. */
314 u8_cpy_alloc (const uint8_t *s, size_t n);
316 u16_cpy_alloc (const uint16_t *s, size_t n);
318 u32_cpy_alloc (const uint32_t *s, size_t n);
320 /* Elementary string functions on NUL terminated strings. */
322 /* Return the length (number of units) of the first character in S.
323 Return 0 if it is the NUL character. Return -1 upon failure. */
325 u8_strmblen (const uint8_t *s);
327 u16_strmblen (const uint16_t *s);
329 u32_strmblen (const uint32_t *s);
331 /* Return the length (number of units) of the first character in S, putting
332 its 'ucs4_t' representation in *PUC. Return 0 if it is the NUL
333 character. Return -1 upon failure. */
335 u8_strmbtouc (ucs4_t *puc, const uint8_t *s);
337 u16_strmbtouc (ucs4_t *puc, const uint16_t *s);
339 u32_strmbtouc (ucs4_t *puc, const uint32_t *s);
341 /* Forward iteration step over the NUL terminated string S. Returns a pointer
342 just past the next character, or NULL if the end of the string has been reached. Puts the
343 character's 'ucs4_t' representation in *PUC. S itself is passed by value; the caller continues from the returned pointer. */
344 extern const uint8_t *
345 u8_next (ucs4_t *puc, const uint8_t *s);
346 extern const uint16_t *
347 u16_next (ucs4_t *puc, const uint16_t *s);
348 extern const uint32_t *
349 u32_next (ucs4_t *puc, const uint32_t *s);
351 /* Backward iteration step. Returns a pointer to the character preceding S,
352 or NULL if the beginning of the string has been reached (presumably START marks that beginning -- confirm against the implementation).
353 Puts the character's 'ucs4_t' representation in *PUC. */
354 extern const uint8_t *
355 u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start);
356 extern const uint16_t *
357 u16_prev (ucs4_t *puc, const uint16_t *s, const uint16_t *start);
358 extern const uint32_t *
359 u32_prev (ucs4_t *puc, const uint32_t *s, const uint32_t *start);
361 /* Return the number of units in S. */
362 /* Similar to strlen(), wcslen(). */
364 u8_strlen (const uint8_t *s);
366 u16_strlen (const uint16_t *s);
368 u32_strlen (const uint32_t *s);
370 /* Return the number of units in S, but at most MAXLEN. */
371 /* Similar to strnlen(), wcsnlen(). */
373 u8_strnlen (const uint8_t *s, size_t maxlen);
375 u16_strnlen (const uint16_t *s, size_t maxlen);
377 u32_strnlen (const uint32_t *s, size_t maxlen);
379 /* Copy SRC to DEST. */
380 /* Similar to strcpy(), wcscpy(). */
382 u8_strcpy (uint8_t *dest, const uint8_t *src);
384 u16_strcpy (uint16_t *dest, const uint16_t *src);
386 u32_strcpy (uint32_t *dest, const uint32_t *src);
388 /* Copy SRC to DEST, returning the address of the terminating NUL in DEST. */
389 /* Similar to stpcpy(). */
391 u8_stpcpy (uint8_t *dest, const uint8_t *src);
393 u16_stpcpy (uint16_t *dest, const uint16_t *src);
395 u32_stpcpy (uint32_t *dest, const uint32_t *src);
397 /* Copy no more than N units of SRC to DEST. */
398 /* Similar to strncpy(), wcsncpy(). */
400 u8_strncpy (uint8_t *dest, const uint8_t *src, size_t n);
402 u16_strncpy (uint16_t *dest, const uint16_t *src, size_t n);
404 u32_strncpy (uint32_t *dest, const uint32_t *src, size_t n);
406 /* Copy no more than N units of SRC to DEST, returning a pointer past the
407 last non-NUL unit written into DEST. */
408 /* Similar to stpncpy(). */
410 u8_stpncpy (uint8_t *dest, const uint8_t *src, size_t n);
412 u16_stpncpy (uint16_t *dest, const uint16_t *src, size_t n);
414 u32_stpncpy (uint32_t *dest, const uint32_t *src, size_t n);
416 /* Append SRC onto DEST. */
417 /* Similar to strcat(), wcscat(). */
419 u8_strcat (uint8_t *dest, const uint8_t *src);
421 u16_strcat (uint16_t *dest, const uint16_t *src);
423 u32_strcat (uint32_t *dest, const uint32_t *src);
425 /* Append no more than N units of SRC onto DEST. */
426 /* Similar to strncat(), wcsncat(). */
428 u8_strncat (uint8_t *dest, const uint8_t *src, size_t n);
430 u16_strncat (uint16_t *dest, const uint16_t *src, size_t n);
432 u32_strncat (uint32_t *dest, const uint32_t *src, size_t n);
434 /* Compare S1 and S2. */
435 /* Similar to strcmp(), wcscmp(). */
437 u8_strcmp (const uint8_t *s1, const uint8_t *s2);
439 u16_strcmp (const uint16_t *s1, const uint16_t *s2);
441 u32_strcmp (const uint32_t *s1, const uint32_t *s2);
443 /* Compare no more than N units of S1 and S2. */
444 /* Similar to strncmp(), wcsncmp(). */
446 u8_strncmp (const uint8_t *s1, const uint8_t *s2, size_t n);
448 u16_strncmp (const uint16_t *s1, const uint16_t *s2, size_t n);
450 u32_strncmp (const uint32_t *s1, const uint32_t *s2, size_t n);
452 /* Duplicate S, returning an identical malloc'd string. */
453 /* Similar to strdup(), wcsdup(). */
455 u8_strdup (const uint8_t *s);
457 u16_strdup (const uint16_t *s);
459 u32_strdup (const uint32_t *s);
461 /* Find the first occurrence of UC in STR. */
462 /* Similar to strchr(), wcschr(). */
464 u8_strchr (const uint8_t *str, ucs4_t uc);
466 u16_strchr (const uint16_t *str, ucs4_t uc);
468 u32_strchr (const uint32_t *str, ucs4_t uc);
470 /* Find the last occurrence of UC in STR. */
471 /* Similar to strrchr(), wcsrchr(). */
473 u8_strrchr (const uint8_t *str, ucs4_t uc);
475 u16_strrchr (const uint16_t *str, ucs4_t uc);
477 u32_strrchr (const uint32_t *str, ucs4_t uc);
479 /* Return the length of the initial segment of STR which consists entirely
480 of Unicode characters not in REJECT. */
481 /* Similar to strcspn(), wcscspn(). */
483 u8_strcspn (const uint8_t *str, const uint8_t *reject);
485 u16_strcspn (const uint16_t *str, const uint16_t *reject);
487 u32_strcspn (const uint32_t *str, const uint32_t *reject);
489 /* Return the length of the initial segment of STR which consists entirely
490 of Unicode characters in ACCEPT. */
491 /* Similar to strspn(), wcsspn(). */
493 u8_strspn (const uint8_t *str, const uint8_t *accept);
495 u16_strspn (const uint16_t *str, const uint16_t *accept);
497 u32_strspn (const uint32_t *str, const uint32_t *accept);
499 /* Find the first occurrence in STR of any character in ACCEPT. */
500 /* Similar to strpbrk(), wcspbrk(). */
502 u8_strpbrk (const uint8_t *str, const uint8_t *accept);
504 u16_strpbrk (const uint16_t *str, const uint16_t *accept);
506 u32_strpbrk (const uint32_t *str, const uint32_t *accept);
508 /* Find the first occurrence of NEEDLE in HAYSTACK. */
509 /* Similar to strstr(), wcsstr(). */
511 u8_strstr (const uint8_t *haystack, const uint8_t *needle);
513 u16_strstr (const uint16_t *haystack, const uint16_t *needle);
515 u32_strstr (const uint32_t *haystack, const uint32_t *needle);
517 /* Test whether STR starts with PREFIX. */
519 u8_startswith (const uint8_t *str, const uint8_t *prefix);
521 u16_startswith (const uint16_t *str, const uint16_t *prefix);
523 u32_startswith (const uint32_t *str, const uint32_t *prefix);
525 /* Test whether STR ends with SUFFIX. */
527 u8_endswith (const uint8_t *str, const uint8_t *suffix);
529 u16_endswith (const uint16_t *str, const uint16_t *suffix);
531 u32_endswith (const uint32_t *str, const uint32_t *suffix);
533 /* Divide STR into tokens separated by characters in DELIM.
534 This interface is actually more similar to wcstok than to strtok. */
535 /* Similar to strtok_r(), wcstok(). */
537 u8_strtok (uint8_t *str, const uint8_t *delim, uint8_t **ptr);
539 u16_strtok (uint16_t *str, const uint16_t *delim, uint16_t **ptr);
541 u32_strtok (uint32_t *str, const uint32_t *delim, uint32_t **ptr);
548 #endif /* _UNISTR_H */