lib/unicase.h

   1 /* Unicode character case mappings.
   2    Copyright (C) 2002, 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU Lesser General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 #ifndef _UNICASE_H
  18 #define _UNICASE_H
  19
  20 #include "unitypes.h"
  21
  22 /* Get bool.  */
  23 #include <stdbool.h>
  24
  25 /* Get size_t.  */
  26 #include <stddef.h>
  27
  28 /* Get uninorm_t.  */
  29 #include "uninorm.h"
  30
  31 #ifdef __cplusplus
  32 extern "C" {
  33 #endif
  34
  35 /* ========================================================================= */
  36
  37 /* Character case mappings.
  38    These mappings are locale and context independent.
  39    WARNING! These functions are not sufficient for languages such as German.
  40    Better use the functions below that treat an entire string at once and are
  41    language aware.  */
  42
  43 /* Return the uppercase mapping of a Unicode character.  */
  44 extern ucs4_t
  45        uc_toupper (ucs4_t uc);
  46
  47 /* Return the lowercase mapping of a Unicode character.  */
  48 extern ucs4_t
  49        uc_tolower (ucs4_t uc);
  50
  51 /* Return the titlecase mapping of a Unicode character.  */
  52 extern ucs4_t
  53        uc_totitle (ucs4_t uc);
  54
  55 /* ========================================================================= */
  56
  57 /* String case mappings.  */
  58
  59 /* These functions are locale dependent.  The iso639_language argument
  60    identifies the language (e.g. "tr" for Turkish).  NULL means to use
  61    locale independent case mappings.  */
  62
  63 /* Return the ISO 639 language code of the current locale.
  64    Return "" if it is unknown, or in the "C" locale.  */
  65 extern const char *
  66        uc_locale_language (void);
  67
  68 /* Conventions:
  69
  70    All functions prefixed with u8_ operate on UTF-8 encoded strings.
  71    Their unit is an uint8_t (1 byte).
  72
  73    All functions prefixed with u16_ operate on UTF-16 encoded strings.
  74    Their unit is an uint16_t (a 2-byte word).
  75
  76    All functions prefixed with u32_ operate on UCS-4 encoded strings.
  77    Their unit is an uint32_t (a 4-byte word).
  78
  79    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
  80    n units.
  81
  82    Functions returning a string result take a (resultbuf, lengthp) argument
  83    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
  84    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
  85    allocated string is returned.  In both cases, *lengthp is set to the
  86    length (number of units) of the returned string.  In case of error,
  87    NULL is returned and errno is set.  */
  88
  89 /* Return the uppercase mapping of a string.
  90    The nf argument identifies the normalization form to apply after the
  91    case-mapping.  It can also be NULL, for no normalization.  */
  92 extern uint8_t *
  93        u8_toupper (const uint8_t *s, size_t n, const char *iso639_language,
  94                    uninorm_t nf,
  95                    uint8_t *resultbuf, size_t *lengthp);
  96 extern uint16_t *
  97        u16_toupper (const uint16_t *s, size_t n, const char *iso639_language,
  98                     uninorm_t nf,
  99                     uint16_t *resultbuf, size_t *lengthp);
 100 extern uint32_t *
 101        u32_toupper (const uint32_t *s, size_t n, const char *iso639_language,
 102                     uninorm_t nf,
 103                     uint32_t *resultbuf, size_t *lengthp);
 104
 105 /* Return the lowercase mapping of a string.
 106    The nf argument identifies the normalization form to apply after the
 107    case-mapping.  It can also be NULL, for no normalization.  */
 108 extern uint8_t *
 109        u8_tolower (const uint8_t *s, size_t n, const char *iso639_language,
 110                    uninorm_t nf,
 111                    uint8_t *resultbuf, size_t *lengthp);
 112 extern uint16_t *
 113        u16_tolower (const uint16_t *s, size_t n, const char *iso639_language,
 114                     uninorm_t nf,
 115                     uint16_t *resultbuf, size_t *lengthp);
 116 extern uint32_t *
 117        u32_tolower (const uint32_t *s, size_t n, const char *iso639_language,
 118                     uninorm_t nf,
 119                     uint32_t *resultbuf, size_t *lengthp);
 120
 121 /* Return the titlecase mapping of a string.
 122    The nf argument identifies the normalization form to apply after the
 123    case-mapping.  It can also be NULL, for no normalization.  */
 124 extern uint8_t *
 125        u8_totitle (const uint8_t *s, size_t n, const char *iso639_language,
 126                    uninorm_t nf,
 127                    uint8_t *resultbuf, size_t *lengthp);
 128 extern uint16_t *
 129        u16_totitle (const uint16_t *s, size_t n, const char *iso639_language,
 130                     uninorm_t nf,
 131                     uint16_t *resultbuf, size_t *lengthp);
 132 extern uint32_t *
 133        u32_totitle (const uint32_t *s, size_t n, const char *iso639_language,
 134                     uninorm_t nf,
 135                     uint32_t *resultbuf, size_t *lengthp);
 136
 137 /* Return the case folded string.
 138    Comparing uN_casefold (S1) and uN_casefold (S2) with uN_cmp2() is equivalent
 139    to comparing S1 and S2 with uN_casecmp().
 140    The nf argument identifies the normalization form to apply after the
 141    case-mapping.  It can also be NULL, for no normalization.  */
 142 extern uint8_t *
 143        u8_casefold (const uint8_t *s, size_t n, const char *iso639_language,
 144                     uninorm_t nf,
 145                     uint8_t *resultbuf, size_t *lengthp);
 146 extern uint16_t *
 147        u16_casefold (const uint16_t *s, size_t n, const char *iso639_language,
 148                      uninorm_t nf,
 149                      uint16_t *resultbuf, size_t *lengthp);
 150 extern uint32_t *
 151        u32_casefold (const uint32_t *s, size_t n, const char *iso639_language,
 152                      uninorm_t nf,
 153                      uint32_t *resultbuf, size_t *lengthp);
 154
 155 /* Compare S1 and S2, ignoring differences in case and normalization.
 156    The nf argument identifies the normalization form to apply after the
 157    case-mapping.  It can also be NULL, for no normalization.
 158    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 159    return 0.  Upon failure, return -1 with errno set.  */
 160 extern int
 161        u8_casecmp (const uint8_t *s1, size_t n1,
 162                    const uint8_t *s2, size_t n2,
 163                    const char *iso639_language, uninorm_t nf, int *resultp);
 164 extern int
 165        u16_casecmp (const uint16_t *s1, size_t n1,
 166                     const uint16_t *s2, size_t n2,
 167                     const char *iso639_language, uninorm_t nf, int *resultp);
 168 extern int
 169        u32_casecmp (const uint32_t *s1, size_t n1,
 170                     const uint32_t *s2, size_t n2,
 171                     const char *iso639_language, uninorm_t nf, int *resultp);
 172 extern int
 173        ulc_casecmp (const char *s1, size_t n1,
 174                     const char *s2, size_t n2,
 175                     const char *iso639_language, uninorm_t nf, int *resultp);
 176
 177 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
 178    a way that comparing uN_casexfrm (S1) and uN_casexfrm (S2) with the gnulib
 179    function memcmp2() is equivalent to comparing S1 and S2 with uN_casecoll().
 180    NF must be either UNINORM_NFC, UNINORM_NFKC, or NULL for no normalization.  */
 181 extern char *
 182        u8_casexfrm (const uint8_t *s, size_t n, const char *iso639_language,
 183                     uninorm_t nf, char *resultbuf, size_t *lengthp);
 184 extern char *
 185        u16_casexfrm (const uint16_t *s, size_t n, const char *iso639_language,
 186                      uninorm_t nf, char *resultbuf, size_t *lengthp);
 187 extern char *
 188        u32_casexfrm (const uint32_t *s, size_t n, const char *iso639_language,
 189                      uninorm_t nf, char *resultbuf, size_t *lengthp);
 190 extern char *
 191        ulc_casexfrm (const char *s, size_t n, const char *iso639_language,
 192                      uninorm_t nf, char *resultbuf, size_t *lengthp);
 193
 194 /* Compare S1 and S2, ignoring differences in case and normalization, using the
 195    collation rules of the current locale.
 196    The nf argument identifies the normalization form to apply after the
 197    case-mapping.  It must be either UNINORM_NFC or UNINORM_NFKC.  It can also
 198    be NULL, for no normalization.
 199    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 200    return 0.  Upon failure, return -1 with errno set.  */
 201 extern int
 202        u8_casecoll (const uint8_t *s1, size_t n1,
 203                     const uint8_t *s2, size_t n2,
 204                     const char *iso639_language, uninorm_t nf, int *resultp);
 205 extern int
 206        u16_casecoll (const uint16_t *s1, size_t n1,
 207                      const uint16_t *s2, size_t n2,
 208                      const char *iso639_language, uninorm_t nf, int *resultp);
 209 extern int
 210        u32_casecoll (const uint32_t *s1, size_t n1,
 211                      const uint32_t *s2, size_t n2,
 212                      const char *iso639_language, uninorm_t nf, int *resultp);
 213 extern int
 214        ulc_casecoll (const char *s1, size_t n1,
 215                      const char *s2, size_t n2,
 216                      const char *iso639_language, uninorm_t nf, int *resultp);
 217
 218
 219 /* Set *RESULTP to true if mapping NFD(S) to upper case is a no-op, or to false
 220    otherwise, and return 0.  Upon failure, return -1 with errno set.  */
 221 extern int
 222        u8_is_uppercase (const uint8_t *s, size_t n,
 223                         const char *iso639_language,
 224                         bool *resultp);
 225 extern int
 226        u16_is_uppercase (const uint16_t *s, size_t n,
 227                          const char *iso639_language,
 228                          bool *resultp);
 229 extern int
 230        u32_is_uppercase (const uint32_t *s, size_t n,
 231                          const char *iso639_language,
 232                          bool *resultp);
 233
 234 /* Set *RESULTP to true if mapping NFD(S) to lower case is a no-op, or to false
 235    otherwise, and return 0.  Upon failure, return -1 with errno set.  */
 236 extern int
 237        u8_is_lowercase (const uint8_t *s, size_t n,
 238                         const char *iso639_language,
 239                         bool *resultp);
 240 extern int
 241        u16_is_lowercase (const uint16_t *s, size_t n,
 242                          const char *iso639_language,
 243                          bool *resultp);
 244 extern int
 245        u32_is_lowercase (const uint32_t *s, size_t n,
 246                          const char *iso639_language,
 247                          bool *resultp);
 248
 249 /* Set *RESULTP to true if mapping NFD(S) to title case is a no-op, or to false
 250    otherwise, and return 0.  Upon failure, return -1 with errno set.  */
 251 extern int
 252        u8_is_titlecase (const uint8_t *s, size_t n,
 253                         const char *iso639_language,
 254                         bool *resultp);
 255 extern int
 256        u16_is_titlecase (const uint16_t *s, size_t n,
 257                          const char *iso639_language,
 258                          bool *resultp);
 259 extern int
 260        u32_is_titlecase (const uint32_t *s, size_t n,
 261                          const char *iso639_language,
 262                          bool *resultp);
 263
 264 /* Set *RESULTP to true if applying case folding to NFD(S) is a no-op, or to
 265    false otherwise, and return 0.  Upon failure, return -1 with errno set.  */
 266 extern int
 267        u8_is_casefolded (const uint8_t *s, size_t n,
 268                          const char *iso639_language,
 269                          bool *resultp);
 270 extern int
 271        u16_is_casefolded (const uint16_t *s, size_t n,
 272                           const char *iso639_language,
 273                           bool *resultp);
 274 extern int
 275        u32_is_casefolded (const uint32_t *s, size_t n,
 276                           const char *iso639_language,
 277                           bool *resultp);
 278
 279 /* Set *RESULTP to true if case matters for S, that is, if mapping NFD(S) to
 280    either upper case or lower case or title case is not a no-op.
 281    Set *RESULTP to false if NFD(S) maps to itself under the upper case mapping,
 282    under the lower case mapping, and under the title case mapping; in other
 283    words, when NFD(S) consists entirely of caseless characters.
 284    Upon failure, return -1 with errno set.  */
 285 extern int
 286        u8_is_cased (const uint8_t *s, size_t n,
 287                     const char *iso639_language,
 288                     bool *resultp);
 289 extern int
 290        u16_is_cased (const uint16_t *s, size_t n,
 291                      const char *iso639_language,
 292                      bool *resultp);
 293 extern int
 294        u32_is_cased (const uint32_t *s, size_t n,
 295                      const char *iso639_language,
 296                      bool *resultp);
 297
 298
 299 /* ========================================================================= */
 300
 301 #ifdef __cplusplus
 302 }
 303 #endif
 304
 305 #endif /* _UNICASE_H */