1 /* Unicode character classification and properties.
2 Copyright (C) 2002, 2005-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
32 /* ========================================================================= */
34 /* Field 1 of Unicode Character Database: Character name. See "uniname.h". */
37 /* ========================================================================= */
39 /* Field 2 of Unicode Character Database: General category. */
41 /* Data type denoting a General category value. This is not just a bitmask,
42 but rather a bitmask and a pointer to the lookup table, so that programs
43 that use only the predefined bitmasks (i.e. don't combine bitmasks with &
44 and |) don't have a link-time dependency towards the big general table. */
47 uint32_t bitmask : 31;
48 /*bool*/ unsigned int generic : 1;
51 const void *table; /* when generic is 0 */
52 bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
55 uc_general_category_t;
57 /* Bits and bit masks denoting General category values. UnicodeData-3.2.0.html
58 says a 32-bit integer will always suffice to represent them.
59 These bit masks can only be used with the uc_is_general_category_withtable function. */
63 UC_CATEGORY_MASK_L = 0x0000001f,
64 UC_CATEGORY_MASK_LC = 0x00000007,
65 UC_CATEGORY_MASK_Lu = 0x00000001,
66 UC_CATEGORY_MASK_Ll = 0x00000002,
67 UC_CATEGORY_MASK_Lt = 0x00000004,
68 UC_CATEGORY_MASK_Lm = 0x00000008,
69 UC_CATEGORY_MASK_Lo = 0x00000010,
70 UC_CATEGORY_MASK_M = 0x000000e0,
71 UC_CATEGORY_MASK_Mn = 0x00000020,
72 UC_CATEGORY_MASK_Mc = 0x00000040,
73 UC_CATEGORY_MASK_Me = 0x00000080,
74 UC_CATEGORY_MASK_N = 0x00000700,
75 UC_CATEGORY_MASK_Nd = 0x00000100,
76 UC_CATEGORY_MASK_Nl = 0x00000200,
77 UC_CATEGORY_MASK_No = 0x00000400,
78 UC_CATEGORY_MASK_P = 0x0003f800,
79 UC_CATEGORY_MASK_Pc = 0x00000800,
80 UC_CATEGORY_MASK_Pd = 0x00001000,
81 UC_CATEGORY_MASK_Ps = 0x00002000,
82 UC_CATEGORY_MASK_Pe = 0x00004000,
83 UC_CATEGORY_MASK_Pi = 0x00008000,
84 UC_CATEGORY_MASK_Pf = 0x00010000,
85 UC_CATEGORY_MASK_Po = 0x00020000,
86 UC_CATEGORY_MASK_S = 0x003c0000,
87 UC_CATEGORY_MASK_Sm = 0x00040000,
88 UC_CATEGORY_MASK_Sc = 0x00080000,
89 UC_CATEGORY_MASK_Sk = 0x00100000,
90 UC_CATEGORY_MASK_So = 0x00200000,
91 UC_CATEGORY_MASK_Z = 0x01c00000,
92 UC_CATEGORY_MASK_Zs = 0x00400000,
93 UC_CATEGORY_MASK_Zl = 0x00800000,
94 UC_CATEGORY_MASK_Zp = 0x01000000,
95 UC_CATEGORY_MASK_C = 0x3e000000,
96 UC_CATEGORY_MASK_Cc = 0x02000000,
97 UC_CATEGORY_MASK_Cf = 0x04000000,
98 UC_CATEGORY_MASK_Cs = 0x08000000,
99 UC_CATEGORY_MASK_Co = 0x10000000,
100 UC_CATEGORY_MASK_Cn = 0x20000000
103 /* Predefined General category values.
   One constant is declared for every UC_CATEGORY_MASK_* name: the two-letter
   subcategories (Lu, Ll, ...), the one-letter major classes (L, M, N, P, S,
   Z, C) and the cased-letter grouping LC.  They are only declared here; the
   definitions live outside this header.
   NOTE(review): each constant presumably carries the bit mask of the same
   name in its 'bitmask' field -- confirm against the definitions.  */
104 extern const uc_general_category_t UC_CATEGORY_L;
105 extern const uc_general_category_t UC_CATEGORY_LC;
106 extern const uc_general_category_t UC_CATEGORY_Lu;
107 extern const uc_general_category_t UC_CATEGORY_Ll;
108 extern const uc_general_category_t UC_CATEGORY_Lt;
109 extern const uc_general_category_t UC_CATEGORY_Lm;
110 extern const uc_general_category_t UC_CATEGORY_Lo;
111 extern const uc_general_category_t UC_CATEGORY_M;
112 extern const uc_general_category_t UC_CATEGORY_Mn;
113 extern const uc_general_category_t UC_CATEGORY_Mc;
114 extern const uc_general_category_t UC_CATEGORY_Me;
115 extern const uc_general_category_t UC_CATEGORY_N;
116 extern const uc_general_category_t UC_CATEGORY_Nd;
117 extern const uc_general_category_t UC_CATEGORY_Nl;
118 extern const uc_general_category_t UC_CATEGORY_No;
119 extern const uc_general_category_t UC_CATEGORY_P;
120 extern const uc_general_category_t UC_CATEGORY_Pc;
121 extern const uc_general_category_t UC_CATEGORY_Pd;
122 extern const uc_general_category_t UC_CATEGORY_Ps;
123 extern const uc_general_category_t UC_CATEGORY_Pe;
124 extern const uc_general_category_t UC_CATEGORY_Pi;
125 extern const uc_general_category_t UC_CATEGORY_Pf;
126 extern const uc_general_category_t UC_CATEGORY_Po;
127 extern const uc_general_category_t UC_CATEGORY_S;
128 extern const uc_general_category_t UC_CATEGORY_Sm;
129 extern const uc_general_category_t UC_CATEGORY_Sc;
130 extern const uc_general_category_t UC_CATEGORY_Sk;
131 extern const uc_general_category_t UC_CATEGORY_So;
132 extern const uc_general_category_t UC_CATEGORY_Z;
133 extern const uc_general_category_t UC_CATEGORY_Zs;
134 extern const uc_general_category_t UC_CATEGORY_Zl;
135 extern const uc_general_category_t UC_CATEGORY_Zp;
136 extern const uc_general_category_t UC_CATEGORY_C;
137 extern const uc_general_category_t UC_CATEGORY_Cc;
138 extern const uc_general_category_t UC_CATEGORY_Cf;
139 extern const uc_general_category_t UC_CATEGORY_Cs;
140 extern const uc_general_category_t UC_CATEGORY_Co;
141 extern const uc_general_category_t UC_CATEGORY_Cn;
/* NOTE(review): the leading underscore suggests a library-internal value,
   presumably the "no category" result -- confirm before relying on it.  */
143 extern const uc_general_category_t _UC_CATEGORY_NONE;
145 /* Alias names for predefined General category values.
   Each macro is a plain textual alias for the UC_CATEGORY_* constant with
   the corresponding abbreviated name, so it can be used wherever that
   constant can.  */
/* Letters.  */
146 #define UC_LETTER UC_CATEGORY_L
147 #define UC_CASED_LETTER UC_CATEGORY_LC
148 #define UC_UPPERCASE_LETTER UC_CATEGORY_Lu
149 #define UC_LOWERCASE_LETTER UC_CATEGORY_Ll
150 #define UC_TITLECASE_LETTER UC_CATEGORY_Lt
151 #define UC_MODIFIER_LETTER UC_CATEGORY_Lm
152 #define UC_OTHER_LETTER UC_CATEGORY_Lo
/* Marks.  */
153 #define UC_MARK UC_CATEGORY_M
154 #define UC_NON_SPACING_MARK UC_CATEGORY_Mn
155 #define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc
156 #define UC_ENCLOSING_MARK UC_CATEGORY_Me
/* Numbers.  */
157 #define UC_NUMBER UC_CATEGORY_N
158 #define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd
159 #define UC_LETTER_NUMBER UC_CATEGORY_Nl
160 #define UC_OTHER_NUMBER UC_CATEGORY_No
/* Punctuation.  */
161 #define UC_PUNCTUATION UC_CATEGORY_P
162 #define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc
163 #define UC_DASH_PUNCTUATION UC_CATEGORY_Pd
164 #define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
165 #define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
166 #define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
167 #define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf
168 #define UC_OTHER_PUNCTUATION UC_CATEGORY_Po
/* Symbols.  */
169 #define UC_SYMBOL UC_CATEGORY_S
170 #define UC_MATH_SYMBOL UC_CATEGORY_Sm
171 #define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc
172 #define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk
173 #define UC_OTHER_SYMBOL UC_CATEGORY_So
/* Separators.  */
174 #define UC_SEPARATOR UC_CATEGORY_Z
175 #define UC_SPACE_SEPARATOR UC_CATEGORY_Zs
176 #define UC_LINE_SEPARATOR UC_CATEGORY_Zl
177 #define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp
/* Other.  */
178 #define UC_OTHER UC_CATEGORY_C
179 #define UC_CONTROL UC_CATEGORY_Cc
180 #define UC_FORMAT UC_CATEGORY_Cf
181 #define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */
182 #define UC_PRIVATE_USE UC_CATEGORY_Co
183 #define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */
185 /* Return the union of two general categories.
186 This corresponds to the unions of the two sets of characters.
   CATEGORY1 and CATEGORY2 are uc_general_category_t values passed by value;
   the combined category is likewise returned by value.  */
187 extern uc_general_category_t
188 uc_general_category_or (uc_general_category_t category1,
189 uc_general_category_t category2);
191 /* Return the intersection of two general categories as bit masks.
192 This *does*not* correspond to the intersection of the two sets of characters. */
194 extern uc_general_category_t
195 uc_general_category_and (uc_general_category_t category1,
196 uc_general_category_t category2);
198 /* Return the intersection of a general category with the complement of a
199 second general category, as bit masks.
200 This *does*not* correspond to the intersection with complement, when
201 viewing the categories as sets of characters.
   CATEGORY1 is the category to start from and CATEGORY2 the one whose bits
   are excluded; both are passed by value and the result is returned by
   value.  */
202 extern uc_general_category_t
203 uc_general_category_and_not (uc_general_category_t category1,
204 uc_general_category_t category2);
206 /* Return the name of a general category. */
208 uc_general_category_name (uc_general_category_t category);
210 /* Return the long name of a general category. */
212 uc_general_category_long_name (uc_general_category_t category);
214 /* Return the general category given by name, e.g. "Lu", or by long name,
215 e.g. "Uppercase Letter".
   CATEGORY_NAME is a C string that is not modified through this pointer.
   NOTE(review): the value returned for an unrecognized name is not visible
   in this header -- presumably _UC_CATEGORY_NONE; confirm.  */
216 extern uc_general_category_t
217 uc_general_category_byname (const char *category_name);
219 /* Return the general category of a Unicode character.
   UC is the character to classify; the result is a uc_general_category_t
   returned by value.
   NOTE(review): the result for values of UC outside the Unicode range is
   not visible in this header -- confirm before relying on it.  */
220 extern uc_general_category_t
221 uc_general_category (ucs4_t uc);
223 /* Test whether a Unicode character belongs to a given category.
224 The CATEGORY argument can be the combination of several predefined
225 general categories. */
227 uc_is_general_category (ucs4_t uc, uc_general_category_t category);
228 /* Likewise. This function uses a big table comprising all categories. */
230 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask);
232 /* ========================================================================= */
234 /* Field 3 of Unicode Character Database: Canonical combining class. */
236 /* The possible results of uc_combining_class (0..255) are described in
237 UCD.html. The list here is not definitive; more values can be added
238 in future versions. */
241 UC_CCC_NR = 0, /* Not Reordered */
242 UC_CCC_OV = 1, /* Overlay */
243 UC_CCC_NK = 7, /* Nukta */
244 UC_CCC_KV = 8, /* Kana Voicing */
245 UC_CCC_VR = 9, /* Virama */
246 UC_CCC_ATBL = 200, /* Attached Below Left */
247 UC_CCC_ATB = 202, /* Attached Below */
248 UC_CCC_ATA = 214, /* Attached Above */
249 UC_CCC_ATAR = 216, /* Attached Above Right */
250 UC_CCC_BL = 218, /* Below Left */
251 UC_CCC_B = 220, /* Below */
252 UC_CCC_BR = 222, /* Below Right */
253 UC_CCC_L = 224, /* Left */
254 UC_CCC_R = 226, /* Right */
255 UC_CCC_AL = 228, /* Above Left */
256 UC_CCC_A = 230, /* Above */
257 UC_CCC_AR = 232, /* Above Right */
258 UC_CCC_DB = 233, /* Double Below */
259 UC_CCC_DA = 234, /* Double Above */
260 UC_CCC_IS = 240 /* Iota Subscript */
263 /* Return the canonical combining class of a Unicode character. */
265 uc_combining_class (ucs4_t uc);
267 /* Return the name of a canonical combining class. */
269 uc_combining_class_name (int ccc);
271 /* ========================================================================= */
273 /* Field 4 of Unicode Character Database: Bidi class.
274 Before Unicode 4.0, this field was called "Bidirectional category". */
278 UC_BIDI_L, /* Left-to-Right */
279 UC_BIDI_LRE, /* Left-to-Right Embedding */
280 UC_BIDI_LRO, /* Left-to-Right Override */
281 UC_BIDI_R, /* Right-to-Left */
282 UC_BIDI_AL, /* Right-to-Left Arabic */
283 UC_BIDI_RLE, /* Right-to-Left Embedding */
284 UC_BIDI_RLO, /* Right-to-Left Override */
285 UC_BIDI_PDF, /* Pop Directional Format */
286 UC_BIDI_EN, /* European Number */
287 UC_BIDI_ES, /* European Number Separator */
288 UC_BIDI_ET, /* European Number Terminator */
289 UC_BIDI_AN, /* Arabic Number */
290 UC_BIDI_CS, /* Common Number Separator */
291 UC_BIDI_NSM, /* Non-Spacing Mark */
292 UC_BIDI_BN, /* Boundary Neutral */
293 UC_BIDI_B, /* Paragraph Separator */
294 UC_BIDI_S, /* Segment Separator */
295 UC_BIDI_WS, /* Whitespace */
296 UC_BIDI_ON /* Other Neutral */
299 /* Return the name of a bidi class. */
301 uc_bidi_class_name (int bidi_class);
302 /* Same; obsolete function name. */
304 uc_bidi_category_name (int category);
306 /* Return the bidi class given by name, e.g. "LRE". */
308 uc_bidi_class_byname (const char *bidi_class_name);
309 /* Same; obsolete function name. */
311 uc_bidi_category_byname (const char *category_name);
313 /* Return the bidi class of a Unicode character. */
315 uc_bidi_class (ucs4_t uc);
316 /* Same; obsolete function name. */
318 uc_bidi_category (ucs4_t uc);
320 /* Test whether a Unicode character belongs to a given bidi class. */
322 uc_is_bidi_class (ucs4_t uc, int bidi_class);
323 /* Same; obsolete function name. */
325 uc_is_bidi_category (ucs4_t uc, int category);
327 /* ========================================================================= */
329 /* Field 5 of Unicode Character Database: Character decomposition mapping. See "uninorm.h". */
332 /* ========================================================================= */
334 /* Field 6 of Unicode Character Database: Decimal digit value. */
336 /* Return the decimal digit value of a Unicode character. */
338 uc_decimal_value (ucs4_t uc);
340 /* ========================================================================= */
342 /* Field 7 of Unicode Character Database: Digit value. */
344 /* Return the digit value of a Unicode character. */
346 uc_digit_value (ucs4_t uc);
348 /* ========================================================================= */
350 /* Field 8 of Unicode Character Database: Numeric value. */
352 /* Return the numeric value of a Unicode character. */
360 uc_numeric_value (ucs4_t uc);
362 /* ========================================================================= */
364 /* Field 9 of Unicode Character Database: Mirrored. */
366 /* Return the mirrored character of a Unicode character UC in *PUC. */
368 uc_mirror_char (ucs4_t uc, ucs4_t *puc);
370 /* ========================================================================= */
372 /* Field 10 of Unicode Character Database: Unicode 1.0 Name.
373 Not available in this library. */
375 /* ========================================================================= */
377 /* Field 11 of Unicode Character Database: ISO 10646 comment.
378 Not available in this library. */
380 /* ========================================================================= */
382 /* Field 12, 13, 14 of Unicode Character Database: Uppercase mapping,
383 lowercase mapping, titlecase mapping. See "unicase.h". */
385 /* ========================================================================= */
387 /* Field 2 of the file ArabicShaping.txt in the Unicode Character Database. */
389 /* Possible joining types. */
392 UC_JOINING_TYPE_U, /* Non_Joining */
393 UC_JOINING_TYPE_T, /* Transparent */
394 UC_JOINING_TYPE_C, /* Join_Causing */
395 UC_JOINING_TYPE_L, /* Left_Joining */
396 UC_JOINING_TYPE_R, /* Right_Joining */
397 UC_JOINING_TYPE_D /* Dual_Joining */
400 /* Return the name of a joining type. */
402 uc_joining_type_name (int joining_type);
404 /* Return the joining type given by name, e.g. "D". */
406 uc_joining_type_byname (const char *joining_type_name);
408 /* Return the joining type of a Unicode character. */
410 uc_joining_type (ucs4_t uc);
412 /* ========================================================================= */
414 /* Field 3 of the file ArabicShaping.txt in the Unicode Character Database. */
416 /* Possible joining groups.
417 This enumeration may be extended in the future. */
420 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
421 UC_JOINING_GROUP_AIN, /* Ain */
422 UC_JOINING_GROUP_ALAPH, /* Alaph */
423 UC_JOINING_GROUP_ALEF, /* Alef */
424 UC_JOINING_GROUP_BEH, /* Beh */
425 UC_JOINING_GROUP_BETH, /* Beth */
426 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
427 UC_JOINING_GROUP_DAL, /* Dal */
428 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
429 UC_JOINING_GROUP_E, /* E */
430 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
431 UC_JOINING_GROUP_FE, /* Fe */
432 UC_JOINING_GROUP_FEH, /* Feh */
433 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
434 UC_JOINING_GROUP_GAF, /* Gaf */
435 UC_JOINING_GROUP_GAMAL, /* Gamal */
436 UC_JOINING_GROUP_HAH, /* Hah */
437 UC_JOINING_GROUP_HE, /* He */
438 UC_JOINING_GROUP_HEH, /* Heh */
439 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
440 UC_JOINING_GROUP_HETH, /* Heth */
441 UC_JOINING_GROUP_KAF, /* Kaf */
442 UC_JOINING_GROUP_KAPH, /* Kaph */
443 UC_JOINING_GROUP_KHAPH, /* Khaph */
444 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
445 UC_JOINING_GROUP_LAM, /* Lam */
446 UC_JOINING_GROUP_LAMADH, /* Lamadh */
447 UC_JOINING_GROUP_MEEM, /* Meem */
448 UC_JOINING_GROUP_MIM, /* Mim */
449 UC_JOINING_GROUP_NOON, /* Noon */
450 UC_JOINING_GROUP_NUN, /* Nun */
451 UC_JOINING_GROUP_NYA, /* Nya */
452 UC_JOINING_GROUP_PE, /* Pe */
453 UC_JOINING_GROUP_QAF, /* Qaf */
454 UC_JOINING_GROUP_QAPH, /* Qaph */
455 UC_JOINING_GROUP_REH, /* Reh */
456 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
457 UC_JOINING_GROUP_SAD, /* Sad */
458 UC_JOINING_GROUP_SADHE, /* Sadhe */
459 UC_JOINING_GROUP_SEEN, /* Seen */
460 UC_JOINING_GROUP_SEMKATH, /* Semkath */
461 UC_JOINING_GROUP_SHIN, /* Shin */
462 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
463 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
464 UC_JOINING_GROUP_TAH, /* Tah */
465 UC_JOINING_GROUP_TAW, /* Taw */
466 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
467 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
468 UC_JOINING_GROUP_TETH, /* Teth */
469 UC_JOINING_GROUP_WAW, /* Waw */
470 UC_JOINING_GROUP_YEH, /* Yeh */
471 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
472 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
473 UC_JOINING_GROUP_YUDH, /* Yudh */
474 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
475 UC_JOINING_GROUP_ZAIN, /* Zain */
476 UC_JOINING_GROUP_ZHAIN /* Zhain */
479 /* Return the name of a joining group. */
481 uc_joining_group_name (int joining_group);
483 /* Return the joining group given by name, e.g. "Teh_Marbuta". */
485 uc_joining_group_byname (const char *joining_group_name);
487 /* Return the joining group of a Unicode character. */
489 uc_joining_group (ucs4_t uc);
491 /* ========================================================================= */
493 /* Common API for properties. */
495 /* Data type denoting a property. This is not just a number, but rather a
496 pointer to the test functions, so that programs that use only few of the
497 properties don't have a link-time dependency towards all the tables. */
500 bool (*test_fn) (ucs4_t uc);
504 /* Predefined properties.
   Each constant is a uc_property_t value that can be passed to
   uc_is_property.  For every constant UC_PROPERTY_<NAME> this header also
   declares a direct test function uc_is_property_<name>.  The constants are
   only declared here; the definitions live outside this header.  */
/* General.  */
506 extern const uc_property_t UC_PROPERTY_WHITE_SPACE;
507 extern const uc_property_t UC_PROPERTY_ALPHABETIC;
508 extern const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
509 extern const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
510 extern const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
511 extern const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
512 extern const uc_property_t UC_PROPERTY_DEPRECATED;
513 extern const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
514 extern const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
515 extern const uc_property_t UC_PROPERTY_PRIVATE_USE;
516 extern const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
/* Case.  */
518 extern const uc_property_t UC_PROPERTY_UPPERCASE;
519 extern const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
520 extern const uc_property_t UC_PROPERTY_LOWERCASE;
521 extern const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
522 extern const uc_property_t UC_PROPERTY_TITLECASE;
523 extern const uc_property_t UC_PROPERTY_CASED;
524 extern const uc_property_t UC_PROPERTY_CASE_IGNORABLE;
525 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_LOWERCASED;
526 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_UPPERCASED;
527 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_TITLECASED;
528 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEFOLDED;
529 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEMAPPED;
530 extern const uc_property_t UC_PROPERTY_SOFT_DOTTED;
/* Identifiers.  */
532 extern const uc_property_t UC_PROPERTY_ID_START;
533 extern const uc_property_t UC_PROPERTY_OTHER_ID_START;
534 extern const uc_property_t UC_PROPERTY_ID_CONTINUE;
535 extern const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
536 extern const uc_property_t UC_PROPERTY_XID_START;
537 extern const uc_property_t UC_PROPERTY_XID_CONTINUE;
538 extern const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
539 extern const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
540 /* Shaping and rendering. */
541 extern const uc_property_t UC_PROPERTY_JOIN_CONTROL;
542 extern const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
543 extern const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
544 extern const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
545 extern const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
/* Bidi.  */
547 extern const uc_property_t UC_PROPERTY_BIDI_CONTROL;
548 extern const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
549 extern const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
550 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
551 extern const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
552 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
553 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
554 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
555 extern const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
556 extern const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
557 extern const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
558 extern const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
559 extern const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
560 extern const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
561 extern const uc_property_t UC_PROPERTY_BIDI_PDF;
562 extern const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
563 extern const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
/* Numeric.  */
565 extern const uc_property_t UC_PROPERTY_HEX_DIGIT;
566 extern const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
/* CJK.  */
568 extern const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
569 extern const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
570 extern const uc_property_t UC_PROPERTY_RADICAL;
571 extern const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
572 extern const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
/* Misc.  */
574 extern const uc_property_t UC_PROPERTY_ZERO_WIDTH;
575 extern const uc_property_t UC_PROPERTY_SPACE;
576 extern const uc_property_t UC_PROPERTY_NON_BREAK;
577 extern const uc_property_t UC_PROPERTY_ISO_CONTROL;
578 extern const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
579 extern const uc_property_t UC_PROPERTY_DASH;
580 extern const uc_property_t UC_PROPERTY_HYPHEN;
581 extern const uc_property_t UC_PROPERTY_PUNCTUATION;
582 extern const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
583 extern const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
584 extern const uc_property_t UC_PROPERTY_QUOTATION_MARK;
585 extern const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
586 extern const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
587 extern const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
588 extern const uc_property_t UC_PROPERTY_MATH;
589 extern const uc_property_t UC_PROPERTY_OTHER_MATH;
590 extern const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
591 extern const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
592 extern const uc_property_t UC_PROPERTY_COMBINING;
593 extern const uc_property_t UC_PROPERTY_COMPOSITE;
594 extern const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
595 extern const uc_property_t UC_PROPERTY_NUMERIC;
596 extern const uc_property_t UC_PROPERTY_DIACRITIC;
597 extern const uc_property_t UC_PROPERTY_EXTENDER;
598 extern const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
600 /* Return the property given by name, e.g. "White space". */
602 uc_property_byname (const char *property_name);
604 /* Test whether a property is valid.
   Evaluates to nonzero when the test_fn member of PROPERTY is not NULL.
   PROPERTY must be an expression of structure type (a value, not a
   pointer); it is evaluated exactly once.  NULL must be defined wherever
   the macro is expanded.  */
605 #define uc_property_is_valid(property) ((property).test_fn != NULL)
607 /* Test whether a Unicode character has a given property. */
609 uc_is_property (ucs4_t uc, uc_property_t property);
/* Direct test functions, one per predefined UC_PROPERTY_* constant, listed
   in the same order and named with the lower-cased property name.  Each
   takes the character to test and returns a bool.
   NOTE(review): presumably these are the functions stored in the test_fn
   member of the matching constant -- confirm against the definitions.  */
/* General.  */
610 extern bool uc_is_property_white_space (ucs4_t uc);
611 extern bool uc_is_property_alphabetic (ucs4_t uc);
612 extern bool uc_is_property_other_alphabetic (ucs4_t uc);
613 extern bool uc_is_property_not_a_character (ucs4_t uc);
614 extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc);
615 extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc);
616 extern bool uc_is_property_deprecated (ucs4_t uc);
617 extern bool uc_is_property_logical_order_exception (ucs4_t uc);
618 extern bool uc_is_property_variation_selector (ucs4_t uc);
619 extern bool uc_is_property_private_use (ucs4_t uc);
620 extern bool uc_is_property_unassigned_code_value (ucs4_t uc);
/* Case.  */
621 extern bool uc_is_property_uppercase (ucs4_t uc);
622 extern bool uc_is_property_other_uppercase (ucs4_t uc);
623 extern bool uc_is_property_lowercase (ucs4_t uc);
624 extern bool uc_is_property_other_lowercase (ucs4_t uc);
625 extern bool uc_is_property_titlecase (ucs4_t uc);
626 extern bool uc_is_property_cased (ucs4_t uc);
627 extern bool uc_is_property_case_ignorable (ucs4_t uc);
628 extern bool uc_is_property_changes_when_lowercased (ucs4_t uc);
629 extern bool uc_is_property_changes_when_uppercased (ucs4_t uc);
630 extern bool uc_is_property_changes_when_titlecased (ucs4_t uc);
631 extern bool uc_is_property_changes_when_casefolded (ucs4_t uc);
632 extern bool uc_is_property_changes_when_casemapped (ucs4_t uc);
633 extern bool uc_is_property_soft_dotted (ucs4_t uc);
/* Identifiers.  */
634 extern bool uc_is_property_id_start (ucs4_t uc);
635 extern bool uc_is_property_other_id_start (ucs4_t uc);
636 extern bool uc_is_property_id_continue (ucs4_t uc);
637 extern bool uc_is_property_other_id_continue (ucs4_t uc);
638 extern bool uc_is_property_xid_start (ucs4_t uc);
639 extern bool uc_is_property_xid_continue (ucs4_t uc);
640 extern bool uc_is_property_pattern_white_space (ucs4_t uc);
641 extern bool uc_is_property_pattern_syntax (ucs4_t uc);
/* Shaping and rendering.  */
642 extern bool uc_is_property_join_control (ucs4_t uc);
643 extern bool uc_is_property_grapheme_base (ucs4_t uc);
644 extern bool uc_is_property_grapheme_extend (ucs4_t uc);
645 extern bool uc_is_property_other_grapheme_extend (ucs4_t uc);
646 extern bool uc_is_property_grapheme_link (ucs4_t uc);
/* Bidi.  */
647 extern bool uc_is_property_bidi_control (ucs4_t uc);
648 extern bool uc_is_property_bidi_left_to_right (ucs4_t uc);
649 extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc);
650 extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc);
651 extern bool uc_is_property_bidi_european_digit (ucs4_t uc);
652 extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc);
653 extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc);
654 extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc);
655 extern bool uc_is_property_bidi_common_separator (ucs4_t uc);
656 extern bool uc_is_property_bidi_block_separator (ucs4_t uc);
657 extern bool uc_is_property_bidi_segment_separator (ucs4_t uc);
658 extern bool uc_is_property_bidi_whitespace (ucs4_t uc);
659 extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc);
660 extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc);
661 extern bool uc_is_property_bidi_pdf (ucs4_t uc);
662 extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc);
663 extern bool uc_is_property_bidi_other_neutral (ucs4_t uc);
/* Numeric.  */
664 extern bool uc_is_property_hex_digit (ucs4_t uc);
665 extern bool uc_is_property_ascii_hex_digit (ucs4_t uc);
/* CJK.  */
666 extern bool uc_is_property_ideographic (ucs4_t uc);
667 extern bool uc_is_property_unified_ideograph (ucs4_t uc);
668 extern bool uc_is_property_radical (ucs4_t uc);
669 extern bool uc_is_property_ids_binary_operator (ucs4_t uc);
670 extern bool uc_is_property_ids_trinary_operator (ucs4_t uc);
/* Misc.  */
671 extern bool uc_is_property_zero_width (ucs4_t uc);
672 extern bool uc_is_property_space (ucs4_t uc);
673 extern bool uc_is_property_non_break (ucs4_t uc);
674 extern bool uc_is_property_iso_control (ucs4_t uc);
675 extern bool uc_is_property_format_control (ucs4_t uc);
676 extern bool uc_is_property_dash (ucs4_t uc);
677 extern bool uc_is_property_hyphen (ucs4_t uc);
678 extern bool uc_is_property_punctuation (ucs4_t uc);
679 extern bool uc_is_property_line_separator (ucs4_t uc);
680 extern bool uc_is_property_paragraph_separator (ucs4_t uc);
681 extern bool uc_is_property_quotation_mark (ucs4_t uc);
682 extern bool uc_is_property_sentence_terminal (ucs4_t uc);
683 extern bool uc_is_property_terminal_punctuation (ucs4_t uc);
684 extern bool uc_is_property_currency_symbol (ucs4_t uc);
685 extern bool uc_is_property_math (ucs4_t uc);
686 extern bool uc_is_property_other_math (ucs4_t uc);
687 extern bool uc_is_property_paired_punctuation (ucs4_t uc);
688 extern bool uc_is_property_left_of_pair (ucs4_t uc);
689 extern bool uc_is_property_combining (ucs4_t uc);
690 extern bool uc_is_property_composite (ucs4_t uc);
691 extern bool uc_is_property_decimal_digit (ucs4_t uc);
692 extern bool uc_is_property_numeric (ucs4_t uc);
693 extern bool uc_is_property_diacritic (ucs4_t uc);
694 extern bool uc_is_property_extender (ucs4_t uc);
695 extern bool uc_is_property_ignorable_control (ucs4_t uc);
697 /* ========================================================================= */
699 /* Subdivision of the Unicode characters into scripts. */
703 unsigned int code : 21;
704 unsigned int start : 1;
705 unsigned int end : 1;
710 unsigned int nintervals;
711 const uc_interval_t *intervals;
716 /* Return the script of a Unicode character.
   UC is the character to look up.  The result points to a const
   uc_script_t, so the caller must not modify it.
   NOTE(review): the result for a character that belongs to no script is
   not visible in this header -- presumably NULL; confirm.  */
717 extern const uc_script_t *
718 uc_script (ucs4_t uc);
720 /* Return the script given by name, e.g. "HAN".
   SCRIPT_NAME is a C string that is not modified through this pointer.
   The result points to a const uc_script_t, so the caller must not modify
   it.
   NOTE(review): the result for an unknown name is not visible in this
   header -- presumably NULL; confirm.  */
721 extern const uc_script_t *
722 uc_script_byname (const char *script_name);
724 /* Test whether a Unicode character belongs to a given script. */
726 uc_is_script (ucs4_t uc, const uc_script_t *script);
728 /* Get the list of all scripts. */
730 uc_all_scripts (const uc_script_t **scripts, size_t *count);
732 /* ========================================================================= */
734 /* Subdivision of the Unicode character range into blocks. */
744 /* Return the block a character belongs to.
   UC is the character to look up.  The result points to a const
   uc_block_t, so the caller must not modify it.
   NOTE(review): the result for a character outside every block is not
   visible in this header -- presumably NULL; confirm.  */
745 extern const uc_block_t *
746 uc_block (ucs4_t uc);
748 /* Test whether a Unicode character belongs to a given block. */
750 uc_is_block (ucs4_t uc, const uc_block_t *block);
752 /* Get the list of all blocks. */
754 uc_all_blocks (const uc_block_t **blocks, size_t *count);
756 /* ========================================================================= */
758 /* Properties taken from language standards. */
760 /* Test whether a Unicode character is considered whitespace in ISO C 99. */
762 uc_is_c_whitespace (ucs4_t uc);
764 /* Test whether a Unicode character is considered whitespace in Java. */
766 uc_is_java_whitespace (ucs4_t uc);
770 UC_IDENTIFIER_START, /* valid as first or subsequent character */
771 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
772 UC_IDENTIFIER_INVALID, /* not valid */
773 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
776 /* Return the categorization of a Unicode character w.r.t. the ISO C 99
777 identifier syntax. */
779 uc_c_ident_category (ucs4_t uc);
781 /* Return the categorization of a Unicode character w.r.t. the Java
782 identifier syntax. */
784 uc_java_ident_category (ucs4_t uc);
786 /* ========================================================================= */
788 /* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated,
789 because this set of functions was designed with ASCII in mind and cannot
790 reflect the more diverse reality of the Unicode character set. But they
791 can be a quick-and-dirty porting aid when migrating from wchar_t APIs
792 to Unicode strings. */
794 /* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true. */
796 uc_is_alnum (ucs4_t uc);
798 /* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
799 or any character that is one of a locale-specific set of characters for
800 which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
803 uc_is_alpha (ucs4_t uc);
805 /* Test for any control character. */
807 uc_is_cntrl (ucs4_t uc);
809 /* Test for any character that corresponds to a decimal-digit character. */
811 uc_is_digit (ucs4_t uc);
813 /* Test for any character for which 'uc_is_print' is true and 'uc_is_space' is false. */
816 uc_is_graph (ucs4_t uc);
818 /* Test for any character that corresponds to a lowercase letter or is one
819 of a locale-specific set of characters for which none of 'uc_is_cntrl',
820 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
822 uc_is_lower (ucs4_t uc);
824 /* Test for any printing character. */
826 uc_is_print (ucs4_t uc);
828 /* Test for any printing character that is one of a locale-specific set of
829 characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true. */
831 uc_is_punct (ucs4_t uc);
833 /* Test for any character that corresponds to a locale-specific set of
834 characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct' is true. */
837 uc_is_space (ucs4_t uc);
839 /* Test for any character that corresponds to an uppercase letter or is one
840 of a locale-specific set of characters for which none of 'uc_is_cntrl',
841 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
843 uc_is_upper (ucs4_t uc);
845 /* Test for any character that corresponds to a hexadecimal-digit character. */
848 uc_is_xdigit (ucs4_t uc);
851 /* Test for any character that corresponds to a standard blank character or
852 a locale-specific set of characters for which 'uc_is_alnum' is false. */
854 uc_is_blank (ucs4_t uc);
856 /* ========================================================================= */
862 #endif /* _UNICTYPE_H */