1 /* Unicode character classification and properties.
2 Copyright (C) 2002, 2005-2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
32 /* ========================================================================= */
34 /* Field 1 of Unicode Character Database: Character name. See "uniname.h". */
37 /* ========================================================================= */
39 /* Field 2 of Unicode Character Database: General category. */
41 /* Data type denoting a General category value. This is not just a bitmask,
42 but rather a bitmask and a pointer to the lookup table, so that programs
43 that use only the predefined bitmasks (i.e. don't combine bitmasks with &
44 and |) don't have a link-time dependency towards the big general table. */
47 uint32_t bitmask : 31;
48 /*bool*/ unsigned int generic : 1;
51 const void *table; /* when generic is 0 */
52 bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
55 uc_general_category_t;
57 /* Bits and bit masks denoting General category values. UnicodeData-3.2.0.html
58 says a 32-bit integer will always suffice to represent them.
59 These bit masks are just informative; you cannot use them in any API. */
/* Each two-letter subcategory owns exactly one bit.  Each one-letter major
   class is the bitwise OR of the bits of its subcategories.  The highest bit
   in use is bit 29 (Cn), so every mask fits the 31-bit 'bitmask' field of
   uc_general_category_t. */
62 UC_CATEGORY_MASK_L = 0x0000001f, /* Lu | Ll | Lt | Lm | Lo */
63 UC_CATEGORY_MASK_Lu = 0x00000001,
64 UC_CATEGORY_MASK_Ll = 0x00000002,
65 UC_CATEGORY_MASK_Lt = 0x00000004,
66 UC_CATEGORY_MASK_Lm = 0x00000008,
67 UC_CATEGORY_MASK_Lo = 0x00000010,
68 UC_CATEGORY_MASK_M = 0x000000e0, /* Mn | Mc | Me */
69 UC_CATEGORY_MASK_Mn = 0x00000020,
70 UC_CATEGORY_MASK_Mc = 0x00000040,
71 UC_CATEGORY_MASK_Me = 0x00000080,
72 UC_CATEGORY_MASK_N = 0x00000700, /* Nd | Nl | No */
73 UC_CATEGORY_MASK_Nd = 0x00000100,
74 UC_CATEGORY_MASK_Nl = 0x00000200,
75 UC_CATEGORY_MASK_No = 0x00000400,
76 UC_CATEGORY_MASK_P = 0x0003f800, /* Pc | Pd | Ps | Pe | Pi | Pf | Po */
77 UC_CATEGORY_MASK_Pc = 0x00000800,
78 UC_CATEGORY_MASK_Pd = 0x00001000,
79 UC_CATEGORY_MASK_Ps = 0x00002000,
80 UC_CATEGORY_MASK_Pe = 0x00004000,
81 UC_CATEGORY_MASK_Pi = 0x00008000,
82 UC_CATEGORY_MASK_Pf = 0x00010000,
83 UC_CATEGORY_MASK_Po = 0x00020000,
84 UC_CATEGORY_MASK_S = 0x003c0000, /* Sm | Sc | Sk | So */
85 UC_CATEGORY_MASK_Sm = 0x00040000,
86 UC_CATEGORY_MASK_Sc = 0x00080000,
87 UC_CATEGORY_MASK_Sk = 0x00100000,
88 UC_CATEGORY_MASK_So = 0x00200000,
89 UC_CATEGORY_MASK_Z = 0x01c00000, /* Zs | Zl | Zp */
90 UC_CATEGORY_MASK_Zs = 0x00400000,
91 UC_CATEGORY_MASK_Zl = 0x00800000,
92 UC_CATEGORY_MASK_Zp = 0x01000000,
93 UC_CATEGORY_MASK_C = 0x3e000000, /* Cc | Cf | Cs | Co | Cn */
94 UC_CATEGORY_MASK_Cc = 0x02000000,
95 UC_CATEGORY_MASK_Cf = 0x04000000,
96 UC_CATEGORY_MASK_Cs = 0x08000000,
97 UC_CATEGORY_MASK_Co = 0x10000000,
98 UC_CATEGORY_MASK_Cn = 0x20000000
101 /* Predefined General category values. */
/* Letters (UC_LETTER and its subcategories). */
102 extern const uc_general_category_t UC_CATEGORY_L;
103 extern const uc_general_category_t UC_CATEGORY_Lu;
104 extern const uc_general_category_t UC_CATEGORY_Ll;
105 extern const uc_general_category_t UC_CATEGORY_Lt;
106 extern const uc_general_category_t UC_CATEGORY_Lm;
107 extern const uc_general_category_t UC_CATEGORY_Lo;
/* Marks (UC_MARK and its subcategories). */
108 extern const uc_general_category_t UC_CATEGORY_M;
109 extern const uc_general_category_t UC_CATEGORY_Mn;
110 extern const uc_general_category_t UC_CATEGORY_Mc;
111 extern const uc_general_category_t UC_CATEGORY_Me;
/* Numbers (UC_NUMBER and its subcategories). */
112 extern const uc_general_category_t UC_CATEGORY_N;
113 extern const uc_general_category_t UC_CATEGORY_Nd;
114 extern const uc_general_category_t UC_CATEGORY_Nl;
115 extern const uc_general_category_t UC_CATEGORY_No;
/* Punctuation (UC_PUNCTUATION and its subcategories). */
116 extern const uc_general_category_t UC_CATEGORY_P;
117 extern const uc_general_category_t UC_CATEGORY_Pc;
118 extern const uc_general_category_t UC_CATEGORY_Pd;
119 extern const uc_general_category_t UC_CATEGORY_Ps;
120 extern const uc_general_category_t UC_CATEGORY_Pe;
121 extern const uc_general_category_t UC_CATEGORY_Pi;
122 extern const uc_general_category_t UC_CATEGORY_Pf;
123 extern const uc_general_category_t UC_CATEGORY_Po;
/* Symbols (UC_SYMBOL and its subcategories). */
124 extern const uc_general_category_t UC_CATEGORY_S;
125 extern const uc_general_category_t UC_CATEGORY_Sm;
126 extern const uc_general_category_t UC_CATEGORY_Sc;
127 extern const uc_general_category_t UC_CATEGORY_Sk;
128 extern const uc_general_category_t UC_CATEGORY_So;
/* Separators (UC_SEPARATOR and its subcategories). */
129 extern const uc_general_category_t UC_CATEGORY_Z;
130 extern const uc_general_category_t UC_CATEGORY_Zs;
131 extern const uc_general_category_t UC_CATEGORY_Zl;
132 extern const uc_general_category_t UC_CATEGORY_Zp;
/* Others (UC_OTHER and its subcategories). */
133 extern const uc_general_category_t UC_CATEGORY_C;
134 extern const uc_general_category_t UC_CATEGORY_Cc;
135 extern const uc_general_category_t UC_CATEGORY_Cf;
136 extern const uc_general_category_t UC_CATEGORY_Cs;
137 extern const uc_general_category_t UC_CATEGORY_Co;
138 extern const uc_general_category_t UC_CATEGORY_Cn;
/* NOTE(review): the leading underscore suggests this is an internal
   "no category" value that is not part of the public API -- confirm. */
140 extern const uc_general_category_t _UC_CATEGORY_NONE;
142 /* Alias names for predefined General category values. */
/* Each alias is a plain macro that expands to the corresponding predefined
   UC_CATEGORY_xx object, so it can be used wherever a
   uc_general_category_t value is expected. */
143 #define UC_LETTER UC_CATEGORY_L
144 #define UC_UPPERCASE_LETTER UC_CATEGORY_Lu
145 #define UC_LOWERCASE_LETTER UC_CATEGORY_Ll
146 #define UC_TITLECASE_LETTER UC_CATEGORY_Lt
147 #define UC_MODIFIER_LETTER UC_CATEGORY_Lm
148 #define UC_OTHER_LETTER UC_CATEGORY_Lo
149 #define UC_MARK UC_CATEGORY_M
150 #define UC_NON_SPACING_MARK UC_CATEGORY_Mn
151 #define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc
152 #define UC_ENCLOSING_MARK UC_CATEGORY_Me
153 #define UC_NUMBER UC_CATEGORY_N
154 #define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd
155 #define UC_LETTER_NUMBER UC_CATEGORY_Nl
156 #define UC_OTHER_NUMBER UC_CATEGORY_No
157 #define UC_PUNCTUATION UC_CATEGORY_P
158 #define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc
159 #define UC_DASH_PUNCTUATION UC_CATEGORY_Pd
160 #define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
161 #define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
162 #define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
163 #define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf
164 #define UC_OTHER_PUNCTUATION UC_CATEGORY_Po
165 #define UC_SYMBOL UC_CATEGORY_S
166 #define UC_MATH_SYMBOL UC_CATEGORY_Sm
167 #define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc
168 #define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk
169 #define UC_OTHER_SYMBOL UC_CATEGORY_So
170 #define UC_SEPARATOR UC_CATEGORY_Z
171 #define UC_SPACE_SEPARATOR UC_CATEGORY_Zs
172 #define UC_LINE_SEPARATOR UC_CATEGORY_Zl
173 #define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp
174 #define UC_OTHER UC_CATEGORY_C
175 #define UC_CONTROL UC_CATEGORY_Cc
176 #define UC_FORMAT UC_CATEGORY_Cf
177 #define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */
178 #define UC_PRIVATE_USE UC_CATEGORY_Co
179 #define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */
181 /* Return the union of two general categories.
182 This corresponds to the unions of the two sets of characters.
   Both arguments and the result are passed by value.
   NOTE(review): the result is presumably a combined ("generic") category
   value rather than one of the predefined objects -- confirm against the
   implementation. */
183 extern uc_general_category_t
184 uc_general_category_or (uc_general_category_t category1,
185 uc_general_category_t category2);
187 /* Return the intersection of two general categories as bit masks.
188 This *does*not* correspond to the intersection of the two sets of characters. */
190 extern uc_general_category_t
191 uc_general_category_and (uc_general_category_t category1,
192 uc_general_category_t category2);
194 /* Return the intersection of a general category with the complement of a
195 second general category, as bit masks.
196 This *does*not* correspond to the intersection with complement, when
197 viewing the categories as sets of characters.
   In other words, the operation acts on the category bit masks
   (presumably category1 & ~category2 -- TODO confirm), not on individual
   code points. */
198 extern uc_general_category_t
199 uc_general_category_and_not (uc_general_category_t category1,
200 uc_general_category_t category2);
202 /* Return the name of a general category. */
204 uc_general_category_name (uc_general_category_t category);
206 /* Return the general category given by name, e.g. "Lu".
   CATEGORY_NAME is a NUL-terminated string and is not modified.
   NOTE(review): the result for an unrecognized name is not visible here;
   possibly _UC_CATEGORY_NONE -- confirm against the implementation. */
207 extern uc_general_category_t
208 uc_general_category_byname (const char *category_name);
210 /* Return the general category of a Unicode character.
   The result is returned by value as a uc_general_category_t.
   NOTE(review): the result for code points outside the Unicode range is
   not visible here -- confirm against the implementation. */
211 extern uc_general_category_t
212 uc_general_category (ucs4_t uc);
214 /* Test whether a Unicode character belongs to a given category.
215 The CATEGORY argument can be the combination of several built-in
216 general categories. */
218 uc_is_general_category (ucs4_t uc, uc_general_category_t category);
219 /* Likewise. This function uses a big table comprising all categories. */
221 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask);
223 /* ========================================================================= */
225 /* Field 3 of Unicode Character Database: Canonical combining class. */
227 /* The possible results of uc_combining_class (0..255) are described in
228 UCD.html. The list here is not definitive; more values can be added
229 in future versions. */
232 UC_CCC_NR = 0, /* Not Reordered */
233 UC_CCC_OV = 1, /* Overlay */
234 UC_CCC_NK = 7, /* Nukta */
235 UC_CCC_KV = 8, /* Kana Voicing */
236 UC_CCC_VR = 9, /* Virama */
237 UC_CCC_ATBL = 200, /* Attached Below Left */
238 UC_CCC_ATB = 202, /* Attached Below */
239 UC_CCC_ATAR = 216, /* Attached Above Right */
240 UC_CCC_BL = 218, /* Below Left */
241 UC_CCC_B = 220, /* Below */
242 UC_CCC_BR = 222, /* Below Right */
243 UC_CCC_L = 224, /* Left */
244 UC_CCC_R = 226, /* Right */
245 UC_CCC_AL = 228, /* Above Left */
246 UC_CCC_A = 230, /* Above */
247 UC_CCC_AR = 232, /* Above Right */
248 UC_CCC_DB = 233, /* Double Below */
249 UC_CCC_DA = 234, /* Double Above */
250 UC_CCC_IS = 240, /* Iota Subscript */
253 /* Return the canonical combining class of a Unicode character. */
255 uc_combining_class (ucs4_t uc);
257 /* ========================================================================= */
259 /* Field 4 of Unicode Character Database: Bidirectional category. */
263 UC_BIDI_L, /* Left-to-Right */
264 UC_BIDI_LRE, /* Left-to-Right Embedding */
265 UC_BIDI_LRO, /* Left-to-Right Override */
266 UC_BIDI_R, /* Right-to-Left */
267 UC_BIDI_AL, /* Right-to-Left Arabic */
268 UC_BIDI_RLE, /* Right-to-Left Embedding */
269 UC_BIDI_RLO, /* Right-to-Left Override */
270 UC_BIDI_PDF, /* Pop Directional Format */
271 UC_BIDI_EN, /* European Number */
272 UC_BIDI_ES, /* European Number Separator */
273 UC_BIDI_ET, /* European Number Terminator */
274 UC_BIDI_AN, /* Arabic Number */
275 UC_BIDI_CS, /* Common Number Separator */
276 UC_BIDI_NSM, /* Non-Spacing Mark */
277 UC_BIDI_BN, /* Boundary Neutral */
278 UC_BIDI_B, /* Paragraph Separator */
279 UC_BIDI_S, /* Segment Separator */
280 UC_BIDI_WS, /* Whitespace */
281 UC_BIDI_ON /* Other Neutral */
284 /* Return the name of a bidirectional category. */
286 uc_bidi_category_name (int category);
288 /* Return the bidirectional category given by name, e.g. "LRE". */
290 uc_bidi_category_byname (const char *category_name);
292 /* Return the bidirectional category of a Unicode character. */
294 uc_bidi_category (ucs4_t uc);
296 /* Test whether a Unicode character belongs to a given bidirectional category. */
299 uc_is_bidi_category (ucs4_t uc, int category);
301 /* ========================================================================= */
303 /* Field 5 of Unicode Character Database: Character decomposition mapping. See "uninorm.h". */
306 /* ========================================================================= */
308 /* Field 6 of Unicode Character Database: Decimal digit value. */
310 /* Return the decimal digit value of a Unicode character. */
312 uc_decimal_value (ucs4_t uc);
314 /* ========================================================================= */
316 /* Field 7 of Unicode Character Database: Digit value. */
318 /* Return the digit value of a Unicode character. */
320 uc_digit_value (ucs4_t uc);
322 /* ========================================================================= */
324 /* Field 8 of Unicode Character Database: Numeric value. */
326 /* Return the numeric value of a Unicode character. */
334 uc_numeric_value (ucs4_t uc);
336 /* ========================================================================= */
338 /* Field 9 of Unicode Character Database: Mirrored. */
340 /* Return the mirrored character of a Unicode character UC in *PUC. */
342 uc_mirror_char (ucs4_t uc, ucs4_t *puc);
344 /* ========================================================================= */
346 /* Field 10 of Unicode Character Database: Unicode 1.0 Name.
347 Not available in this library. */
349 /* ========================================================================= */
351 /* Field 11 of Unicode Character Database: ISO 10646 comment.
352 Not available in this library. */
354 /* ========================================================================= */
356 /* Field 12, 13, 14 of Unicode Character Database: Uppercase mapping,
357 lowercase mapping, titlecase mapping. See "unicase.h". */
359 /* ========================================================================= */
361 /* Common API for properties. */
363 /* Data type denoting a property. This is not just a number, but rather a
364 pointer to the test functions, so that programs that use only few of the
365 properties don't have a link-time dependency towards all the tables. */
368 bool (*test_fn) (ucs4_t uc);
372 /* Predefined properties. */
/* General. */
374 extern const uc_property_t UC_PROPERTY_WHITE_SPACE;
375 extern const uc_property_t UC_PROPERTY_ALPHABETIC;
376 extern const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
377 extern const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
378 extern const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
379 extern const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
380 extern const uc_property_t UC_PROPERTY_DEPRECATED;
381 extern const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
382 extern const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
383 extern const uc_property_t UC_PROPERTY_PRIVATE_USE;
384 extern const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
/* Case. */
386 extern const uc_property_t UC_PROPERTY_UPPERCASE;
387 extern const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
388 extern const uc_property_t UC_PROPERTY_LOWERCASE;
389 extern const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
390 extern const uc_property_t UC_PROPERTY_TITLECASE;
391 extern const uc_property_t UC_PROPERTY_SOFT_DOTTED;
/* Identifiers and pattern syntax. */
393 extern const uc_property_t UC_PROPERTY_ID_START;
394 extern const uc_property_t UC_PROPERTY_OTHER_ID_START;
395 extern const uc_property_t UC_PROPERTY_ID_CONTINUE;
396 extern const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
397 extern const uc_property_t UC_PROPERTY_XID_START;
398 extern const uc_property_t UC_PROPERTY_XID_CONTINUE;
399 extern const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
400 extern const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
401 /* Shaping and rendering. */
402 extern const uc_property_t UC_PROPERTY_JOIN_CONTROL;
403 extern const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
404 extern const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
405 extern const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
406 extern const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
/* Bidi. */
408 extern const uc_property_t UC_PROPERTY_BIDI_CONTROL;
409 extern const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
410 extern const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
411 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
412 extern const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
413 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
414 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
415 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
416 extern const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
417 extern const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
418 extern const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
419 extern const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
420 extern const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
421 extern const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
422 extern const uc_property_t UC_PROPERTY_BIDI_PDF;
423 extern const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
424 extern const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
/* Numeric (hexadecimal digits). */
426 extern const uc_property_t UC_PROPERTY_HEX_DIGIT;
427 extern const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
/* Ideographs (CJK). */
429 extern const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
430 extern const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
431 extern const uc_property_t UC_PROPERTY_RADICAL;
432 extern const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
433 extern const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
/* Miscellaneous. */
435 extern const uc_property_t UC_PROPERTY_ZERO_WIDTH;
436 extern const uc_property_t UC_PROPERTY_SPACE;
437 extern const uc_property_t UC_PROPERTY_NON_BREAK;
438 extern const uc_property_t UC_PROPERTY_ISO_CONTROL;
439 extern const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
440 extern const uc_property_t UC_PROPERTY_DASH;
441 extern const uc_property_t UC_PROPERTY_HYPHEN;
442 extern const uc_property_t UC_PROPERTY_PUNCTUATION;
443 extern const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
444 extern const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
445 extern const uc_property_t UC_PROPERTY_QUOTATION_MARK;
446 extern const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
447 extern const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
448 extern const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
449 extern const uc_property_t UC_PROPERTY_MATH;
450 extern const uc_property_t UC_PROPERTY_OTHER_MATH;
451 extern const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
452 extern const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
453 extern const uc_property_t UC_PROPERTY_COMBINING;
454 extern const uc_property_t UC_PROPERTY_COMPOSITE;
455 extern const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
456 extern const uc_property_t UC_PROPERTY_NUMERIC;
457 extern const uc_property_t UC_PROPERTY_DIACRITIC;
458 extern const uc_property_t UC_PROPERTY_EXTENDER;
459 extern const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
461 /* Return the property given by name, e.g. "White space". */
463 uc_property_byname (const char *property_name);
465 /* Test whether a Unicode character has a given property. */
467 uc_is_property (ucs4_t uc, uc_property_t property);
/* Direct test functions, one per predefined property.  Each takes a single
   code point and returns a bool.  The names mirror the UC_PROPERTY_* objects
   declared in this header (e.g. uc_is_property_white_space corresponds to
   UC_PROPERTY_WHITE_SPACE), and the declarations appear in the same order.
   NOTE(review): presumably these are the functions stored in the 'test_fn'
   member of the corresponding uc_property_t -- confirm. */
468 extern bool uc_is_property_white_space (ucs4_t uc);
469 extern bool uc_is_property_alphabetic (ucs4_t uc);
470 extern bool uc_is_property_other_alphabetic (ucs4_t uc);
471 extern bool uc_is_property_not_a_character (ucs4_t uc);
472 extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc);
473 extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc);
474 extern bool uc_is_property_deprecated (ucs4_t uc);
475 extern bool uc_is_property_logical_order_exception (ucs4_t uc);
476 extern bool uc_is_property_variation_selector (ucs4_t uc);
477 extern bool uc_is_property_private_use (ucs4_t uc);
478 extern bool uc_is_property_unassigned_code_value (ucs4_t uc);
479 extern bool uc_is_property_uppercase (ucs4_t uc);
480 extern bool uc_is_property_other_uppercase (ucs4_t uc);
481 extern bool uc_is_property_lowercase (ucs4_t uc);
482 extern bool uc_is_property_other_lowercase (ucs4_t uc);
483 extern bool uc_is_property_titlecase (ucs4_t uc);
484 extern bool uc_is_property_soft_dotted (ucs4_t uc);
485 extern bool uc_is_property_id_start (ucs4_t uc);
486 extern bool uc_is_property_other_id_start (ucs4_t uc);
487 extern bool uc_is_property_id_continue (ucs4_t uc);
488 extern bool uc_is_property_other_id_continue (ucs4_t uc);
489 extern bool uc_is_property_xid_start (ucs4_t uc);
490 extern bool uc_is_property_xid_continue (ucs4_t uc);
491 extern bool uc_is_property_pattern_white_space (ucs4_t uc);
492 extern bool uc_is_property_pattern_syntax (ucs4_t uc);
493 extern bool uc_is_property_join_control (ucs4_t uc);
494 extern bool uc_is_property_grapheme_base (ucs4_t uc);
495 extern bool uc_is_property_grapheme_extend (ucs4_t uc);
496 extern bool uc_is_property_other_grapheme_extend (ucs4_t uc);
497 extern bool uc_is_property_grapheme_link (ucs4_t uc);
498 extern bool uc_is_property_bidi_control (ucs4_t uc);
499 extern bool uc_is_property_bidi_left_to_right (ucs4_t uc);
500 extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc);
501 extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc);
502 extern bool uc_is_property_bidi_european_digit (ucs4_t uc);
503 extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc);
504 extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc);
505 extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc);
506 extern bool uc_is_property_bidi_common_separator (ucs4_t uc);
507 extern bool uc_is_property_bidi_block_separator (ucs4_t uc);
508 extern bool uc_is_property_bidi_segment_separator (ucs4_t uc);
509 extern bool uc_is_property_bidi_whitespace (ucs4_t uc);
510 extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc);
511 extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc);
512 extern bool uc_is_property_bidi_pdf (ucs4_t uc);
513 extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc);
514 extern bool uc_is_property_bidi_other_neutral (ucs4_t uc);
515 extern bool uc_is_property_hex_digit (ucs4_t uc);
516 extern bool uc_is_property_ascii_hex_digit (ucs4_t uc);
517 extern bool uc_is_property_ideographic (ucs4_t uc);
518 extern bool uc_is_property_unified_ideograph (ucs4_t uc);
519 extern bool uc_is_property_radical (ucs4_t uc);
520 extern bool uc_is_property_ids_binary_operator (ucs4_t uc);
521 extern bool uc_is_property_ids_trinary_operator (ucs4_t uc);
522 extern bool uc_is_property_zero_width (ucs4_t uc);
523 extern bool uc_is_property_space (ucs4_t uc);
524 extern bool uc_is_property_non_break (ucs4_t uc);
525 extern bool uc_is_property_iso_control (ucs4_t uc);
526 extern bool uc_is_property_format_control (ucs4_t uc);
527 extern bool uc_is_property_dash (ucs4_t uc);
528 extern bool uc_is_property_hyphen (ucs4_t uc);
529 extern bool uc_is_property_punctuation (ucs4_t uc);
530 extern bool uc_is_property_line_separator (ucs4_t uc);
531 extern bool uc_is_property_paragraph_separator (ucs4_t uc);
532 extern bool uc_is_property_quotation_mark (ucs4_t uc);
533 extern bool uc_is_property_sentence_terminal (ucs4_t uc);
534 extern bool uc_is_property_terminal_punctuation (ucs4_t uc);
535 extern bool uc_is_property_currency_symbol (ucs4_t uc);
536 extern bool uc_is_property_math (ucs4_t uc);
537 extern bool uc_is_property_other_math (ucs4_t uc);
538 extern bool uc_is_property_paired_punctuation (ucs4_t uc);
539 extern bool uc_is_property_left_of_pair (ucs4_t uc);
540 extern bool uc_is_property_combining (ucs4_t uc);
541 extern bool uc_is_property_composite (ucs4_t uc);
542 extern bool uc_is_property_decimal_digit (ucs4_t uc);
543 extern bool uc_is_property_numeric (ucs4_t uc);
544 extern bool uc_is_property_diacritic (ucs4_t uc);
545 extern bool uc_is_property_extender (ucs4_t uc);
546 extern bool uc_is_property_ignorable_control (ucs4_t uc);
548 /* ========================================================================= */
550 /* Subdivision of the Unicode characters into scripts. */
554 unsigned int code : 21;
555 unsigned int start : 1;
556 unsigned int end : 1;
561 unsigned int nintervals;
562 const uc_interval_t *intervals;
567 /* Return the script of a Unicode character.
   The result points to a const uc_script_t, so callers must not modify it.
   NOTE(review): the result for a character that belongs to no script is
   not visible here; presumably NULL -- confirm against the implementation. */
568 extern const uc_script_t *
569 uc_script (ucs4_t uc);
571 /* Return the script given by name, e.g. "HAN".
   SCRIPT_NAME is a NUL-terminated string and is not modified.  The result
   points to a const uc_script_t.
   NOTE(review): the result for an unknown name is not visible here;
   presumably NULL -- confirm against the implementation. */
572 extern const uc_script_t *
573 uc_script_byname (const char *script_name);
575 /* Test whether a Unicode character belongs to a given script. */
577 uc_is_script (ucs4_t uc, const uc_script_t *script);
579 /* Get the list of all scripts. */
581 uc_all_scripts (const uc_script_t **scripts, size_t *count);
583 /* ========================================================================= */
585 /* Subdivision of the Unicode character range into blocks. */
595 /* Return the block a character belongs to.
   The result points to a const uc_block_t, so callers must not modify it.
   NOTE(review): the result for a character outside every block is not
   visible here; presumably NULL -- confirm against the implementation. */
596 extern const uc_block_t *
597 uc_block (ucs4_t uc);
599 /* Test whether a Unicode character belongs to a given block. */
601 uc_is_block (ucs4_t uc, const uc_block_t *block);
603 /* Get the list of all blocks. */
605 uc_all_blocks (const uc_block_t **blocks, size_t *count);
607 /* ========================================================================= */
609 /* Properties taken from language standards. */
611 /* Test whether a Unicode character is considered whitespace in ISO C 99. */
613 uc_is_c_whitespace (ucs4_t uc);
615 /* Test whether a Unicode character is considered whitespace in Java. */
617 uc_is_java_whitespace (ucs4_t uc);
621 UC_IDENTIFIER_START, /* valid as first or subsequent character */
622 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
623 UC_IDENTIFIER_INVALID, /* not valid */
624 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
627 /* Return the categorization of a Unicode character w.r.t. the ISO C 99
628 identifier syntax. */
630 uc_c_ident_category (ucs4_t uc);
632 /* Return the categorization of a Unicode character w.r.t. the Java
633 identifier syntax. */
635 uc_java_ident_category (ucs4_t uc);
637 /* ========================================================================= */
639 /* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated,
640 because this set of functions was designed with ASCII in mind and cannot
641 reflect the more diverse reality of the Unicode character set. But they
642 can be a quick-and-dirty porting aid when migrating from wchar_t APIs
643 to Unicode strings. */
645 /* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true. */
647 uc_is_alnum (ucs4_t uc);
649 /* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
650 or any character that is one of a locale-specific set of characters for
651 which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
654 uc_is_alpha (ucs4_t uc);
656 /* Test for any control character. */
658 uc_is_cntrl (ucs4_t uc);
660 /* Test for any character that corresponds to a decimal-digit character. */
662 uc_is_digit (ucs4_t uc);
664 /* Test for any character for which 'uc_is_print' is true and 'uc_is_space' is false. */
667 uc_is_graph (ucs4_t uc);
669 /* Test for any character that corresponds to a lowercase letter or is one
670 of a locale-specific set of characters for which none of 'uc_is_cntrl',
671 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
673 uc_is_lower (ucs4_t uc);
675 /* Test for any printing character. */
677 uc_is_print (ucs4_t uc);
679 /* Test for any printing character that is one of a locale-specific set of
680 characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true. */
682 uc_is_punct (ucs4_t uc);
684 /* Test for any character that corresponds to a locale-specific set of
685 characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct' is true. */
688 uc_is_space (ucs4_t uc);
690 /* Test for any character that corresponds to an uppercase letter or is one
691 of a locale-specific set of characters for which none of 'uc_is_cntrl',
692 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
694 uc_is_upper (ucs4_t uc);
696 /* Test for any character that corresponds to a hexadecimal-digit character
697 equivalent to that performed by the functions described in the previous subclause. */
700 uc_is_xdigit (ucs4_t uc);
703 /* Test for any character that corresponds to a standard blank character or
704 a locale-specific set of characters for which 'uc_is_alnum' is false. */
706 uc_is_blank (ucs4_t uc);
708 /* ========================================================================= */
714 #endif /* _UNICTYPE_H */