1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 /* Quoting the Unicode standard:
19 Definition: A character is defined to be "cased" if it has the Lowercase or
20 Uppercase property or has a General_Category value of Titlecase_Letter. */
/* The expression below is true when UC satisfies any one of the three
   alternatives of the definition quoted above: the Lowercase property, the
   Uppercase property, or general category Titlecase_Letter.
   NOTE(review): the embedded numbering jumps from 20 to 24, so the header
   of the enclosing function is not visible in this listing.  It is
   presumably the is_cased () predicate that the FINAL_SIGMA "before"
   test calls further down -- TODO confirm against the upstream file.  */
24 return (uc_is_property_lowercase (uc)
25 || uc_is_property_uppercase (uc)
26 || uc_is_general_category (uc, UC_TITLECASE_LETTER));
29 /* Quoting the Unicode standard:
30 Definition: A character is defined to be "case-ignorable" if it has the
31 value MidLetter {or the value MidNumLet} for the Word_Break property or
32 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
33 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
34 The text marked in braces was added in Unicode 5.1.0, see
35 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
36 Definition of case-ignorable". */
/* Tests whether UC is "case-ignorable" in the sense quoted above.
   UC is the code point to classify.
   The result is true when the Word_Break property of UC is MidLetter or
   MidNumLet, or when the general category of UC is in the mask passed to
   uc_is_general_category_withtable ().
   NOTE(review): the return type and the braces are not visible in this
   listing (the embedded numbering skips lines 37, 39 and 41).  */
38 is_case_ignorable (ucs4_t uc)
/* Word_Break property value of UC, looked up once and compared twice.  */
40 int wbp = uc_wordbreak_property (uc);
/* First the two Word_Break alternatives, then the general category test.  */
42 return (wbp == WBP_MIDLETTER || wbp == WBP_MIDNUMLET
43 || uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn
/* NOTE(review): the embedded numbering skips lines 44-46 here.  The quoted
   definition lists Me, Cf and Lm between Mn and Sk, so the masks for those
   three categories are presumably on the skipped lines -- TODO confirm
   against the upstream file before relying on this listing.  */
47 | UC_CATEGORY_MASK_Sk));
50 /* Quoting the Unicode standard, section "Default Case Algorithms":
51 Find the word boundaries in X according to Unicode Standard Annex #29,
52 “Text Boundaries.” For each word boundary, find the first cased character
53 F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
54 then map all characters C between F and the following word boundary to
55 Lowercase_Mapping(C). */
58 FUNC (const UNIT *s, size_t n, const char *iso639_language,
60 UNIT *resultbuf, size_t *lengthp)
62 /* The result being accumulated. */
66 /* An array containing the word break positions. */
69 /* Initialize the accumulator. */
70 if (nf != NULL || resultbuf == NULL)
82 /* Initialize the word breaks array. */
85 wordbreaks = (char *) malloc (n);
86 if (wordbreaks == NULL)
91 U_WORDBREAKS (s, n, wordbreaks);
97 const UNIT *s_end = s + n;
98 const char *wp = wordbreaks;
100 /* When considering the string as segmented by word boundaries: For each
102 - In the first part, we are searching for the first cased character.
103 In this state, in_word_first_part = true, and no conversion takes
105 - In the second part, we are converting every character: the first
106 among these characters to title case, the other ones to lower case.
107 In this state, in_word_first_part = false. */
108 bool in_word_first_part = true;
110 /* Helper for evaluating the FINAL_SIGMA condition:
111 Last character that was not case-ignorable. */
112 ucs4_t last_char_except_ignorable = 0xFFFD;
114 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
115 Last character that was of combining class 230 ("Above") or 0. */
116 ucs4_t last_char_normal_or_above = 0xFFFD;
120 /* Fetch the next character. */
122 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
124 ucs4_t (*single_character_map) (ucs4_t);
125 size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
128 unsigned int mapped_count;
131 /* Crossing a word boundary. */
132 in_word_first_part = true;
134 /* Determine single_character_map, offset_in_rule.
135 There are three possibilities:
136 - uc should not be converted.
137 - uc should be titlecased.
138 - uc should be lowercased. */
139 if (in_word_first_part)
143 /* uc is to be titlecased. */
144 single_character_map = uc_totitle;
145 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
146 in_word_first_part = false;
150 /* uc is not converted. */
151 single_character_map = NULL;
157 /* uc is to be lowercased. */
158 single_character_map = uc_tolower;
159 offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
162 /* Actually map uc. */
163 if (single_character_map == NULL)
172 /* Look first in the special-casing table. */
175 code[0] = (uc >> 8) & 0xff;
178 for (code[2] = 0; ; code[2]++)
180 const struct special_casing_rule *rule =
181 gl_unicase_special_lookup (code, 3);
186 /* Test if the condition applies. */
187 /* Does the language apply? */
188 if (rule->language[0] == '\0'
189 || (iso639_language != NULL
190 && iso639_language[0] == rule->language[0]
191 && iso639_language[1] == rule->language[1]))
193 /* Does the context apply? */
194 int context = rule->context;
205 case SCC_FINAL_SIGMA:
206 /* "Before" condition: preceded by a sequence
207 consisting of a cased letter and a case-ignorable
209 "After" condition: not followed by a sequence
210 consisting of a case-ignorable sequence and then a
212 /* Test the "before" condition. */
213 applies = is_cased (last_char_except_ignorable);
214 /* Test the "after" condition. */
217 const UNIT *s2 = s + count;
221 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
227 if (!is_case_ignorable (uc2))
234 case SCC_AFTER_SOFT_DOTTED:
235 /* "Before" condition: There is a Soft_Dotted character
236 before it, with no intervening character of
237 combining class 0 or 230 (Above). */
238 /* Test the "before" condition. */
239 applies = uc_is_property_soft_dotted (last_char_normal_or_above);
243 /* "After" condition: followed by a character of
244 combining class 230 (Above) with no intervening
245 character of combining class 0 or 230 (Above). */
246 /* Test the "after" condition. */
248 const UNIT *s2 = s + count;
253 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
254 int ccc = uc_combining_class (uc2);
260 if (ccc == UC_CCC_NR)
268 /* "After" condition: followed by COMBINING DOT ABOVE
269 (U+0307). Any sequence of characters with a
270 combining class that is neither 0 nor 230 may
271 intervene between the current character and the
272 combining dot above. */
273 /* Test the "after" condition. */
275 const UNIT *s2 = s + count;
280 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
281 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
287 int ccc = uc_combining_class (uc2);
288 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
297 /* "Before" condition: There is an uppercase I before
298 it, and there is no intervening character of
299 combining class 0 or 230 (Above). */
300 /* Test the "before" condition. */
301 applies = (last_char_normal_or_above == 'I');
307 if (rule->context < 0)
313 Look up the mapping (0 to 3 characters). */
314 const unsigned short *mapped_in_rule =
315 (const unsigned short *)((const char *)rule + offset_in_rule);
317 if (mapped_in_rule[0] == 0)
321 mapped_uc[0] = mapped_in_rule[0];
322 if (mapped_in_rule[1] == 0)
326 mapped_uc[1] = mapped_in_rule[1];
327 if (mapped_in_rule[2] == 0)
331 mapped_uc[2] = mapped_in_rule[2];
340 /* Optimization: Save a hash table lookup in the next round. */
346 /* No special-cased mapping. So use the locale and context independent
348 mapped_uc[0] = single_character_map (uc);
352 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
356 for (i = 0; i < mapped_count; i++)
358 ucs4_t muc = mapped_uc[i];
360 /* Append muc to the result accumulator. */
361 if (length < allocated)
363 int ret = U_UCTOMB (result + length, muc, allocated - length);
376 size_t old_allocated = allocated;
377 size_t new_allocated = 2 * old_allocated;
378 if (new_allocated < 64)
380 if (new_allocated < old_allocated) /* integer overflow? */
386 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
387 if (larger_result == NULL)
393 else if (result == resultbuf)
395 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
396 if (larger_result == NULL)
401 U_CPY (larger_result, resultbuf, length);
406 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
407 if (larger_result == NULL)
413 result = larger_result;
414 allocated = new_allocated;
416 int ret = U_UCTOMB (result + length, muc, allocated - length);
433 if (!is_case_ignorable (uc))
434 last_char_except_ignorable = uc;
437 int ccc = uc_combining_class (uc);
438 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
439 last_char_normal_or_above = uc;
451 /* Finally, normalize the result. */
452 UNIT *normalized_result;
454 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
455 if (normalized_result == NULL)
459 return normalized_result;
466 /* Return a non-NULL value. NULL means error. */
467 result = (UNIT *) malloc (1);
475 else if (result != resultbuf && length < allocated)
477 /* Shrink the allocated memory if possible. */
480 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
490 int saved_errno = errno;
495 if (result != resultbuf)
497 int saved_errno = errno;