1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 /* Quoting the Unicode standard, section "Default Case Algorithms":
19 Find the word boundaries in X according to Unicode Standard Annex #29,
20 “Text Boundaries.” For each word boundary, find the first cased character
21 F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
22 then map all characters C between F and the following word boundary to
23 Lowercase_Mapping(C). */
26 FUNC (const UNIT *s, size_t n,
27 casing_prefix_context_t prefix_context,
28 casing_suffix_context_t suffix_context,
29 const char *iso639_language,
31 UNIT *resultbuf, size_t *lengthp)
33 /* The result being accumulated. */
37 /* An array containing the word break positions. */
40 /* Initialize the accumulator. */
41 if (nf != NULL || resultbuf == NULL)
53 /* Initialize the word breaks array. */
56 wordbreaks = (char *) malloc (n);
57 if (wordbreaks == NULL)
62 U_WORDBREAKS (s, n, wordbreaks);
68 const UNIT *s_end = s + n;
69 const char *wp = wordbreaks;
71 /* When considering the string as segmented by word boundaries: For each such segment:
73 - In the first part, we are searching for the first cased character.
74 In this state, in_word_first_part = true, and no conversion takes place.
76 - In the second part, we are converting every character: the first
77 among these characters to title case, the other ones to lower case.
78 In this state, in_word_first_part = false. */
79 bool in_word_first_part = true;
81 /* Helper for evaluating the FINAL_SIGMA condition:
82 Last character that was not case-ignorable. */
83 ucs4_t last_char_except_ignorable =
84 prefix_context.last_char_except_ignorable;
86 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87 Last character that was of combining class 230 ("Above") or 0. */
88 ucs4_t last_char_normal_or_above =
89 prefix_context.last_char_normal_or_above;
93 /* Fetch the next character. */
95 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
97 ucs4_t (*single_character_map) (ucs4_t);
98 size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
101 unsigned int mapped_count;
104 /* Crossing a word boundary. */
105 in_word_first_part = true;
107 /* Determine single_character_map, offset_in_rule.
108 There are three possibilities:
109 - uc should not be converted.
110 - uc should be titlecased.
111 - uc should be lowercased. */
112 if (in_word_first_part)
114 if (uc_is_cased (uc))
116 /* uc is to be titlecased. */
117 single_character_map = uc_totitle;
118 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
119 in_word_first_part = false;
123 /* uc is not converted. */
124 single_character_map = NULL;
130 /* uc is to be lowercased. */
131 single_character_map = uc_tolower;
132 offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
135 /* Actually map uc. */
136 if (single_character_map == NULL)
145 /* Look first in the special-casing table. */
148 code[0] = (uc >> 8) & 0xff;
151 for (code[2] = 0; ; code[2]++)
153 const struct special_casing_rule *rule =
154 gl_unicase_special_lookup (code, 3);
159 /* Test if the condition applies. */
160 /* Does the language apply? */
161 if (rule->language[0] == '\0'
162 || (iso639_language != NULL
163 && iso639_language[0] == rule->language[0]
164 && iso639_language[1] == rule->language[1]))
166 /* Does the context apply? */
167 int context = rule->context;
178 case SCC_FINAL_SIGMA:
179 /* "Before" condition: preceded by a sequence
180 consisting of a cased letter and a case-ignorable sequence.
182 "After" condition: not followed by a sequence
183 consisting of a case-ignorable sequence and then a cased letter. */
185 /* Test the "before" condition. */
186 applies = uc_is_cased (last_char_except_ignorable);
187 /* Test the "after" condition. */
190 const UNIT *s2 = s + count;
196 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
197 if (uc_is_cased (uc2))
202 if (!uc_is_case_ignorable (uc2))
208 applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
215 case SCC_AFTER_SOFT_DOTTED:
216 /* "Before" condition: There is a Soft_Dotted character
217 before it, with no intervening character of
218 combining class 0 or 230 (Above). */
219 /* Test the "before" condition. */
220 applies = uc_is_property_soft_dotted (last_char_normal_or_above);
224 /* "After" condition: followed by a character of
225 combining class 230 (Above) with no intervening
226 character of combining class 0 or 230 (Above). */
227 /* Test the "after" condition. */
229 const UNIT *s2 = s + count;
236 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
237 int ccc = uc_combining_class (uc2);
243 if (ccc == UC_CCC_NR)
249 applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
257 /* "After" condition: followed by COMBINING DOT ABOVE
258 (U+0307). Any sequence of characters with a
259 combining class that is neither 0 nor 230 may
260 intervene between the current character and the
261 combining dot above. */
262 /* Test the "after" condition. */
264 const UNIT *s2 = s + count;
271 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
272 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
278 int ccc = uc_combining_class (uc2);
279 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
286 applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
294 /* "Before" condition: There is an uppercase I before
295 it, and there is no intervening character of
296 combining class 0 or 230 (Above). */
297 /* Test the "before" condition. */
298 applies = (last_char_normal_or_above == 'I');
304 if (rule->context < 0)
310 /* The rule applies. Look up the mapping (0 to 3 characters). */
311 const unsigned short *mapped_in_rule =
312 (const unsigned short *)((const char *)rule + offset_in_rule);
314 if (mapped_in_rule[0] == 0)
318 mapped_uc[0] = mapped_in_rule[0];
319 if (mapped_in_rule[1] == 0)
323 mapped_uc[1] = mapped_in_rule[1];
324 if (mapped_in_rule[2] == 0)
328 mapped_uc[2] = mapped_in_rule[2];
337 /* Optimization: Save a hash table lookup in the next round. */
343 /* No special-cased mapping. So use the locale and context independent mapping. */
345 mapped_uc[0] = single_character_map (uc);
349 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
353 for (i = 0; i < mapped_count; i++)
355 ucs4_t muc = mapped_uc[i];
357 /* Append muc to the result accumulator. */
358 if (length < allocated)
360 int ret = U_UCTOMB (result + length, muc, allocated - length);
373 size_t old_allocated = allocated;
374 size_t new_allocated = 2 * old_allocated;
375 if (new_allocated < 64)
377 if (new_allocated < old_allocated) /* integer overflow? */
383 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
384 if (larger_result == NULL)
390 else if (result == resultbuf)
392 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
393 if (larger_result == NULL)
398 U_CPY (larger_result, resultbuf, length);
403 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
404 if (larger_result == NULL)
410 result = larger_result;
411 allocated = new_allocated;
413 int ret = U_UCTOMB (result + length, muc, allocated - length);
430 if (!uc_is_case_ignorable (uc))
431 last_char_except_ignorable = uc;
434 int ccc = uc_combining_class (uc);
435 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
436 last_char_normal_or_above = uc;
448 /* Finally, normalize the result. */
449 UNIT *normalized_result;
451 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
452 if (normalized_result == NULL)
456 return normalized_result;
463 /* Return a non-NULL value. NULL means error. */
464 result = (UNIT *) malloc (1);
472 else if (result != resultbuf && length < allocated)
474 /* Shrink the allocated memory if possible. */
477 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
487 int saved_errno = errno;
492 if (result != resultbuf)
494 int saved_errno = errno;