From 677440bc23d22bff682c78a3e59f981b0cb66dae Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Tue, 30 Jun 2009 00:13:55 +0200 Subject: [PATCH] Define u8_totitle as a wrapper around u8_ct_totitle. --- ChangeLog | 8 + lib/unicase/u-totitle.h | 484 +-------------------------------------------- lib/unicase/u8-totitle.c | 17 +- modules/unicase/u8-totitle | 20 +- 4 files changed, 17 insertions(+), 512 deletions(-) diff --git a/ChangeLog b/ChangeLog index a25c6b52e..c5686c733 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2009-06-29 Bruno Haible + Define u8_totitle as a wrapper around u8_ct_totitle. + * lib/unicase/u-totitle.h (is_cased, is_case_ignorable): Remove + functions. + (FUNC): Delegate to U_CT_TOTITLE. + * lib/unicase/u8-totitle.c: Update. + * modules/unicase/u8-totitle (Depends-on): Add unicase/u8-ct-totitle, + unicase/empty-prefix-context, unicase/empty-suffix-context. Clean up. + * lib/unicase/u32-tolower.c (u32_tolower): Update u32_casemap invocation. * modules/unicase/u32-tolower (Depends-on): Add diff --git a/lib/unicase/u-totitle.h b/lib/unicase/u-totitle.h index a72e4db44..431f000ae 100644 --- a/lib/unicase/u-totitle.h +++ b/lib/unicase/u-totitle.h @@ -15,488 +15,14 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -/* Quoting the Unicode standard: - Definition: A character is defined to be "cased" if it has the Lowercase or - Uppercase property or has a General_Category value of Titlecase_Letter. 
*/ -static inline bool -is_cased (ucs4_t uc) -{ - return (uc_is_property_lowercase (uc) - || uc_is_property_uppercase (uc) - || uc_is_general_category (uc, UC_TITLECASE_LETTER)); -} - -/* Quoting the Unicode standard: - Definition: A character is defined to be "case-ignorable" if it has the - value MidLetter {or the value MidNumLet} for the Word_Break property or - its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), - Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). - The text marked in braces was added in Unicode 5.1.0, see - <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of - Definition of case-ignorable". */ -static inline bool -is_case_ignorable (ucs4_t uc) -{ - int wbp = uc_wordbreak_property (uc); - - return (wbp == WBP_MIDLETTER || wbp == WBP_MIDNUMLET - || uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn - | UC_CATEGORY_MASK_Me - | UC_CATEGORY_MASK_Cf - | UC_CATEGORY_MASK_Lm - | UC_CATEGORY_MASK_Sk)); -} - -/* Quoting the Unicode standard, section "Default Case Algorithms": - Find the word boundaries in X according to Unicode Standard Annex #29, - “Text Boundaries.” For each word boundary, find the first cased character - F following the word boundary. If F exists, map F to Titlecase_Mapping(F); - then map all characters C between F and the following word boundary to - Lowercase_Mapping(C). */ - UNIT * FUNC (const UNIT *s, size_t n, const char *iso639_language, uninorm_t nf, UNIT *resultbuf, size_t *lengthp) { - /* The result being accumulated. */ - UNIT *result; - size_t length; - size_t allocated; - /* An array containing the word break positions. */ - char *wordbreaks; - - /* Initialize the accumulator. */ - if (nf != NULL || resultbuf == NULL) - { - result = NULL; - allocated = 0; - } - else - { - result = resultbuf; - allocated = *lengthp; - } - length = 0; - - /* Initialize the word breaks array. 
*/ - if (n > 0) - { - wordbreaks = (char *) malloc (n); - if (wordbreaks == NULL) - { - errno = ENOMEM; - goto fail2; - } - U_WORDBREAKS (s, n, wordbreaks); - } - else - wordbreaks = NULL; - - { - const UNIT *s_end = s + n; - const char *wp = wordbreaks; - - /* When considering the string as segmented by word boundaries: For each - such segment: - - In the first part, we are searching for the first cased character. - In this state, in_word_first_part = true, and no conversion takes - place. - - In the second part, we are converting every character: the first - among these characters to title case, the other ones to lower case. - In this state, in_word_first_part = false. */ - bool in_word_first_part = true; - - /* Helper for evaluating the FINAL_SIGMA condition: - Last character that was not case-ignorable. */ - ucs4_t last_char_except_ignorable = 0xFFFD; - - /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: - Last character that was of combining class 230 ("Above") or 0. */ - ucs4_t last_char_normal_or_above = 0xFFFD; - - while (s < s_end) - { - /* Fetch the next character. */ - ucs4_t uc; - int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); - - ucs4_t (*single_character_map) (ucs4_t); - size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ - - ucs4_t mapped_uc[3]; - unsigned int mapped_count; - - if (*wp) - /* Crossing a word boundary. */ - in_word_first_part = true; - - /* Determine single_character_map, offset_in_rule. - There are three possibilities: - - uc should not be converted. - - uc should be titlecased. - - uc should be lowercased. */ - if (in_word_first_part) - { - if (is_cased (uc)) - { - /* uc is to be titlecased. */ - single_character_map = uc_totitle; - offset_in_rule = offsetof (struct special_casing_rule, title[0]); - in_word_first_part = false; - } - else - { - /* uc is not converted. */ - single_character_map = NULL; - offset_in_rule = 0; - } - } - else - { - /* uc is to be lowercased. 
*/ - single_character_map = uc_tolower; - offset_in_rule = offsetof (struct special_casing_rule, lower[0]); - } - - /* Actually map uc. */ - if (single_character_map == NULL) - { - mapped_uc[0] = uc; - mapped_count = 1; - goto found_mapping; - } - - if (uc < 0x10000) - { - /* Look first in the special-casing table. */ - char code[3]; - - code[0] = (uc >> 8) & 0xff; - code[1] = uc & 0xff; - - for (code[2] = 0; ; code[2]++) - { - const struct special_casing_rule *rule = - gl_unicase_special_lookup (code, 3); - - if (rule == NULL) - break; - - /* Test if the condition applies. */ - /* Does the language apply? */ - if (rule->language[0] == '\0' - || (iso639_language != NULL - && iso639_language[0] == rule->language[0] - && iso639_language[1] == rule->language[1])) - { - /* Does the context apply? */ - int context = rule->context; - bool applies; - - if (context < 0) - context = - context; - switch (context) - { - case SCC_ALWAYS: - applies = true; - break; - - case SCC_FINAL_SIGMA: - /* "Before" condition: preceded by a sequence - consisting of a cased letter and a case-ignorable - sequence. - "After" condition: not followed by a sequence - consisting of a case-ignorable sequence and then a - cased letter. */ - /* Test the "before" condition. */ - applies = is_cased (last_char_except_ignorable); - /* Test the "after" condition. */ - if (applies) - { - const UNIT *s2 = s + count; - while (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - if (is_cased (uc2)) - { - applies = false; - break; - } - if (!is_case_ignorable (uc2)) - break; - s2 += count2; - } - } - break; - - case SCC_AFTER_SOFT_DOTTED: - /* "Before" condition: There is a Soft_Dotted character - before it, with no intervening character of - combining class 0 or 230 (Above). */ - /* Test the "before" condition. 
*/ - applies = uc_is_property_soft_dotted (last_char_normal_or_above); - break; - - case SCC_MORE_ABOVE: - /* "After" condition: followed by a character of - combining class 230 (Above) with no intervening - character of combining class 0 or 230 (Above). */ - /* Test the "after" condition. */ - { - const UNIT *s2 = s + count; - applies = false; - while (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - int ccc = uc_combining_class (uc2); - if (ccc == UC_CCC_A) - { - applies = true; - break; - } - if (ccc == UC_CCC_NR) - break; - s2 += count2; - } - } - break; - - case SCC_BEFORE_DOT: - /* "After" condition: followed by COMBINING DOT ABOVE - (U+0307). Any sequence of characters with a - combining class that is neither 0 nor 230 may - intervene between the current character and the - combining dot above. */ - /* Test the "after" condition. */ - { - const UNIT *s2 = s + count; - applies = false; - while (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ - { - applies = true; - break; - } - { - int ccc = uc_combining_class (uc2); - if (ccc == UC_CCC_A || ccc == UC_CCC_NR) - break; - } - s2 += count2; - } - } - break; - - case SCC_AFTER_I: - /* "Before" condition: There is an uppercase I before - it, and there is no intervening character of - combining class 0 or 230 (Above). */ - /* Test the "before" condition. */ - applies = (last_char_normal_or_above == 'I'); - break; - - default: - abort (); - } - if (rule->context < 0) - applies = !applies; - - if (applies) - { - /* The rule applies. - Look up the mapping (0 to 3 characters). 
*/ - const unsigned short *mapped_in_rule = - (const unsigned short *)((const char *)rule + offset_in_rule); - - if (mapped_in_rule[0] == 0) - mapped_count = 0; - else - { - mapped_uc[0] = mapped_in_rule[0]; - if (mapped_in_rule[1] == 0) - mapped_count = 1; - else - { - mapped_uc[1] = mapped_in_rule[1]; - if (mapped_in_rule[2] == 0) - mapped_count = 2; - else - { - mapped_uc[2] = mapped_in_rule[2]; - mapped_count = 3; - } - } - } - goto found_mapping; - } - } - - /* Optimization: Save a hash table lookup in the next round. */ - if (!rule->has_next) - break; - } - } - - /* No special-cased mapping. So use the locale and context independent - mapping. */ - mapped_uc[0] = single_character_map (uc); - mapped_count = 1; - - found_mapping: - /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ - { - unsigned int i; - - for (i = 0; i < mapped_count; i++) - { - ucs4_t muc = mapped_uc[i]; - - /* Append muc to the result accumulator. */ - if (length < allocated) - { - int ret = U_UCTOMB (result + length, muc, allocated - length); - if (ret == -1) - { - errno = EINVAL; - goto fail1; - } - if (ret >= 0) - { - length += ret; - goto done_appending; - } - } - { - size_t old_allocated = allocated; - size_t new_allocated = 2 * old_allocated; - if (new_allocated < 64) - new_allocated = 64; - if (new_allocated < old_allocated) /* integer overflow? 
*/ - abort (); - { - UNIT *larger_result; - if (result == NULL) - { - larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - } - else if (result == resultbuf) - { - larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - U_CPY (larger_result, resultbuf, length); - } - else - { - larger_result = - (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - } - result = larger_result; - allocated = new_allocated; - { - int ret = U_UCTOMB (result + length, muc, allocated - length); - if (ret == -1) - { - errno = EINVAL; - goto fail1; - } - if (ret < 0) - abort (); - length += ret; - goto done_appending; - } - } - } - done_appending: ; - } - } - - if (!is_case_ignorable (uc)) - last_char_except_ignorable = uc; - - { - int ccc = uc_combining_class (uc); - if (ccc == UC_CCC_A || ccc == UC_CCC_NR) - last_char_normal_or_above = uc; - } - - s += count; - wp += count; - } - } - - free (wordbreaks); - - if (nf != NULL) - { - /* Finally, normalize the result. */ - UNIT *normalized_result; - - normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); - if (normalized_result == NULL) - goto fail2; - - free (result); - return normalized_result; - } - - if (length == 0) - { - if (result == NULL) - { - /* Return a non-NULL value. NULL means error. */ - result = (UNIT *) malloc (1); - if (result == NULL) - { - errno = ENOMEM; - goto fail2; - } - } - } - else if (result != resultbuf && length < allocated) - { - /* Shrink the allocated memory if possible. 
*/ - UNIT *memory; - - memory = (UNIT *) realloc (result, length * sizeof (UNIT)); - if (memory != NULL) - result = memory; - } - - *lengthp = length; - return result; - - fail1: - { - int saved_errno = errno; - free (wordbreaks); - errno = saved_errno; - } - fail2: - if (result != resultbuf) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } - return NULL; + return U_CT_TOTITLE (s, n, + unicase_empty_prefix_context, unicase_empty_suffix_context, + iso639_language, + nf, + resultbuf, lengthp); } diff --git a/lib/unicase/u8-totitle.c b/lib/unicase/u8-totitle.c index b51de46ea..cf29c1bab 100644 --- a/lib/unicase/u8-totitle.c +++ b/lib/unicase/u8-totitle.c @@ -20,24 +20,9 @@ /* Specification. */ #include "unicase.h" -#include <errno.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdlib.h> - -#include "unistr.h" -#include "unictype.h" -#include "uniwbrk.h" -#include "uninorm.h" -#include "special-casing.h" - #define FUNC u8_totitle #define UNIT uint8_t -#define U_WORDBREAKS u8_wordbreaks -#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe -#define U_UCTOMB u8_uctomb -#define U_CPY u8_cpy -#define U_NORMALIZE u8_normalize +#define U_CT_TOTITLE u8_ct_totitle #include "u-totitle.h" diff --git a/modules/unicase/u8-totitle b/modules/unicase/u8-totitle index b1cafb066..f576f4d75 100644 --- a/modules/unicase/u8-totitle +++ b/modules/unicase/u8-totitle @@ -7,23 +7,9 @@ lib/unicase/u-totitle.h Depends-on: unicase/base -unicase/special-casing -unicase/totitle -unicase/tolower -uniwbrk/wordbreak-property -uniwbrk/u8-wordbreaks -unictype/category-of -unictype/category-test -unictype/category-Lt -unictype/combining-class -unictype/property-lowercase -unictype/property-uppercase -unictype/property-soft-dotted -unistr/u8-mbtouc-unsafe -unistr/u8-uctomb -unistr/u8-cpy -uninorm/u8-normalize -stdbool +unicase/u8-ct-totitle +unicase/empty-prefix-context +unicase/empty-suffix-context configure.ac: -- 2.11.0