X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Funicase%2Fu-ct-totitle.h;h=8892a5df4b8d48e21d740373e461f5325e296799;hb=fa8d786628cd8a1dbd2194e7776bea07f97cca99;hp=181e569bc0662779da727aef805740560aec6e5c;hpb=db3758fddfc8971fd52a3d82741df9a412a7cc59;p=gnulib.git diff --git a/lib/unicase/u-ct-totitle.h b/lib/unicase/u-ct-totitle.h index 181e569bc..8892a5df4 100644 --- a/lib/unicase/u-ct-totitle.h +++ b/lib/unicase/u-ct-totitle.h @@ -1,5 +1,5 @@ /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent). - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible , 2009. This program is free software: you can redistribute it and/or modify it @@ -55,10 +55,10 @@ FUNC (const UNIT *s, size_t n, { wordbreaks = (char *) malloc (n); if (wordbreaks == NULL) - { - errno = ENOMEM; - goto fail2; - } + { + errno = ENOMEM; + goto fail2; + } U_WORDBREAKS (s, n, wordbreaks); } else @@ -70,12 +70,12 @@ FUNC (const UNIT *s, size_t n, /* When considering the string as segmented by word boundaries: For each such segment: - - In the first part, we are searching for the first cased character. - In this state, in_word_first_part = true, and no conversion takes - place. - - In the second part, we are converting every character: the first - among these characters to title case, the other ones to lower case. - In this state, in_word_first_part = false. */ + - In the first part, we are searching for the first cased character. + In this state, in_word_first_part = true, and no conversion takes + place. + - In the second part, we are converting every character: the first + among these characters to title case, the other ones to lower case. + In this state, in_word_first_part = false. */ bool in_word_first_part = true; /* Helper for evaluating the FINAL_SIGMA condition: @@ -90,354 +90,356 @@ FUNC (const UNIT *s, size_t n, while (s < s_end) { - /* Fetch the next character. */ - ucs4_t uc; - int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); - - ucs4_t (*single_character_map) (ucs4_t); - size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ - - ucs4_t mapped_uc[3]; - unsigned int mapped_count; - - if (*wp) - /* Crossing a word boundary. */ - in_word_first_part = true; - - /* Determine single_character_map, offset_in_rule. - There are three possibilities: - - uc should not be converted. - - uc should be titlecased. - - uc should be lowercased. */ - if (in_word_first_part) - { - if (uc_is_cased (uc)) - { - /* uc is to be titlecased. */ - single_character_map = uc_totitle; - offset_in_rule = offsetof (struct special_casing_rule, title[0]); - in_word_first_part = false; - } - else - { - /* uc is not converted. */ - single_character_map = NULL; - offset_in_rule = 0; - } - } - else - { - /* uc is to be lowercased. */ - single_character_map = uc_tolower; - offset_in_rule = offsetof (struct special_casing_rule, lower[0]); - } - - /* Actually map uc. */ - if (single_character_map == NULL) - { - mapped_uc[0] = uc; - mapped_count = 1; - goto found_mapping; - } - - if (uc < 0x10000) - { - /* Look first in the special-casing table. */ - char code[3]; - - code[0] = (uc >> 8) & 0xff; - code[1] = uc & 0xff; - - for (code[2] = 0; ; code[2]++) - { - const struct special_casing_rule *rule = - gl_unicase_special_lookup (code, 3); - - if (rule == NULL) - break; - - /* Test if the condition applies. */ - /* Does the language apply? */ - if (rule->language[0] == '\0' - || (iso639_language != NULL - && iso639_language[0] == rule->language[0] - && iso639_language[1] == rule->language[1])) - { - /* Does the context apply? */ - int context = rule->context; - bool applies; - - if (context < 0) - context = - context; - switch (context) - { - case SCC_ALWAYS: - applies = true; - break; - - case SCC_FINAL_SIGMA: - /* "Before" condition: preceded by a sequence - consisting of a cased letter and a case-ignorable - sequence. - "After" condition: not followed by a sequence - consisting of a case-ignorable sequence and then a - cased letter. */ - /* Test the "before" condition. */ - applies = uc_is_cased (last_char_except_ignorable); - /* Test the "after" condition. */ - if (applies) - { - const UNIT *s2 = s + count; - for (;;) - { - if (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - if (uc_is_cased (uc2)) - { - applies = false; - break; - } - if (!uc_is_case_ignorable (uc2)) - break; - s2 += count2; - } - else - { - applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0); - break; - } - } - } - break; - - case SCC_AFTER_SOFT_DOTTED: - /* "Before" condition: There is a Soft_Dotted character - before it, with no intervening character of - combining class 0 or 230 (Above). */ - /* Test the "before" condition. */ - applies = uc_is_property_soft_dotted (last_char_normal_or_above); - break; - - case SCC_MORE_ABOVE: - /* "After" condition: followed by a character of - combining class 230 (Above) with no intervening - character of combining class 0 or 230 (Above). */ - /* Test the "after" condition. */ - { - const UNIT *s2 = s + count; - applies = false; - for (;;) - { - if (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - int ccc = uc_combining_class (uc2); - if (ccc == UC_CCC_A) - { - applies = true; - break; - } - if (ccc == UC_CCC_NR) - break; - s2 += count2; - } - else - { - applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); - break; - } - } - } - break; - - case SCC_BEFORE_DOT: - /* "After" condition: followed by COMBINING DOT ABOVE - (U+0307). Any sequence of characters with a - combining class that is neither 0 nor 230 may - intervene between the current character and the - combining dot above. */ - /* Test the "after" condition. */ - { - const UNIT *s2 = s + count; - applies = false; - for (;;) - { - if (s2 < s_end) - { - ucs4_t uc2; - int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); - if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ - { - applies = true; - break; - } - { - int ccc = uc_combining_class (uc2); - if (ccc == UC_CCC_A || ccc == UC_CCC_NR) - break; - } - s2 += count2; - } - else - { - applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); - break; - } - } - } - break; - - case SCC_AFTER_I: - /* "Before" condition: There is an uppercase I before - it, and there is no intervening character of - combining class 0 or 230 (Above). */ - /* Test the "before" condition. */ - applies = (last_char_normal_or_above == 'I'); - break; - - default: - abort (); - } - if (rule->context < 0) - applies = !applies; - - if (applies) - { - /* The rule applies. - Look up the mapping (0 to 3 characters). */ - const unsigned short *mapped_in_rule = - (const unsigned short *)((const char *)rule + offset_in_rule); - - if (mapped_in_rule[0] == 0) - mapped_count = 0; - else - { - mapped_uc[0] = mapped_in_rule[0]; - if (mapped_in_rule[1] == 0) - mapped_count = 1; - else - { - mapped_uc[1] = mapped_in_rule[1]; - if (mapped_in_rule[2] == 0) - mapped_count = 2; - else - { - mapped_uc[2] = mapped_in_rule[2]; - mapped_count = 3; - } - } - } - goto found_mapping; - } - } - - /* Optimization: Save a hash table lookup in the next round. */ - if (!rule->has_next) - break; - } - } - - /* No special-cased mapping. So use the locale and context independent - mapping. */ - mapped_uc[0] = single_character_map (uc); - mapped_count = 1; + /* Fetch the next character. */ + ucs4_t uc; + int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); + + ucs4_t (*single_character_map) (ucs4_t); + size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ + + ucs4_t mapped_uc[3]; + unsigned int mapped_count; + + if (*wp) + /* Crossing a word boundary. */ + in_word_first_part = true; + + /* Determine single_character_map, offset_in_rule. + There are three possibilities: + - uc should not be converted. + - uc should be titlecased. + - uc should be lowercased. */ + if (in_word_first_part) + { + if (uc_is_cased (uc)) + { + /* uc is to be titlecased. */ + single_character_map = uc_totitle; + offset_in_rule = offsetof (struct special_casing_rule, title[0]); + in_word_first_part = false; + } + else + { + /* uc is not converted. */ + single_character_map = NULL; + offset_in_rule = 0; + } + } + else + { + /* uc is to be lowercased. */ + single_character_map = uc_tolower; + offset_in_rule = offsetof (struct special_casing_rule, lower[0]); + } + + /* Actually map uc. */ + if (single_character_map == NULL) + { + mapped_uc[0] = uc; + mapped_count = 1; + goto found_mapping; + } + + if (uc < 0x10000) + { + /* Look first in the special-casing table. */ + char code[3]; + + code[0] = (uc >> 8) & 0xff; + code[1] = uc & 0xff; + + for (code[2] = 0; ; code[2]++) + { + const struct special_casing_rule *rule = + gl_unicase_special_lookup (code, 3); + + if (rule == NULL) + break; + + /* Test if the condition applies. */ + /* Does the language apply? */ + if (rule->language[0] == '\0' + || (iso639_language != NULL + && iso639_language[0] == rule->language[0] + && iso639_language[1] == rule->language[1])) + { + /* Does the context apply? */ + int context = rule->context; + bool applies; + + if (context < 0) + context = - context; + switch (context) + { + case SCC_ALWAYS: + applies = true; + break; + + case SCC_FINAL_SIGMA: + /* "Before" condition: preceded by a sequence + consisting of a cased letter and a case-ignorable + sequence. + "After" condition: not followed by a sequence + consisting of a case-ignorable sequence and then a + cased letter. */ + /* Test the "before" condition. */ + applies = uc_is_cased (last_char_except_ignorable); + /* Test the "after" condition. */ + if (applies) + { + const UNIT *s2 = s + count; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + /* Our uc_is_case_ignorable function is + known to return false for all cased + characters. So we can call + uc_is_case_ignorable first. */ + if (!uc_is_case_ignorable (uc2)) + { + applies = ! uc_is_cased (uc2); + break; + } + s2 += count2; + } + else + { + applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); + break; + } + } + } + break; + + case SCC_AFTER_SOFT_DOTTED: + /* "Before" condition: There is a Soft_Dotted character + before it, with no intervening character of + combining class 0 or 230 (Above). */ + /* Test the "before" condition. */ + applies = uc_is_property_soft_dotted (last_char_normal_or_above); + break; + + case SCC_MORE_ABOVE: + /* "After" condition: followed by a character of + combining class 230 (Above) with no intervening + character of combining class 0 or 230 (Above). */ + /* Test the "after" condition. */ + { + const UNIT *s2 = s + count; + applies = false; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + int ccc = uc_combining_class (uc2); + if (ccc == UC_CCC_A) + { + applies = true; + break; + } + if (ccc == UC_CCC_NR) + break; + s2 += count2; + } + else + { + applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); + break; + } + } + } + break; + + case SCC_BEFORE_DOT: + /* "After" condition: followed by COMBINING DOT ABOVE + (U+0307). Any sequence of characters with a + combining class that is neither 0 nor 230 may + intervene between the current character and the + combining dot above. */ + /* Test the "after" condition. */ + { + const UNIT *s2 = s + count; + applies = false; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ + { + applies = true; + break; + } + { + int ccc = uc_combining_class (uc2); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + break; + } + s2 += count2; + } + else + { + applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); + break; + } + } + } + break; + + case SCC_AFTER_I: + /* "Before" condition: There is an uppercase I before + it, and there is no intervening character of + combining class 0 or 230 (Above). */ + /* Test the "before" condition. */ + applies = (last_char_normal_or_above == 'I'); + break; + + default: + abort (); + } + if (rule->context < 0) + applies = !applies; + + if (applies) + { + /* The rule applies. + Look up the mapping (0 to 3 characters). */ + const unsigned short *mapped_in_rule = + (const unsigned short *)((const char *)rule + offset_in_rule); + + if (mapped_in_rule[0] == 0) + mapped_count = 0; + else + { + mapped_uc[0] = mapped_in_rule[0]; + if (mapped_in_rule[1] == 0) + mapped_count = 1; + else + { + mapped_uc[1] = mapped_in_rule[1]; + if (mapped_in_rule[2] == 0) + mapped_count = 2; + else + { + mapped_uc[2] = mapped_in_rule[2]; + mapped_count = 3; + } + } + } + goto found_mapping; + } + } + + /* Optimization: Save a hash table lookup in the next round. */ + if (!rule->has_next) + break; + } + } + + /* No special-cased mapping. So use the locale and context independent + mapping. */ + mapped_uc[0] = single_character_map (uc); + mapped_count = 1; found_mapping: - /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ - { - unsigned int i; - - for (i = 0; i < mapped_count; i++) - { - ucs4_t muc = mapped_uc[i]; - - /* Append muc to the result accumulator. */ - if (length < allocated) - { - int ret = U_UCTOMB (result + length, muc, allocated - length); - if (ret == -1) - { - errno = EINVAL; - goto fail1; - } - if (ret >= 0) - { - length += ret; - goto done_appending; - } - } - { - size_t old_allocated = allocated; - size_t new_allocated = 2 * old_allocated; - if (new_allocated < 64) - new_allocated = 64; - if (new_allocated < old_allocated) /* integer overflow? */ - abort (); - { - UNIT *larger_result; - if (result == NULL) - { - larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - } - else if (result == resultbuf) - { - larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - U_CPY (larger_result, resultbuf, length); - } - else - { - larger_result = - (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); - if (larger_result == NULL) - { - errno = ENOMEM; - goto fail1; - } - } - result = larger_result; - allocated = new_allocated; - { - int ret = U_UCTOMB (result + length, muc, allocated - length); - if (ret == -1) - { - errno = EINVAL; - goto fail1; - } - if (ret < 0) - abort (); - length += ret; - goto done_appending; - } - } - } - done_appending: ; - } - } - - if (!uc_is_case_ignorable (uc)) - last_char_except_ignorable = uc; - - { - int ccc = uc_combining_class (uc); - if (ccc == UC_CCC_A || ccc == UC_CCC_NR) - last_char_normal_or_above = uc; - } - - s += count; - wp += count; + /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ + { + unsigned int i; + + for (i = 0; i < mapped_count; i++) + { + ucs4_t muc = mapped_uc[i]; + + /* Append muc to the result accumulator. */ + if (length < allocated) + { + int ret = U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail1; + } + if (ret >= 0) + { + length += ret; + goto done_appending; + } + } + { + size_t old_allocated = allocated; + size_t new_allocated = 2 * old_allocated; + if (new_allocated < 64) + new_allocated = 64; + if (new_allocated < old_allocated) /* integer overflow? */ + abort (); + { + UNIT *larger_result; + if (result == NULL) + { + larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + } + else if (result == resultbuf) + { + larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + U_CPY (larger_result, resultbuf, length); + } + else + { + larger_result = + (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + } + result = larger_result; + allocated = new_allocated; + { + int ret = U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail1; + } + if (ret < 0) + abort (); + length += ret; + goto done_appending; + } + } + } + done_appending: ; + } + } + + if (!uc_is_case_ignorable (uc)) + last_char_except_ignorable = uc; + + { + int ccc = uc_combining_class (uc); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + last_char_normal_or_above = uc; + } + + s += count; + wp += count; } } @@ -450,7 +452,7 @@ FUNC (const UNIT *s, size_t n, normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); if (normalized_result == NULL) - goto fail2; + goto fail2; free (result); return normalized_result; @@ -459,15 +461,15 @@ FUNC (const UNIT *s, size_t n, if (length == 0) { if (result == NULL) - { - /* Return a non-NULL value. NULL means error. */ - result = (UNIT *) malloc (1); - if (result == NULL) - { - errno = ENOMEM; - goto fail2; - } - } + { + /* Return a non-NULL value. NULL means error. */ + result = (UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + goto fail2; + } + } } else if (result != resultbuf && length < allocated) { @@ -476,7 +478,7 @@ FUNC (const UNIT *s, size_t n, memory = (UNIT *) realloc (result, length * sizeof (UNIT)); if (memory != NULL) - result = memory; + result = memory; } *lengthp = length;