acl: allow cross-compilation to Gentoo
[gnulib.git] / lib / unicase / u-totitle.h
index a72e4db..4b94540 100644 (file)
@@ -1,5 +1,5 @@
 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
-   Copyright (C) 2009 Free Software Foundation, Inc.
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2009.
 
    This program is free software: you can redistribute it and/or modify it
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
-/* Quoting the Unicode standard:
-     Definition: A character is defined to be "cased" if it has the Lowercase or
-     Uppercase property or has a General_Category value of Titlecase_Letter.  */
-static inline bool
-is_cased (ucs4_t uc)
-{
-  return (uc_is_property_lowercase (uc)
-         || uc_is_property_uppercase (uc)
-         || uc_is_general_category (uc, UC_TITLECASE_LETTER));
-}
-
-/* Quoting the Unicode standard:
-     Definition: A character is defined to be "case-ignorable" if it has the
-     value MidLetter {or the value MidNumLet} for the Word_Break property or
-     its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
-     Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
-   The text marked in braces was added in Unicode 5.1.0, see
-   <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
-   Definition of case-ignorable".   */
-static inline bool
-is_case_ignorable (ucs4_t uc)
-{
-  int wbp = uc_wordbreak_property (uc);
-
-  return (wbp == WBP_MIDLETTER || wbp == WBP_MIDNUMLET
-         || uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn
-                                                  | UC_CATEGORY_MASK_Me
-                                                  | UC_CATEGORY_MASK_Cf
-                                                  | UC_CATEGORY_MASK_Lm
-                                                  | UC_CATEGORY_MASK_Sk));
-}
-
-/* Quoting the Unicode standard, section "Default Case Algorithms":
-     Find the word boundaries in X according to Unicode Standard Annex #29,
-     “Text Boundaries.” For each word boundary, find the first cased character
-     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
-     then map all characters C between F and the following word boundary to
-     Lowercase_Mapping(C).  */
-
 UNIT *
 FUNC (const UNIT *s, size_t n, const char *iso639_language,
       uninorm_t nf,
       UNIT *resultbuf, size_t *lengthp)
 {
-  /* The result being accumulated.  */
-  UNIT *result;
-  size_t length;
-  size_t allocated;
-  /* An array containing the word break positions.  */
-  char *wordbreaks;
-
-  /* Initialize the accumulator.  */
-  if (nf != NULL || resultbuf == NULL)
-    {
-      result = NULL;
-      allocated = 0;
-    }
-  else
-    {
-      result = resultbuf;
-      allocated = *lengthp;
-    }
-  length = 0;
-
-  /* Initialize the word breaks array.  */
-  if (n > 0)
-    {
-      wordbreaks = (char *) malloc (n);
-      if (wordbreaks == NULL)
-       {
-         errno = ENOMEM;
-         goto fail2;
-       }
-      U_WORDBREAKS (s, n, wordbreaks);
-    }
-  else
-    wordbreaks = NULL;
-
-  {
-    const UNIT *s_end = s + n;
-    const char *wp = wordbreaks;
-
-    /* When considering the string as segmented by word boundaries: For each
-       such segment:
-       - In the first part, we are searching for the first cased character.
-         In this state, in_word_first_part = true, and no conversion takes
-         place.
-       - In the second part, we are converting every character: the first
-         among these characters to title case, the other ones to lower case.
-         In this state, in_word_first_part = false.  */
-    bool in_word_first_part = true;
-
-    /* Helper for evaluating the FINAL_SIGMA condition:
-       Last character that was not case-ignorable.  */
-    ucs4_t last_char_except_ignorable = 0xFFFD;
-
-    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
-       Last character that was of combining class 230 ("Above") or 0.  */
-    ucs4_t last_char_normal_or_above = 0xFFFD;
-
-    while (s < s_end)
-      {
-       /* Fetch the next character.  */
-       ucs4_t uc;
-       int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
-
-       ucs4_t (*single_character_map) (ucs4_t);
-       size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
-
-       ucs4_t mapped_uc[3];
-       unsigned int mapped_count;
-
-       if (*wp)
-         /* Crossing a word boundary.  */
-         in_word_first_part = true;
-
-       /* Determine single_character_map, offset_in_rule.
-          There are three possibilities:
-            - uc should not be converted.
-            - uc should be titlecased.
-            - uc should be lowercased.  */
-       if (in_word_first_part)
-         {
-           if (is_cased (uc))
-             {
-               /* uc is to be titlecased.  */
-               single_character_map = uc_totitle;
-               offset_in_rule = offsetof (struct special_casing_rule, title[0]);
-               in_word_first_part = false;
-             }
-           else
-             {
-               /* uc is not converted.  */
-               single_character_map = NULL;
-               offset_in_rule = 0;
-             }
-         }
-       else
-         {
-           /* uc is to be lowercased.  */
-           single_character_map = uc_tolower;
-           offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
-         }
-
-       /* Actually map uc.  */
-       if (single_character_map == NULL)
-         {
-           mapped_uc[0] = uc;
-           mapped_count = 1;
-           goto found_mapping;
-         }
-
-       if (uc < 0x10000)
-         {
-           /* Look first in the special-casing table.  */
-           char code[3];
-
-           code[0] = (uc >> 8) & 0xff;
-           code[1] = uc & 0xff;
-
-           for (code[2] = 0; ; code[2]++)
-             {
-               const struct special_casing_rule *rule =
-                 gl_unicase_special_lookup (code, 3);
-
-               if (rule == NULL)
-                 break;
-
-               /* Test if the condition applies.  */
-               /* Does the language apply?  */
-               if (rule->language[0] == '\0'
-                   || (iso639_language != NULL
-                       && iso639_language[0] == rule->language[0]
-                       && iso639_language[1] == rule->language[1]))
-                 {
-                   /* Does the context apply?  */
-                   int context = rule->context;
-                   bool applies;
-
-                   if (context < 0)
-                     context = - context;
-                   switch (context)
-                     {
-                     case SCC_ALWAYS:
-                       applies = true;
-                       break;
-
-                     case SCC_FINAL_SIGMA:
-                       /* "Before" condition: preceded by a sequence
-                          consisting of a cased letter and a case-ignorable
-                          sequence.
-                          "After" condition: not followed by a sequence
-                          consisting of a case-ignorable sequence and then a
-                          cased letter.  */
-                       /* Test the "before" condition.  */
-                       applies = is_cased (last_char_except_ignorable);
-                       /* Test the "after" condition.  */
-                       if (applies)
-                         {
-                           const UNIT *s2 = s + count;
-                           while (s2 < s_end)
-                             {
-                               ucs4_t uc2;
-                               int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
-                               if (is_cased (uc2))
-                                 {
-                                   applies = false;
-                                   break;
-                                 }
-                               if (!is_case_ignorable (uc2))
-                                 break;
-                               s2 += count2;
-                             }
-                         }
-                       break;
-
-                     case SCC_AFTER_SOFT_DOTTED:
-                       /* "Before" condition: There is a Soft_Dotted character
-                          before it, with no intervening character of
-                          combining class 0 or 230 (Above).  */
-                       /* Test the "before" condition.  */
-                       applies = uc_is_property_soft_dotted (last_char_normal_or_above);
-                       break;
-
-                     case SCC_MORE_ABOVE:
-                       /* "After" condition: followed by a character of
-                          combining class 230 (Above) with no intervening
-                          character of combining class 0 or 230 (Above).  */
-                       /* Test the "after" condition.  */
-                       {
-                         const UNIT *s2 = s + count;
-                         applies = false;
-                         while (s2 < s_end)
-                           {
-                             ucs4_t uc2;
-                             int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
-                             int ccc = uc_combining_class (uc2);
-                             if (ccc == UC_CCC_A)
-                               {
-                                 applies = true;
-                                 break;
-                               }
-                             if (ccc == UC_CCC_NR)
-                               break;
-                             s2 += count2;
-                           }
-                       }
-                       break;
-
-                     case SCC_BEFORE_DOT:
-                       /* "After" condition: followed by COMBINING DOT ABOVE
-                          (U+0307). Any sequence of characters with a
-                          combining class that is neither 0 nor 230 may
-                          intervene between the current character and the
-                          combining dot above.  */
-                       /* Test the "after" condition.  */
-                       {
-                         const UNIT *s2 = s + count;
-                         applies = false;
-                         while (s2 < s_end)
-                           {
-                             ucs4_t uc2;
-                             int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
-                             if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
-                               {
-                                 applies = true;
-                                 break;
-                               }
-                             {
-                               int ccc = uc_combining_class (uc2);
-                               if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
-                                 break;
-                             }
-                             s2 += count2;
-                           }
-                       }
-                       break;
-
-                     case SCC_AFTER_I:
-                       /* "Before" condition: There is an uppercase I before
-                          it, and there is no intervening character of
-                          combining class 0 or 230 (Above).  */
-                       /* Test the "before" condition.  */
-                       applies = (last_char_normal_or_above == 'I');
-                       break;
-
-                     default:
-                       abort ();
-                     }
-                   if (rule->context < 0)
-                     applies = !applies;
-
-                   if (applies)
-                     {
-                       /* The rule applies.
-                          Look up the mapping (0 to 3 characters).  */
-                       const unsigned short *mapped_in_rule =
-                         (const unsigned short *)((const char *)rule + offset_in_rule);
-
-                       if (mapped_in_rule[0] == 0)
-                         mapped_count = 0;
-                       else
-                         {
-                           mapped_uc[0] = mapped_in_rule[0];
-                           if (mapped_in_rule[1] == 0)
-                             mapped_count = 1;
-                           else
-                             {
-                               mapped_uc[1] = mapped_in_rule[1];
-                               if (mapped_in_rule[2] == 0)
-                                 mapped_count = 2;
-                               else
-                                 {
-                                   mapped_uc[2] = mapped_in_rule[2];
-                                   mapped_count = 3;
-                                 }
-                             }
-                         }
-                       goto found_mapping;
-                     }
-                 }
-
-               /* Optimization: Save a hash table lookup in the next round.  */
-               if (!rule->has_next)
-                 break;
-             }
-         }
-
-       /* No special-cased mapping.  So use the locale and context independent
-          mapping.  */
-       mapped_uc[0] = single_character_map (uc);
-       mapped_count = 1;
-
-       found_mapping:
-       /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
-       {
-         unsigned int i;
-
-         for (i = 0; i < mapped_count; i++)
-           {
-             ucs4_t muc = mapped_uc[i];
-
-             /* Append muc to the result accumulator.  */
-             if (length < allocated)
-               {
-                 int ret = U_UCTOMB (result + length, muc, allocated - length);
-                 if (ret == -1)
-                   {
-                     errno = EINVAL;
-                     goto fail1;
-                   }
-                 if (ret >= 0)
-                   {
-                     length += ret;
-                     goto done_appending;
-                   }
-               }
-             {
-               size_t old_allocated = allocated;
-               size_t new_allocated = 2 * old_allocated;
-               if (new_allocated < 64)
-                 new_allocated = 64;
-               if (new_allocated < old_allocated) /* integer overflow? */
-                 abort ();
-               {
-                 UNIT *larger_result;
-                 if (result == NULL)
-                   {
-                     larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
-                     if (larger_result == NULL)
-                       {
-                         errno = ENOMEM;
-                         goto fail1;
-                       }
-                   }
-                 else if (result == resultbuf)
-                   {
-                     larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
-                     if (larger_result == NULL)
-                       {
-                         errno = ENOMEM;
-                         goto fail1;
-                       }
-                     U_CPY (larger_result, resultbuf, length);
-                   }
-                 else
-                   {
-                     larger_result =
-                       (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
-                     if (larger_result == NULL)
-                       {
-                         errno = ENOMEM;
-                         goto fail1;
-                       }
-                   }
-                 result = larger_result;
-                 allocated = new_allocated;
-                 {
-                   int ret = U_UCTOMB (result + length, muc, allocated - length);
-                   if (ret == -1)
-                     {
-                       errno = EINVAL;
-                       goto fail1;
-                     }
-                   if (ret < 0)
-                     abort ();
-                   length += ret;
-                   goto done_appending;
-                 }
-               }
-             }
-            done_appending: ;
-           }
-       }
-
-       if (!is_case_ignorable (uc))
-         last_char_except_ignorable = uc;
-
-       {
-         int ccc = uc_combining_class (uc);
-         if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
-           last_char_normal_or_above = uc;
-       }
-
-       s += count;
-       wp += count;
-      }
-  }
-
-  free (wordbreaks);
-
-  if (nf != NULL)
-    {
-      /* Finally, normalize the result.  */
-      UNIT *normalized_result;
-
-      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
-      if (normalized_result == NULL)
-       goto fail2;
-
-      free (result);
-      return normalized_result;
-    }
-
-  if (length == 0)
-    {
-      if (result == NULL)
-       {
-         /* Return a non-NULL value.  NULL means error.  */
-         result = (UNIT *) malloc (1);
-         if (result == NULL)
-           {
-             errno = ENOMEM;
-             goto fail2;
-           }
-       }
-    }
-  else if (result != resultbuf && length < allocated)
-    {
-      /* Shrink the allocated memory if possible.  */
-      UNIT *memory;
-
-      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
-      if (memory != NULL)
-       result = memory;
-    }
-
-  *lengthp = length;
-  return result;
-
- fail1:
-  {
-    int saved_errno = errno;
-    free (wordbreaks);
-    errno = saved_errno;
-  }
- fail2:
-  if (result != resultbuf)
-    {
-      int saved_errno = errno;
-      free (result);
-      errno = saved_errno;
-    }
-  return NULL;
+  return U_CT_TOTITLE (s, n,
+                       unicase_empty_prefix_context, unicase_empty_suffix_context,
+                       iso639_language,
+                       nf,
+                       resultbuf, lengthp);
 }