New module 'uninorm/u8-normalize'.
authorBruno Haible <bruno@clisp.org>
Sat, 21 Feb 2009 11:38:14 +0000 (12:38 +0100)
committerBruno Haible <bruno@clisp.org>
Sat, 21 Feb 2009 11:38:14 +0000 (12:38 +0100)
ChangeLog
lib/uninorm/normalize-internal.h [new file with mode: 0644]
lib/uninorm/u-normalize-internal.h [new file with mode: 0644]
lib/uninorm/u8-normalize.c [new file with mode: 0644]
modules/uninorm/u8-normalize [new file with mode: 0644]

index d478e9f..175d2d9 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2009-02-21  Bruno Haible  <bruno@clisp.org>
 
+       New module 'uninorm/u8-normalize'.
+       * lib/uninorm/u8-normalize.c: New file.
+       * lib/uninorm/normalize-internal.h: New file.
+       * lib/uninorm/u-normalize-internal.h: New file.
+       * modules/uninorm/u8-normalize: New file.
+
        New module 'uninorm/decompose-internal'.
        * lib/uninorm/decompose-internal.c: New file.
        * modules/uninorm/decompose-internal: New file.
diff --git a/lib/uninorm/normalize-internal.h b/lib/uninorm/normalize-internal.h
new file mode 100644 (file)
index 0000000..e1ef9c4
--- /dev/null
@@ -0,0 +1,34 @@
+/* Normalization of Unicode strings.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+
+#include "unitypes.h"
+
+/* Complete definition of normalization form descriptor.  */
+struct unicode_normalization_form
+{
+  /* Bit mask containing meta-information.  */
+  unsigned int description;
+  #define NF_IS_COMPAT_DECOMPOSING  (1 << 0)
+  #define NF_IS_COMPOSING           (1 << 1)
+  /* Function that decomposes a Unicode character.  */
+  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
+  /* Function that combines two Unicode characters, a starter and another
+     character.  */
+  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
+};
diff --git a/lib/uninorm/u-normalize-internal.h b/lib/uninorm/u-normalize-internal.h
new file mode 100644 (file)
index 0000000..f6e01aa
--- /dev/null
@@ -0,0 +1,352 @@
+/* Decomposition and composition of Unicode strings.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+UNIT *
+FUNC (uninorm_t nf, const UNIT *s, size_t n,
+      UNIT *resultbuf, size_t *lengthp)
+{
+  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
+  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
+
+  /* The result being accumulated.  */
+  UNIT *result;
+  size_t length;
+  size_t allocated;
+  /* The buffer for sorting.  */
+  #define SORTBUF_PREALLOCATED 64
+  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
+  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
+  size_t sortbuf_allocated;
+  size_t sortbuf_count;
+
+  /* Initialize the accumulator.  */
+  if (resultbuf == NULL)
+    {
+      result = NULL;
+      allocated = 0;
+    }
+  else
+    {
+      result = resultbuf;
+      allocated = *lengthp;
+    }
+  length = 0;
+
+  /* Initialize the buffer for sorting.  */
+  sortbuf = sortbuf_preallocated;
+  sortbuf_allocated = SORTBUF_PREALLOCATED;
+  sortbuf_count = 0;
+
+  {
+    const UNIT *s_end = s + n;
+
+    for (;;)
+      {
+       int count;
+       ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+       int decomposed_count;
+       int i;
+
+       if (s < s_end)
+         {
+           /* Fetch the next character.  */
+           count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
+           decomposed_count = 1;
+
+           /* Decompose it, recursively.
+              It would be possible to precompute the recursive decomposition
+              and store it in a table.  But this would significantly increase
+              the size of the decomposition tables, because for example for
+              U+1FC1 the recursive canonical decomposition and the recursive
+              compatibility decomposition are different.  */
+           {
+             int curr;
+
+             for (curr = 0; curr < decomposed_count; )
+               {
+                 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
+                    all elements are atomic.  */
+                 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+                 int curr_decomposed_count;
+
+                 curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
+                 if (curr_decomposed_count >= 0)
+                   {
+                     /* Move curr_decomposed[0..curr_decomposed_count-1] over
+                        decomposed[curr], making room.  It's not worth using
+                        memcpy() here, since the counts are so small.  */
+                     int shift = curr_decomposed_count - 1;
+
+                     if (shift < 0)
+                       abort ();
+                     if (shift > 0)
+                       {
+                         int j;
+
+                         decomposed_count += shift;
+                         if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
+                           abort ();
+                         for (j = decomposed_count - 1 - shift; j > curr; j--)
+                           decomposed[j + shift] = decomposed[j];
+                       }
+                     for (; shift >= 0; shift--)
+                       decomposed[curr + shift] = curr_decomposed[shift];
+                   }
+                 else
+                   {
+                     /* decomposed[curr] is atomic.  */
+                     curr++;
+                   }
+               }
+           }
+         }
+       else
+         {
+           count = 0;
+           decomposed_count = 0;
+         }
+
+       i = 0;
+       for (;;)
+         {
+           ucs4_t uc;
+           int ccc;
+
+           if (s < s_end)
+             {
+               /* Fetch the next character from the decomposition.  */
+               if (i == decomposed_count)
+                 break;
+               uc = decomposed[i];
+               ccc = uc_combining_class (uc);
+             }
+           else
+             {
+               /* End of string reached.  */
+               uc = 0;
+               ccc = 0;
+             }
+
+           if (ccc == 0)
+             {
+               size_t j;
+
+               /* Apply the canonical ordering algorithm to the accumulated
+                  sequence of characters.  */
+               if (sortbuf_count > 1)
+                 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
+                                                          sortbuf + sortbuf_count);
+
+               if (composer != NULL)
+                 {
+                   /* Attempt to combine decomposed characters, as specified
+                      in the Unicode Standard Annex #15 "Unicode Normalization
+                      Forms".  We need to check
+                        1. whether the first accumulated character is a
+                           "starter" (i.e. has ccc = 0).  This is usually the
+                           case.  But when the string starts with a
+                           non-starter, the sortbuf also starts with a
+                           non-starter.  Btw, this check could also be
+                           omitted, because the composition table has only
+                           entries (code1, code2) for which code1 is a
+                           starter; if the first accumulated character is not
+                           a starter, no lookup will succeed.
+                        2. If the sortbuf has more than one character, check
+                           for each of these characters that are not "blocked"
+                           from the starter (i.e. have a ccc that is higher
+                           than the ccc of the previous character) whether it
+                           can be combined with the first character.
+                        3. If only one character is left in sortbuf, check
+                           whether it can be combined with the next character
+                           (also a starter).  */
+                   if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
+                     {
+                       for (j = 1; j < sortbuf_count; )
+                         {
+                           if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
+                             {
+                               ucs4_t combined =
+                                 composer (sortbuf[0].code, sortbuf[j].code);
+                               if (combined)
+                                 {
+                                   size_t k;
+
+                                   sortbuf[0].code = combined;
+                                   /* sortbuf[0].ccc = 0, still valid.  */
+                                   for (k = j + 1; k < sortbuf_count; k++)
+                                     sortbuf[k - 1] = sortbuf[k];
+                                   sortbuf_count--;
+                                   continue;
+                                 }
+                             }
+                           j++;
+                         }
+                       if (s < s_end && sortbuf_count == 1)
+                         {
+                           ucs4_t combined =
+                             composer (sortbuf[0].code, uc);
+                           if (combined)
+                             {
+                               uc = combined;
+                               ccc = 0;
+                               /* uc could be further combined with subsequent
+                                  characters.  So don't put it into sortbuf[0] in
+                                  this round, only in the next round.  */
+                               sortbuf_count = 0;
+                             }
+                         }
+                     }
+                 }
+
+               for (j = 0; j < sortbuf_count; j++)
+                 {
+                   ucs4_t muc = sortbuf[j].code;
+
+                   /* Append muc to the result accumulator.  */
+                   if (length < allocated)
+                     {
+                       int ret =
+                         U_UCTOMB (result + length, muc, allocated - length);
+                       if (ret == -1)
+                         {
+                           errno = EINVAL;
+                           goto fail;
+                         }
+                       if (ret >= 0)
+                         {
+                           length += ret;
+                           goto done_appending;
+                         }
+                     }
+                   {
+                     size_t old_allocated = allocated;
+                     size_t new_allocated = 2 * old_allocated;
+                     if (new_allocated < 64)
+                       new_allocated = 64;
+                     if (new_allocated < old_allocated) /* integer overflow? */
+                       abort ();
+                     {
+                       UNIT *larger_result;
+                       if (result == NULL)
+                         {
+                           larger_result =
+                             (UNIT *) malloc (new_allocated * sizeof (UNIT));
+                           if (larger_result == NULL)
+                             {
+                               errno = ENOMEM;
+                               goto fail;
+                             }
+                         }
+                       else if (result == resultbuf)
+                         {
+                           larger_result =
+                             (UNIT *) malloc (new_allocated * sizeof (UNIT));
+                           if (larger_result == NULL)
+                             {
+                               errno = ENOMEM;
+                               goto fail;
+                             }
+                           U_CPY (larger_result, resultbuf, length);
+                         }
+                       else
+                         {
+                           larger_result =
+                             (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
+                           if (larger_result == NULL)
+                             {
+                               errno = ENOMEM;
+                               goto fail;
+                             }
+                         }
+                       result = larger_result;
+                       allocated = new_allocated;
+                       {
+                         int ret =
+                           U_UCTOMB (result + length, muc, allocated - length);
+                         if (ret == -1)
+                           {
+                             errno = EINVAL;
+                             goto fail;
+                           }
+                         if (ret < 0)
+                           abort ();
+                         length += ret;
+                         goto done_appending;
+                       }
+                     }
+                   }
+                  done_appending: ;
+                 }
+
+               /* sortbuf is now empty.  */
+               sortbuf_count = 0;
+             }
+
+           if (!(s < s_end))
+             /* End of string reached.  */
+             break;
+
+           /* Append (uc, ccc) to sortbuf.  */
+           if (sortbuf_count == sortbuf_allocated)
+             {
+               struct ucs4_with_ccc *new_sortbuf;
+
+               sortbuf_allocated = 2 * sortbuf_allocated;
+               if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
+                 abort ();
+               new_sortbuf =
+                 (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
+               memcpy (new_sortbuf, sortbuf,
+                       sortbuf_count * sizeof (struct ucs4_with_ccc));
+               if (sortbuf != sortbuf_preallocated)
+                 free (sortbuf);
+               sortbuf = new_sortbuf;
+             }
+           sortbuf[sortbuf_count].code = uc;
+           sortbuf[sortbuf_count].ccc = ccc;
+           sortbuf_count++;
+
+           i++;
+         }
+
+       if (!(s < s_end))
+         /* End of string reached.  */
+         break;
+
+       s += count;
+      }
+  }
+
+  if (sortbuf_count > 0)
+    abort ();
+  if (sortbuf != sortbuf_preallocated)
+    free (sortbuf);
+
+  *lengthp = length;
+  return result;
+
+ fail:
+  {
+    int saved_errno = errno;
+    if (sortbuf != sortbuf_preallocated)
+      free (sortbuf);
+    if (result != resultbuf)
+      free (result);
+    errno = saved_errno;
+  }
+  return NULL;
+}
diff --git a/lib/uninorm/u8-normalize.c b/lib/uninorm/u8-normalize.c
new file mode 100644 (file)
index 0000000..7e003ec
--- /dev/null
@@ -0,0 +1,38 @@
+/* Normalization of UTF-8 strings.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "uninorm.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unistr.h"
+#include "unictype.h"
+#include "normalize-internal.h"
+#include "decompose-internal.h"
+
+#define FUNC u8_normalize
+#define UNIT uint8_t
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#define U_UCTOMB u8_uctomb
+#define U_CPY u8_cpy
+#include "u-normalize-internal.h"
diff --git a/modules/uninorm/u8-normalize b/modules/uninorm/u8-normalize
new file mode 100644 (file)
index 0000000..698f12d
--- /dev/null
@@ -0,0 +1,31 @@
+Description:
+Normalization of UTF-8 strings.
+
+Files:
+lib/uninorm/u8-normalize.c
+lib/uninorm/normalize-internal.h
+lib/uninorm/u-normalize-internal.h
+
+Depends-on:
+uninorm/base
+unistr/u8-mbtouc-unsafe
+unistr/u8-uctomb
+unistr/u8-cpy
+unictype/combining-class
+uninorm/decompose-internal
+
+configure.ac:
+gl_MODULE_INDICATOR([uninorm/u8-normalize])
+
+Makefile.am:
+lib_SOURCES += uninorm/u8-normalize.c
+
+Include:
+"uninorm.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+