regex: fix glibc bug 9697

author Paolo Bonzini <bonzini@gnu.org>

Fri, 9 Jan 2009 08:10:36 +0000 (09:10 +0100)

committer Paolo Bonzini <bonzini@gnu.org>

Fri, 9 Jan 2009 10:16:48 +0000 (11:16 +0100)
author Paolo Bonzini <bonzini@gnu.org>
Fri, 9 Jan 2009 08:10:36 +0000 (09:10 +0100)
committer Paolo Bonzini <bonzini@gnu.org>
Fri, 9 Jan 2009 10:16:48 +0000 (11:16 +0100)
diff --git a/ChangeLog b/ChangeLog

index fe1f703..0c9bcfb 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
  2009-01-09  Paolo Bonzini  <bonzini@gnu.org>
  
+       regex: fix glibc bug 9697
+       * lib/regcomp.c (re_compile_fastmap_iter): Rewrite COMPLEX_BRACKET
+       handling.
+
+2009-01-09  Paolo Bonzini  <bonzini@gnu.org>
+
         regex: fix glibc bug 697
         * lib/regexec.c (prune_impossible_nodes): Handle sifted_states[0]
         being NULL also if there are no backreferences.
diff --git a/lib/regcomp.c b/lib/regcomp.c

index fc3cf98..6472ff6 100644 (file)
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -357,45 +357,65 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
  #ifdef RE_ENABLE_I18N
        else if (type == COMPLEX_BRACKET)
         {
-         Idx i;
           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
-         if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
-             || cset->nranges || cset->nchar_classes)
-           {
+         Idx i;
+
  # ifdef _LIBC
-             if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
+         /* See if we have to try all bytes which start multiple collation
+            elements.
+            e.g. In da_DK, we want to catch 'a' since "aa" is a valid
+                 collation element, and don't catch 'b' since 'b' is
+                 the only collation element which starts from 'b' (and
+                 it is caught by SIMPLE_BRACKET).  */
+             if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
+                 && (cset->ncoll_syms || cset->nranges))
                 {
-                 /* In this case we want to catch the bytes which are
-                    the first byte of any collation elements.
-                    e.g. In da_DK, we want to catch 'a' since "aa"
-                         is a valid collation element, and don't catch
-                         'b' since 'b' is the only collation element
-                         which starts from 'b'.  */
                   const int32_t *table = (const int32_t *)
                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
                   for (i = 0; i < SBC_MAX; ++i)
                     if (table[i] < 0)
                       re_set_fastmap (fastmap, icase, i);
                 }
-# else
-             if (dfa->mb_cur_max > 1)
-               for (i = 0; i < SBC_MAX; ++i)
-                 if (__btowc (i) == WEOF)
-                   re_set_fastmap (fastmap, icase, i);
-# endif /* not _LIBC */
+# endif /* _LIBC */
+
+         /* See if we have to start the match at all multibyte characters,
+            i.e. where we would not find an invalid sequence.  This only
+            applies to multibyte character sets; for single byte character
+            sets, the SIMPLE_BRACKET again suffices.  */
+         if (dfa->mb_cur_max > 1
+             && (cset->nchar_classes || cset->non_match
+# ifdef _LIBC
+                 || cset->nequiv_classes
+# endif /* _LIBC */
+                ))
+           {
+             unsigned char c = 0;
+             do
+               {
+                 mbstate_t mbs;
+                 memset (&mbs, 0, sizeof (mbs));
+                 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
+                   re_set_fastmap (fastmap, false, (int) c);
+               }
+             while (++c != 0);
             }
-         for (i = 0; i < cset->nmbchars; ++i)
+
+         else
             {
-             char buf[256];
-             mbstate_t state;
-             memset (&state, '\0', sizeof (state));
-             if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
-               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
-             if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
+             /* ... Else catch all bytes which can start the mbchars.  */
+             for (i = 0; i < cset->nmbchars; ++i)
                 {
-                 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
-                     != (size_t) -1)
-                   re_set_fastmap (fastmap, false, *(unsigned char *) buf);
+                 char buf[256];
+                 mbstate_t state;
+                 memset (&state, '\0', sizeof (state));
+                 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
+                   re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
+                 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
+                   {
+                     if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
+                         != (size_t) -1)
+                       re_set_fastmap (fastmap, false, *(unsigned char *) buf);
+                   }
                 }
             }
         }
author	Paolo Bonzini <bonzini@gnu.org>
	Fri, 9 Jan 2009 08:10:36 +0000 (09:10 +0100)
committer	Paolo Bonzini <bonzini@gnu.org>
	Fri, 9 Jan 2009 10:16:48 +0000 (11:16 +0100)
ChangeLog		patch | blob | history
lib/regcomp.c		patch | blob | history