now-generated

[gnulib.git] / lib / regex.c
diff --git a/lib/regex.c b/lib/regex.c

index 54b9249..c0ded39 100644 (file)
--- a/lib/regex.c
+++ b/lib/regex.c
@@ -62,9 +62,10 @@
  # define US_CHAR_TYPE wchar_t/* unsigned character type */
  # define COMPILED_BUFFER_VAR wc_buffer
  # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
+# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
  # define PUT_CHAR(c) \
    do {                                                                       \
-    if (MC_CUR_MAX == 1)                                                     \
+    if (MB_CUR_MAX == 1)                                                     \
        putchar (c);                                                           \
      else                                                                     \
        printf ("%C", (wint_t) c); /* Should we use wide stream??  */          \
@@ -288,6 +289,8 @@ extern char *re_syntax_table;
  
  static char re_syntax_table[CHAR_SET_SIZE];
  
+static void init_syntax_once PARAMS ((void));
+
  static void
  init_syntax_once ()
  {
@@ -1172,7 +1175,7 @@ printchar (c)
  
  static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
                                   size_t len, int *offset_buffer,
-                                 int *is_binary);
+                                 char *is_binary);
  static size_t
  convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
       CHAR_TYPE *dest;
@@ -1189,7 +1192,7 @@ convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
                         = {0, 3, 4, 6}
       */
       int *offset_buffer;
-     int *is_binary;
+     char *is_binary;
  {
    wchar_t *pdest = dest;
    const unsigned char *psrc = src;
@@ -1904,7 +1907,8 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
  #ifndef TRANSLATE
  # ifdef MBS_SUPPORT
  #  define TRANSLATE(d) \
-  (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d))
+  ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
+   ? (char) translate[(unsigned char) (d)] : (d))
  #else
  #  define TRANSLATE(d) \
    (translate ? (char) translate[(unsigned char) (d)] : (d))
@@ -2133,21 +2137,21 @@ typedef struct
  
  
  /* Get the next unsigned number in the uncompiled pattern.  */
-#define GET_UNSIGNED_NUMBER(num)                                       \
-  { if (p != pend)                                                     \
-     {                                                                 \
-       PATFETCH (c);                                                   \
-       while ('0' <= c && c <= '9')                                    \
-         {                                                             \
-           if (num < 0)                                                        \
-              num = 0;                                                 \
-           num = num * 10 + c - '0';                                   \
-           if (p == pend)                                              \
-              break;                                                   \
-           PATFETCH (c);                                               \
-         }                                                             \
-       }                                                               \
-    }
+#define GET_UNSIGNED_NUMBER(num) \
+  {                                                                    \
+    while (p != pend)                                                  \
+      {                                                                        \
+       PATFETCH (c);                                                   \
+       if (c < '0' || c > '9')                                         \
+         break;                                                        \
+       if (num <= RE_DUP_MAX)                                          \
+         {                                                             \
+           if (num < 0)                                                \
+             num = 0;                                                  \
+           num = num * 10 + c - '0';                                   \
+         }                                                             \
+      }                                                                        \
+  }
  
  #if defined _LIBC || WIDE_CHAR_SUPPORT
  /* The GNU C library provides support for user-defined character classes
@@ -2281,9 +2285,9 @@ regex_compile (pattern, size, syntax, bufp)
    /* offset buffer for optimizatoin. See convert_mbs_to_wc.  */
    int *mbs_offset = NULL;
    /* It hold whether each wchar_t is binary data or not.  */
-  int *is_binary = NULL;
+  char *is_binary = NULL;
    /* A flag whether exactn is handling binary data or not.  */
-  int is_exactn_bin = FALSE;
+  char is_exactn_bin = FALSE;
  #endif /* MBS_SUPPORT */
  
    /* A random temporary spot in PATTERN.  */
@@ -2321,14 +2325,6 @@ regex_compile (pattern, size, syntax, bufp)
    /* Address of beginning of regexp, or inside of last group.  */
    US_CHAR_TYPE *begalt;
  
-  /* Place in the uncompiled pattern (i.e., the {) to
-     which to go back if the interval is invalid.  */
-#ifdef MBS_SUPPORT
-  const US_CHAR_TYPE *beg_interval;
-#else
-  const char *beg_interval;
-#endif /* MBS_SUPPORT */
-
    /* Address of the place where a forward jump should go to the end of
       the containing expression.  Each alternative of an `or' -- except the
       last -- ends with a forward jump of this sort.  */
@@ -2341,23 +2337,24 @@ regex_compile (pattern, size, syntax, bufp)
  
  #ifdef MBS_SUPPORT
    /* Initialize the wchar_t PATTERN and offset_buffer.  */
-  p = pend = pattern = TALLOC(csize, CHAR_TYPE);
+  p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
    mbs_offset = TALLOC(csize + 1, int);
-  is_binary = TALLOC(csize + 1, int);
+  is_binary = TALLOC(csize + 1, char);
    if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
      {
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
        return REG_ESPACE;
      }
+  pattern[csize] = L'\0';      /* sentinel */
    size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
    pend = p + size;
    if (size < 0)
      {
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
        return REG_BADPAT;
      }
  #endif
@@ -2379,9 +2376,9 @@ regex_compile (pattern, size, syntax, bufp)
    if (compile_stack.stack == NULL)
      {
  #ifdef MBS_SUPPORT
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
  #endif
        return REG_ESPACE;
      }
@@ -2642,6 +2639,7 @@ regex_compile (pattern, size, syntax, bufp)
                charset[5] = p (= length of chars)
  
                 charset[6] = char_class (wctype_t)
+               charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
                           ...
                 charset[l+5]  = char_class (wctype_t)
  
@@ -2804,6 +2802,8 @@ regex_compile (pattern, size, syntax, bufp)
                      if (c == ':' && *p == ']')
                        {
                         wctype_t wt;
+                       uintptr_t alignedp;
+
                         /* Query the character class as wctype_t.  */
                         wt = IS_CHAR_CLASS (str);
                         if (wt == 0)
@@ -2816,15 +2816,21 @@ regex_compile (pattern, size, syntax, bufp)
                          if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
  
                         /* Allocate the space for character class.  */
-                        GET_BUFFER_SPACE(1);
+                        GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
                         /* Update the pointer to indicate end of buffer.  */
-                        b++;
+                        b += CHAR_CLASS_SIZE;
                         /* Move data which follow character classes
                             not to violate the data.  */
-                        insert_space(1, laststart+6, b-1);
+                        insert_space(CHAR_CLASS_SIZE,
+                                    laststart + 6 + laststart[1],
+                                    b - 1);
+                       alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
+                                   + __alignof__(wctype_t) - 1)
+                                   & ~(uintptr_t)(__alignof__(wctype_t) - 1);
                         /* Store the character class.  */
-                        laststart[6] = (CHAR_TYPE) wt;
-                        laststart[1]++; /* Update length of char_classes */
+                        *((wctype_t*)alignedp) = wt;
+                        /* Update length of char_classes */
+                        laststart[1] += CHAR_CLASS_SIZE;
  
                          had_char_class = true;
                        }
@@ -2990,7 +2996,7 @@ regex_compile (pattern, size, syntax, bufp)
                                     /* Adjust for the alignment.  */
                                     idx = (idx + 3) & ~4;
  
-                                   str[0] = (wchar_t) &extra[idx + 4];
+                                   str[0] = (wchar_t) idx + 4;
                                   }
                                 else if (symb_table[2 * elem] == 0 && c1 == 1)
                                   {
@@ -3813,25 +3819,19 @@ regex_compile (pattern, size, syntax, bufp)
  
                  /* At least (most) this many matches must be made.  */
                  int lower_bound = -1, upper_bound = -1;
-                beg_interval = p - 1;
+
+               /* Place in the uncompiled pattern (i.e., just after
+                  the '{') to go back to if the interval is invalid.  */
+               const CHAR_TYPE *beg_interval = p;
  
                  if (p == pend)
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_EBRACE);
-                  }
+                 goto invalid_interval;
  
                  GET_UNSIGNED_NUMBER (lower_bound);
  
                  if (c == ',')
                    {
                      GET_UNSIGNED_NUMBER (upper_bound);
-                   if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
-                       || ((syntax & RE_NO_BK_BRACES) && c != '}'))
-                     FREE_STACK_RETURN (REG_BADBR);
-
                     if (upper_bound < 0)
                       upper_bound = RE_DUP_MAX;
                    }
@@ -3839,36 +3839,24 @@ regex_compile (pattern, size, syntax, bufp)
                    /* Interval such as `{1}' => match exactly once. */
                    upper_bound = lower_bound;
  
-                if (lower_bound < 0 || upper_bound > RE_DUP_MAX
-                    || lower_bound > upper_bound)
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
-                  }
+                if (! (0 <= lower_bound && lower_bound <= upper_bound))
+                 goto invalid_interval;
  
                  if (!(syntax & RE_NO_BK_BRACES))
                    {
-                    if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
-
+                   if (c != '\\' || p == pend)
+                     goto invalid_interval;
                      PATFETCH (c);
                    }
  
                  if (c != '}')
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
-                  }
-
-                /* We just parsed a valid interval.  */
+                 goto invalid_interval;
  
                  /* If it's invalid to have no preceding re.  */
                  if (!laststart)
                    {
-                    if (syntax & RE_CONTEXT_INVALID_OPS)
+                   if (syntax & RE_CONTEXT_INVALID_OPS
+                       && !(syntax & RE_INVALID_INTERVAL_ORD))
                        FREE_STACK_RETURN (REG_BADRPT);
                      else if (syntax & RE_CONTEXT_INDEP_OPS)
                        laststart = b;
@@ -3876,6 +3864,11 @@ regex_compile (pattern, size, syntax, bufp)
                        goto unfetch_interval;
                    }
  
+                /* We just parsed a valid interval.  */
+
+                if (RE_DUP_MAX < upper_bound)
+                 FREE_STACK_RETURN (REG_BADBR);
+
                  /* If the upper bound is zero, don't want to succeed at
                     all; jump from `laststart' to `b + 3', which will be
                    the end of the buffer after we insert the jump.  */
@@ -3961,25 +3954,20 @@ regex_compile (pattern, size, syntax, bufp)
                         }
                     }
                  pending_exact = 0;
-                beg_interval = NULL;
-              }
-              break;
-
-            unfetch_interval:
-              /* If an invalid interval, match the characters as literals.  */
-               assert (beg_interval);
-               p = beg_interval;
-               beg_interval = NULL;
-
-               /* normal_char and normal_backslash need `c'.  */
-               PATFETCH (c);
-
-               if (!(syntax & RE_NO_BK_BRACES))
-                 {
-                   if (p > pattern  &&  p[-1] == '\\')
-                     goto normal_backslash;
-                 }
-               goto normal_char;
+               break;
+
+             invalid_interval:
+               if (!(syntax & RE_INVALID_INTERVAL_ORD))
+                 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
+             unfetch_interval:
+               /* Match the characters as literals.  */
+               p = beg_interval;
+               c = '{';
+               if (syntax & RE_NO_BK_BRACES)
+                 goto normal_char;
+               else
+                 goto normal_backslash;
+             }
  
  #ifdef emacs
              /* There is no way to specify the before_dot and after_dot
@@ -4355,7 +4343,8 @@ group_in_compile_stack (compile_stack, regnum)
  }
  
  #ifdef MBS_SUPPORT
-/* This insert space into the pattern.  */
+/* This inserts space, whose size is "num", into the pattern at "loc".
+   "end" must point to the end of the allocated buffer.  */
  static void
  insert_space (num, loc, end)
       int num;
@@ -4396,13 +4385,15 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
      {
        const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
                                                        _NL_COLLATE_COLLSEQWC);
+      const unsigned char *extra = (const unsigned char *)
+       _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
  
        if (range_start_char < -1)
         {
           /* range_start is a collating symbol.  */
           int32_t *wextra;
           /* Retreive the index and get collation sequence value.  */
-         wextra = (int32_t*)char_set[-range_start_char];
+         wextra = (int32_t*)(extra + char_set[-range_start_char]);
           start_val = wextra[1 + *wextra];
         }
        else
@@ -5044,9 +5035,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
             }
           else                          /* Searching backwards.  */
             {
-             register char c = (size1 == 0 || startpos >= size1
-                                 ? string2[startpos - size1]
-                                 : string1[startpos]);
+             register CHAR_TYPE c = (size1 == 0 || startpos >= size1
+                                     ? string2[startpos - size1]
+                                     : string1[startpos]);
  
               if (!fastmap[(unsigned char) TRANSLATE (c)])
                 goto advance;
@@ -5180,8 +5171,6 @@ weak_alias (__re_search_2, re_search_2)
      FREE_VAR (string2);                                                        \
      FREE_VAR (mbs_offset1);                                            \
      FREE_VAR (mbs_offset2);                                            \
-    FREE_VAR (is_binary1);                                             \
-    FREE_VAR (is_binary2);                                             \
    } while (0)
  # else /* not MBS_SUPPORT */
  #  define FREE_VARIABLES()                                             \
@@ -5199,17 +5188,16 @@ weak_alias (__re_search_2, re_search_2)
    } while (0)
  # endif /* MBS_SUPPORT */
  #else
+# define FREE_VAR(var) if (var) free (var); var = NULL
  # ifdef MBS_SUPPORT
  #  define FREE_VARIABLES()                                             \
    do {                                                                 \
-    if (string1) free (string1);                                       \
-    if (string2) free (string2);                                       \
-    if (mbs_offset1) free (mbs_offset1);                               \
-    if (mbs_offset2) free (mbs_offset2);                               \
-    if (is_binary1) free (is_binary1);                                 \
-    if (is_binary2) free (is_binary2);                                 \
+    FREE_VAR (string1);                                                        \
+    FREE_VAR (string2);                                                        \
+    FREE_VAR (mbs_offset1);                                            \
+    FREE_VAR (mbs_offset2);                                            \
    } while (0)
-# eles
+# else
  #  define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning. */
  # endif /* MBS_SUPPORT */
  #endif /* not MATCH_MAY_ALLOCATE */
@@ -5298,10 +5286,14 @@ weak_alias (__re_match_2, re_match_2)
  #endif
  
  #ifdef MBS_SUPPORT
+
+static int count_mbs_length PARAMS ((int *, int));
+
  /* This check the substring (from 0, to length) of the multibyte string,
     to which offset_buffer correspond. And count how many wchar_t_characters
     the substring occupy. We use offset_buffer to optimization.
     See convert_mbs_to_wcs.  */
+
  static int
  count_mbs_length(offset_buffer, length)
       int *offset_buffer;
@@ -5359,7 +5351,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
    /* offset buffer for optimizatoin. See convert_mbs_to_wc.  */
    int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
    /* They hold whether each wchar_t is binary data or not.  */
-  int *is_binary1 = NULL, *is_binary2 = NULL;
+  char *is_binary = NULL;
  #endif /* MBS_SUPPORT */
  
    /* Just past the end of the corresponding string.  */
@@ -5538,38 +5530,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
       fill them with converted string.  */
    if (csize1 != 0)
      {
-      string1 = TALLOC (csize1 + 1, CHAR_TYPE);
-      mbs_offset1 = TALLOC (csize1 + 1, int);
-      is_binary1 = TALLOC (csize1 + 1, int);
-      if (!string1 || !mbs_offset1 || !is_binary1)
+      string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
+      mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
+      is_binary = REGEX_TALLOC (csize1 + 1, char);
+      if (!string1 || !mbs_offset1 || !is_binary)
         {
-         if (string1) free(string1);
-         if (mbs_offset1) free(mbs_offset1);
-         if (is_binary1) free(is_binary1);
+         FREE_VAR (string1);
+         FREE_VAR (mbs_offset1);
+         FREE_VAR (is_binary);
           return -2;
         }
        size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
-                                mbs_offset1, is_binary1);
+                                mbs_offset1, is_binary);
        string1[size1] = L'\0'; /* for a sentinel  */
+      FREE_VAR (is_binary);
      }
    if (csize2 != 0)
      {
        string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
        mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
-      is_binary2 = TALLOC (csize2 + 1, int);
-      if (!string2 || !mbs_offset2 || !is_binary2)
+      is_binary = REGEX_TALLOC (csize2 + 1, char);
+      if (!string2 || !mbs_offset2 || !is_binary)
         {
-         if (string1) free(string1);
-         if (mbs_offset1) free(mbs_offset1);
-         if (is_binary1) free(is_binary1);
-         if (string2) free(string2);
-         if (mbs_offset2) free(mbs_offset2);
-         if (is_binary2) free(is_binary2);
+         FREE_VAR (string1);
+         FREE_VAR (mbs_offset1);
+         FREE_VAR (string2);
+         FREE_VAR (mbs_offset2);
+         FREE_VAR (is_binary);
           return -2;
         }
        size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
-                                mbs_offset2, is_binary2);
+                                mbs_offset2, is_binary);
        string2[size2] = L'\0'; /* for a sentinel  */
+      FREE_VAR (is_binary);
      }
  
    /* We need to cast pattern to (wchar_t*), because we casted this compiled
@@ -5601,6 +5594,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
        size2 = size1;
        string1 = 0;
        size1 = 0;
+#ifdef MBS_SUPPORT
+      mbs_offset2 = mbs_offset1;
+      csize2 = csize1;
+      mbs_offset1 = NULL;
+      csize1 = 0;
+#endif
      }
    end1 = string1 + size1;
    end2 = string2 + size2;
@@ -5615,6 +5614,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
      }
    else
      {
+      if (stop > csize1 + csize2)
+       stop = csize1 + csize2;
        end_match_1 = end1;
        mcnt = count_mbs_length(mbs_offset2, stop-csize1);
        end_match_2 = string2 + mcnt;
@@ -5992,19 +5993,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                2*ranges_length + chars_length;
  
              /* match with char_class?  */
-           for (i = 0; i < char_class_length ; i++)
-              if (iswctype((wint_t)c, (wctype_t)(*workp++)))
-                goto char_set_matched;
+           for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
+             {
+               wctype_t wctype;
+               uintptr_t alignedp = ((uintptr_t)workp
+                                     + __alignof__(wctype_t) - 1)
+                                     & ~(uintptr_t)(__alignof__(wctype_t) - 1);
+               wctype = *((wctype_t*)alignedp);
+               workp += CHAR_CLASS_SIZE;
+               if (iswctype((wint_t)c, wctype))
+                 goto char_set_matched;
+             }
  
              /* match with collating_symbol?  */
  # ifdef _LIBC
             if (nrules != 0)
               {
+               const unsigned char *extra = (const unsigned char *)
+                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
                 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
                      workp++)
                   {
                     int32_t *wextra;
-                   wextra = (int32_t*) *workp++;
+                   wextra = (int32_t*)(extra + *workp++);
                     for (i = 0; i < *wextra; ++i)
                       if (TRANSLATE(d[i]) != wextra[1 + i])
                         break;
@@ -6124,7 +6136,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
  
                 /* Update d, however d will be incremented at
                    char_set_matched:, we decrement d here.  */
-               d = backup_d + (wint_t)cp - (wint_t)str_buf - 1;
+               d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
                 if (d >= dend)
                   {
                     if (dend == end_match_2)
@@ -7081,14 +7093,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
  
         case wordbeg:
            DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
-         if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
+         if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
+             && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
             break;
            goto fail;
  
         case wordend:
            DEBUG_PRINT1 ("EXECUTING wordend.\n");
           if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
-              && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
+              && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
             break;
            goto fail;