now-generated
[gnulib.git] / lib / regex.c
index 54b9249..c0ded39 100644 (file)
 # define US_CHAR_TYPE wchar_t/* unsigned character type */
 # define COMPILED_BUFFER_VAR wc_buffer
 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
+# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
 # define PUT_CHAR(c) \
   do {                                                                       \
-    if (MC_CUR_MAX == 1)                                                     \
+    if (MB_CUR_MAX == 1)                                                     \
       putchar (c);                                                           \
     else                                                                     \
       printf ("%C", (wint_t) c); /* Should we use wide stream??  */          \
@@ -288,6 +289,8 @@ extern char *re_syntax_table;
 
 static char re_syntax_table[CHAR_SET_SIZE];
 
+static void init_syntax_once PARAMS ((void));
+
 static void
 init_syntax_once ()
 {
@@ -1172,7 +1175,7 @@ printchar (c)
 
 static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
                                  size_t len, int *offset_buffer,
-                                 int *is_binary);
+                                 char *is_binary);
 static size_t
 convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
      CHAR_TYPE *dest;
@@ -1189,7 +1192,7 @@ convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
                        = {0, 3, 4, 6}
      */
      int *offset_buffer;
-     int *is_binary;
+     char *is_binary;
 {
   wchar_t *pdest = dest;
   const unsigned char *psrc = src;
@@ -1904,7 +1907,8 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
 #ifndef TRANSLATE
 # ifdef MBS_SUPPORT
 #  define TRANSLATE(d) \
-  (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d))
+  ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
+   ? (char) translate[(unsigned char) (d)] : (d))
 #else
 #  define TRANSLATE(d) \
   (translate ? (char) translate[(unsigned char) (d)] : (d))
@@ -2133,21 +2137,21 @@ typedef struct
 
 
 /* Get the next unsigned number in the uncompiled pattern.  */
-#define GET_UNSIGNED_NUMBER(num)                                       \
-  { if (p != pend)                                                     \
-     {                                                                 \
-       PATFETCH (c);                                                   \
-       while ('0' <= c && c <= '9')                                    \
-         {                                                             \
-           if (num < 0)                                                        \
-              num = 0;                                                 \
-           num = num * 10 + c - '0';                                   \
-           if (p == pend)                                              \
-              break;                                                   \
-           PATFETCH (c);                                               \
-         }                                                             \
-       }                                                               \
-    }
+#define GET_UNSIGNED_NUMBER(num) \
+  {                                                                    \
+    while (p != pend)                                                  \
+      {                                                                        \
+       PATFETCH (c);                                                   \
+       if (c < '0' || c > '9')                                         \
+         break;                                                        \
+       if (num <= RE_DUP_MAX)                                          \
+         {                                                             \
+           if (num < 0)                                                \
+             num = 0;                                                  \
+           num = num * 10 + c - '0';                                   \
+         }                                                             \
+      }                                                                        \
+  }
 
 #if defined _LIBC || WIDE_CHAR_SUPPORT
 /* The GNU C library provides support for user-defined character classes
@@ -2281,9 +2285,9 @@ regex_compile (pattern, size, syntax, bufp)
   /* offset buffer for optimizatoin. See convert_mbs_to_wc.  */
   int *mbs_offset = NULL;
   /* It hold whether each wchar_t is binary data or not.  */
-  int *is_binary = NULL;
+  char *is_binary = NULL;
   /* A flag whether exactn is handling binary data or not.  */
-  int is_exactn_bin = FALSE;
+  char is_exactn_bin = FALSE;
 #endif /* MBS_SUPPORT */
 
   /* A random temporary spot in PATTERN.  */
@@ -2321,14 +2325,6 @@ regex_compile (pattern, size, syntax, bufp)
   /* Address of beginning of regexp, or inside of last group.  */
   US_CHAR_TYPE *begalt;
 
-  /* Place in the uncompiled pattern (i.e., the {) to
-     which to go back if the interval is invalid.  */
-#ifdef MBS_SUPPORT
-  const US_CHAR_TYPE *beg_interval;
-#else
-  const char *beg_interval;
-#endif /* MBS_SUPPORT */
-
   /* Address of the place where a forward jump should go to the end of
      the containing expression.  Each alternative of an `or' -- except the
      last -- ends with a forward jump of this sort.  */
@@ -2341,23 +2337,24 @@ regex_compile (pattern, size, syntax, bufp)
 
 #ifdef MBS_SUPPORT
   /* Initialize the wchar_t PATTERN and offset_buffer.  */
-  p = pend = pattern = TALLOC(csize, CHAR_TYPE);
+  p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
   mbs_offset = TALLOC(csize + 1, int);
-  is_binary = TALLOC(csize + 1, int);
+  is_binary = TALLOC(csize + 1, char);
   if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
     {
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
       return REG_ESPACE;
     }
+  pattern[csize] = L'\0';      /* sentinel */
   size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
   pend = p + size;
   if (size < 0)
     {
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
       return REG_BADPAT;
     }
 #endif
@@ -2379,9 +2376,9 @@ regex_compile (pattern, size, syntax, bufp)
   if (compile_stack.stack == NULL)
     {
 #ifdef MBS_SUPPORT
-      if (pattern) free(pattern);
-      if (mbs_offset) free(mbs_offset);
-      if (is_binary) free(is_binary);
+      free(pattern);
+      free(mbs_offset);
+      free(is_binary);
 #endif
       return REG_ESPACE;
     }
@@ -2642,6 +2639,7 @@ regex_compile (pattern, size, syntax, bufp)
               charset[5] = p (= length of chars)
 
                charset[6] = char_class (wctype_t)
+               charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
                          ...
                charset[l+5]  = char_class (wctype_t)
 
@@ -2804,6 +2802,8 @@ regex_compile (pattern, size, syntax, bufp)
                     if (c == ':' && *p == ']')
                       {
                        wctype_t wt;
+                       uintptr_t alignedp;
+
                        /* Query the character class as wctype_t.  */
                        wt = IS_CHAR_CLASS (str);
                        if (wt == 0)
@@ -2816,15 +2816,21 @@ regex_compile (pattern, size, syntax, bufp)
                         if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
                        /* Allocate the space for character class.  */
-                        GET_BUFFER_SPACE(1);
+                        GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
                        /* Update the pointer to indicate end of buffer.  */
-                        b++;
+                        b += CHAR_CLASS_SIZE;
                        /* Move data which follow character classes
                            not to violate the data.  */
-                        insert_space(1, laststart+6, b-1);
+                        insert_space(CHAR_CLASS_SIZE,
+                                    laststart + 6 + laststart[1],
+                                    b - 1);
+                       alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
+                                   + __alignof__(wctype_t) - 1)
+                                   & ~(uintptr_t)(__alignof__(wctype_t) - 1);
                        /* Store the character class.  */
-                        laststart[6] = (CHAR_TYPE) wt;
-                        laststart[1]++; /* Update length of char_classes */
+                        *((wctype_t*)alignedp) = wt;
+                        /* Update length of char_classes */
+                        laststart[1] += CHAR_CLASS_SIZE;
 
                         had_char_class = true;
                       }
@@ -2990,7 +2996,7 @@ regex_compile (pattern, size, syntax, bufp)
                                    /* Adjust for the alignment.  */
                                    idx = (idx + 3) & ~4;
 
-                                   str[0] = (wchar_t) &extra[idx + 4];
+                                   str[0] = (wchar_t) idx + 4;
                                  }
                                else if (symb_table[2 * elem] == 0 && c1 == 1)
                                  {
@@ -3813,25 +3819,19 @@ regex_compile (pattern, size, syntax, bufp)
 
                 /* At least (most) this many matches must be made.  */
                 int lower_bound = -1, upper_bound = -1;
-                beg_interval = p - 1;
+
+               /* Place in the uncompiled pattern (i.e., just after
+                  the '{') to go back to if the interval is invalid.  */
+               const CHAR_TYPE *beg_interval = p;
 
                 if (p == pend)
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_EBRACE);
-                  }
+                 goto invalid_interval;
 
                 GET_UNSIGNED_NUMBER (lower_bound);
 
                 if (c == ',')
                   {
                     GET_UNSIGNED_NUMBER (upper_bound);
-                   if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
-                       || ((syntax & RE_NO_BK_BRACES) && c != '}'))
-                     FREE_STACK_RETURN (REG_BADBR);
-
                    if (upper_bound < 0)
                      upper_bound = RE_DUP_MAX;
                   }
@@ -3839,36 +3839,24 @@ regex_compile (pattern, size, syntax, bufp)
                   /* Interval such as `{1}' => match exactly once. */
                   upper_bound = lower_bound;
 
-                if (lower_bound < 0 || upper_bound > RE_DUP_MAX
-                    || lower_bound > upper_bound)
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
-                  }
+                if (! (0 <= lower_bound && lower_bound <= upper_bound))
+                 goto invalid_interval;
 
                 if (!(syntax & RE_NO_BK_BRACES))
                   {
-                    if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
-
+                   if (c != '\\' || p == pend)
+                     goto invalid_interval;
                     PATFETCH (c);
                   }
 
                 if (c != '}')
-                  {
-                    if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
-                  }
-
-                /* We just parsed a valid interval.  */
+                 goto invalid_interval;
 
                 /* If it's invalid to have no preceding re.  */
                 if (!laststart)
                   {
-                    if (syntax & RE_CONTEXT_INVALID_OPS)
+                   if (syntax & RE_CONTEXT_INVALID_OPS
+                       && !(syntax & RE_INVALID_INTERVAL_ORD))
                       FREE_STACK_RETURN (REG_BADRPT);
                     else if (syntax & RE_CONTEXT_INDEP_OPS)
                       laststart = b;
@@ -3876,6 +3864,11 @@ regex_compile (pattern, size, syntax, bufp)
                       goto unfetch_interval;
                   }
 
+                /* We just parsed a valid interval.  */
+
+                if (RE_DUP_MAX < upper_bound)
+                 FREE_STACK_RETURN (REG_BADBR);
+
                 /* If the upper bound is zero, don't want to succeed at
                    all; jump from `laststart' to `b + 3', which will be
                   the end of the buffer after we insert the jump.  */
@@ -3961,25 +3954,20 @@ regex_compile (pattern, size, syntax, bufp)
                        }
                    }
                 pending_exact = 0;
-                beg_interval = NULL;
-              }
-              break;
-
-            unfetch_interval:
-              /* If an invalid interval, match the characters as literals.  */
-               assert (beg_interval);
-               p = beg_interval;
-               beg_interval = NULL;
-
-               /* normal_char and normal_backslash need `c'.  */
-               PATFETCH (c);
-
-               if (!(syntax & RE_NO_BK_BRACES))
-                 {
-                   if (p > pattern  &&  p[-1] == '\\')
-                     goto normal_backslash;
-                 }
-               goto normal_char;
+               break;
+
+             invalid_interval:
+               if (!(syntax & RE_INVALID_INTERVAL_ORD))
+                 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
+             unfetch_interval:
+               /* Match the characters as literals.  */
+               p = beg_interval;
+               c = '{';
+               if (syntax & RE_NO_BK_BRACES)
+                 goto normal_char;
+               else
+                 goto normal_backslash;
+             }
 
 #ifdef emacs
             /* There is no way to specify the before_dot and after_dot
@@ -4355,7 +4343,8 @@ group_in_compile_stack (compile_stack, regnum)
 }
 
 #ifdef MBS_SUPPORT
-/* This insert space into the pattern.  */
+/* Insert space of size "num" into the pattern at "loc".
+   "end" must point to the end of the allocated buffer.  */
 static void
 insert_space (num, loc, end)
      int num;
@@ -4396,13 +4385,15 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
     {
       const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
                                                       _NL_COLLATE_COLLSEQWC);
+      const unsigned char *extra = (const unsigned char *)
+       _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
 
       if (range_start_char < -1)
        {
          /* range_start is a collating symbol.  */
          int32_t *wextra;
          /* Retreive the index and get collation sequence value.  */
-         wextra = (int32_t*)char_set[-range_start_char];
+         wextra = (int32_t*)(extra + char_set[-range_start_char]);
          start_val = wextra[1 + *wextra];
        }
       else
@@ -5044,9 +5035,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
            }
          else                          /* Searching backwards.  */
            {
-             register char c = (size1 == 0 || startpos >= size1
-                                 ? string2[startpos - size1]
-                                 : string1[startpos]);
+             register CHAR_TYPE c = (size1 == 0 || startpos >= size1
+                                     ? string2[startpos - size1]
+                                     : string1[startpos]);
 
              if (!fastmap[(unsigned char) TRANSLATE (c)])
                goto advance;
@@ -5180,8 +5171,6 @@ weak_alias (__re_search_2, re_search_2)
     FREE_VAR (string2);                                                        \
     FREE_VAR (mbs_offset1);                                            \
     FREE_VAR (mbs_offset2);                                            \
-    FREE_VAR (is_binary1);                                             \
-    FREE_VAR (is_binary2);                                             \
   } while (0)
 # else /* not MBS_SUPPORT */
 #  define FREE_VARIABLES()                                             \
@@ -5199,17 +5188,16 @@ weak_alias (__re_search_2, re_search_2)
   } while (0)
 # endif /* MBS_SUPPORT */
 #else
+# define FREE_VAR(var) if (var) free (var); var = NULL
 # ifdef MBS_SUPPORT
 #  define FREE_VARIABLES()                                             \
   do {                                                                 \
-    if (string1) free (string1);                                       \
-    if (string2) free (string2);                                       \
-    if (mbs_offset1) free (mbs_offset1);                               \
-    if (mbs_offset2) free (mbs_offset2);                               \
-    if (is_binary1) free (is_binary1);                                 \
-    if (is_binary2) free (is_binary2);                                 \
+    FREE_VAR (string1);                                                        \
+    FREE_VAR (string2);                                                        \
+    FREE_VAR (mbs_offset1);                                            \
+    FREE_VAR (mbs_offset2);                                            \
   } while (0)
-# eles
+# else
 #  define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning. */
 # endif /* MBS_SUPPORT */
 #endif /* not MATCH_MAY_ALLOCATE */
@@ -5298,10 +5286,14 @@ weak_alias (__re_match_2, re_match_2)
 #endif
 
 #ifdef MBS_SUPPORT
+
+static int count_mbs_length PARAMS ((int *, int));
+
 /* This check the substring (from 0, to length) of the multibyte string,
    to which offset_buffer correspond. And count how many wchar_t_characters
    the substring occupy. We use offset_buffer to optimization.
    See convert_mbs_to_wcs.  */
+
 static int
 count_mbs_length(offset_buffer, length)
      int *offset_buffer;
@@ -5359,7 +5351,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
   /* offset buffer for optimizatoin. See convert_mbs_to_wc.  */
   int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
   /* They hold whether each wchar_t is binary data or not.  */
-  int *is_binary1 = NULL, *is_binary2 = NULL;
+  char *is_binary = NULL;
 #endif /* MBS_SUPPORT */
 
   /* Just past the end of the corresponding string.  */
@@ -5538,38 +5530,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
      fill them with converted string.  */
   if (csize1 != 0)
     {
-      string1 = TALLOC (csize1 + 1, CHAR_TYPE);
-      mbs_offset1 = TALLOC (csize1 + 1, int);
-      is_binary1 = TALLOC (csize1 + 1, int);
-      if (!string1 || !mbs_offset1 || !is_binary1)
+      string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
+      mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
+      is_binary = REGEX_TALLOC (csize1 + 1, char);
+      if (!string1 || !mbs_offset1 || !is_binary)
        {
-         if (string1) free(string1);
-         if (mbs_offset1) free(mbs_offset1);
-         if (is_binary1) free(is_binary1);
+         FREE_VAR (string1);
+         FREE_VAR (mbs_offset1);
+         FREE_VAR (is_binary);
          return -2;
        }
       size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
-                                mbs_offset1, is_binary1);
+                                mbs_offset1, is_binary);
       string1[size1] = L'\0'; /* for a sentinel  */
+      FREE_VAR (is_binary);
     }
   if (csize2 != 0)
     {
       string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
       mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
-      is_binary2 = TALLOC (csize2 + 1, int);
-      if (!string2 || !mbs_offset2 || !is_binary2)
+      is_binary = REGEX_TALLOC (csize2 + 1, char);
+      if (!string2 || !mbs_offset2 || !is_binary)
        {
-         if (string1) free(string1);
-         if (mbs_offset1) free(mbs_offset1);
-         if (is_binary1) free(is_binary1);
-         if (string2) free(string2);
-         if (mbs_offset2) free(mbs_offset2);
-         if (is_binary2) free(is_binary2);
+         FREE_VAR (string1);
+         FREE_VAR (mbs_offset1);
+         FREE_VAR (string2);
+         FREE_VAR (mbs_offset2);
+         FREE_VAR (is_binary);
          return -2;
        }
       size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
-                                mbs_offset2, is_binary2);
+                                mbs_offset2, is_binary);
       string2[size2] = L'\0'; /* for a sentinel  */
+      FREE_VAR (is_binary);
     }
 
   /* We need to cast pattern to (wchar_t*), because we casted this compiled
@@ -5601,6 +5594,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
       size2 = size1;
       string1 = 0;
       size1 = 0;
+#ifdef MBS_SUPPORT
+      mbs_offset2 = mbs_offset1;
+      csize2 = csize1;
+      mbs_offset1 = NULL;
+      csize1 = 0;
+#endif
     }
   end1 = string1 + size1;
   end2 = string2 + size2;
@@ -5615,6 +5614,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
     }
   else
     {
+      if (stop > csize1 + csize2)
+       stop = csize1 + csize2;
       end_match_1 = end1;
       mcnt = count_mbs_length(mbs_offset2, stop-csize1);
       end_match_2 = string2 + mcnt;
@@ -5992,19 +5993,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
               2*ranges_length + chars_length;
 
             /* match with char_class?  */
-           for (i = 0; i < char_class_length ; i++)
-              if (iswctype((wint_t)c, (wctype_t)(*workp++)))
-                goto char_set_matched;
+           for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
+             {
+               wctype_t wctype;
+               uintptr_t alignedp = ((uintptr_t)workp
+                                     + __alignof__(wctype_t) - 1)
+                                     & ~(uintptr_t)(__alignof__(wctype_t) - 1);
+               wctype = *((wctype_t*)alignedp);
+               workp += CHAR_CLASS_SIZE;
+               if (iswctype((wint_t)c, wctype))
+                 goto char_set_matched;
+             }
 
             /* match with collating_symbol?  */
 # ifdef _LIBC
            if (nrules != 0)
              {
+               const unsigned char *extra = (const unsigned char *)
+                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
                for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
                     workp++)
                  {
                    int32_t *wextra;
-                   wextra = (int32_t*) *workp++;
+                   wextra = (int32_t*)(extra + *workp++);
                    for (i = 0; i < *wextra; ++i)
                      if (TRANSLATE(d[i]) != wextra[1 + i])
                        break;
@@ -6124,7 +6136,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 
                /* Update d, however d will be incremented at
                   char_set_matched:, we decrement d here.  */
-               d = backup_d + (wint_t)cp - (wint_t)str_buf - 1;
+               d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
                if (d >= dend)
                  {
                    if (dend == end_match_2)
@@ -7081,14 +7093,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 
        case wordbeg:
           DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
-         if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
+         if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
+             && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
            break;
           goto fail;
 
        case wordend:
           DEBUG_PRINT1 ("EXECUTING wordend.\n");
          if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
-              && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
+              && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
            break;
           goto fail;