update from texinfo
[gnulib.git] / regex.c
diff --git a/regex.c b/regex.c
index 3b04fe3..b4f2d8d 100644 (file)
--- a/regex.c
+++ b/regex.c
@@ -19,9 +19,7 @@
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
    USA.         */
 
-/* BUGS:
-   - (x?)*y\1z should match both xxxxyxz and xxxyz.
-   TODO:
+/* TODO:
    - structure the opcode space into opcode+flag.
    - merge with glibc's regex.[ch].
    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
@@ -1520,26 +1518,6 @@ do {                                                                     \
     }                                                                  \
 } while (0)
 
-/* Discard a saved register off the stack.  */
-#define DISCARD_FAILURE_REG_OR_COUNT()                                 \
-do {                                                                   \
-  int reg = POP_FAILURE_INT ();                                                \
-  if (reg == -1)                                                       \
-    {                                                                  \
-      /* It's a counter.  */                                           \
-      POP_FAILURE_POINTER ();                                          \
-      reg = POP_FAILURE_INT ();                                                \
-      DEBUG_PRINT3 ("     Discard counter %p = %d\n", ptr, reg);       \
-    }                                                                  \
-  else                                                                 \
-    {                                                                  \
-      POP_FAILURE_POINTER ();                                          \
-      POP_FAILURE_POINTER ();                                          \
-      DEBUG_PRINT4 ("     Discard reg %d (spanning %p -> %p)\n",       \
-                   reg, regstart[reg], regend[reg]);                   \
-    }                                                                  \
-} while (0)
-
 /* Check that we are not stuck in an infinite loop.  */
 #define CHECK_INFINITE_LOOP(pat_cur, string_place)                     \
 do {                                                                   \
@@ -1553,16 +1531,15 @@ do {                                                                    \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
       if (FAILURE_PAT (failure) == pat_cur)                            \
        {                                                               \
-         while (fail_stack.frame < fail_stack.avail)                   \
-           DISCARD_FAILURE_REG_OR_COUNT ();                            \
-         goto fail;                                                    \
+         cycle = 1;                                                    \
+         break;                                                        \
        }                                                               \
       DEBUG_PRINT2 ("  Other pattern: %p\n", FAILURE_PAT (failure));   \
       failure = NEXT_FAILURE_HANDLE(failure);                          \
     }                                                                  \
   DEBUG_PRINT2 ("  Other string: %p\n", FAILURE_STR (failure));                \
 } while (0)
-    
+
 /* Push the information about the state we will need
    if we ever fail back to it.
 
@@ -1834,7 +1811,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
 
 /* But patterns can have more than `MAX_REGNUM' registers.  We just
    ignore the excess.  */
-typedef unsigned regnum_t;
+typedef int regnum_t;
 
 
 /* Macros for the compile stack.  */
@@ -2145,9 +2122,10 @@ set_image_of_range_1 (work_area, start, end, translate)
 
   if (!RE_TRANSLATE_P (translate))
     {
+      EXTEND_RANGE_TABLE (work_area, 2);
       work_area->table[work_area->used++] = (start);
       work_area->table[work_area->used++] = (end);
-      return;
+      return -1;
     }
 
   eqv_table = XCHAR_TABLE (translate)->extras[2];
@@ -2253,12 +2231,14 @@ set_image_of_range_1 (work_area, start, end, translate)
 
 #endif /* emacs */
 
-/* We need to find the image of the range start..end when passed through
+/* Record the the image of the range start..end when passed through
    TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
    and is not even necessarily contiguous.
-   We approximate it with the smallest contiguous range that contains
-   all the chars we need.  However, that is not good enough for Latin-1,
-   so we do a better job in that case.
+   Normally we approximate it with the smallest contiguous range that contains
+   all the chars we need.  However, for Latin-1 we go to extra effort
+   to do a better job.
+
+   This function is not called for ASCII ranges.
 
    Returns -1 if successful, REG_ESPACE if ran out of space.  */
 
@@ -2273,12 +2253,17 @@ set_image_of_range (work_area, start, end, translate)
 #ifdef emacs
   /* For Latin-1 ranges, use set_image_of_range_1
      to get proper handling of ranges that include letters and nonletters.
-     For ASCII, this is not necessary.
+     For a range that includes the whole of Latin-1, this is not necessary.
      For other character sets, we don't bother to get this right.  */
-  if (start < 04400 && end > 0200)
+  if (RE_TRANSLATE_P (translate) && start < 04400
+      && !(start < 04200 && end >= 04377))
     {
+      int newend;
       int tem;
-      tem = set_image_of_range_1 (work_area, start, end, translate);
+      newend = end;
+      if (newend > 04377)
+       newend = 04377;
+      tem = set_image_of_range_1 (work_area, start, newend, translate);
       if (tem > 0)
        return tem;
 
@@ -2288,19 +2273,38 @@ set_image_of_range (work_area, start, end, translate)
     }
 #endif
 
-  cmin = TRANSLATE (start), cmax = TRANSLATE (end);
+  EXTEND_RANGE_TABLE (work_area, 2);
+  work_area->table[work_area->used++] = (start);
+  work_area->table[work_area->used++] = (end);
+
+  cmin = -1, cmax = -1;
 
   if (RE_TRANSLATE_P (translate))
-    for (; start <= end; start++)
-      {
-       re_wchar_t c = TRANSLATE (start);
-       cmin = MIN (cmin, c);
-       cmax = MAX (cmax, c);
-      }
+    {
+      int ch;
 
-  EXTEND_RANGE_TABLE (work_area, 2);
-  work_area->table[work_area->used++] = (cmin);
-  work_area->table[work_area->used++] = (cmax);
+      for (ch = start; ch <= end; ch++)
+       {
+         re_wchar_t c = TRANSLATE (ch);
+         if (! (start <= c && c <= end))
+           {
+             if (cmin == -1)
+               cmin = c, cmax = c;
+             else
+               {
+                 cmin = MIN (cmin, c);
+                 cmax = MAX (cmax, c);
+               }
+           }
+       }
+
+      if (cmin != -1)
+       {
+         EXTEND_RANGE_TABLE (work_area, 2);
+         work_area->table[work_area->used++] = (cmin);
+         work_area->table[work_area->used++] = (cmax);
+       }
+    }
 
   return -1;
 }
@@ -2618,8 +2622,8 @@ regex_compile (pattern, size, syntax, bufp)
                    unsigned int startoffset = 0;
                    re_opcode_t ofj =
                      /* Check if the loop can match the empty string.  */
-                     (simple || !analyse_first (laststart, b, NULL, 0)) ?
-                     on_failure_jump : on_failure_jump_loop;
+                     (simple || !analyse_first (laststart, b, NULL, 0))
+                     on_failure_jump : on_failure_jump_loop;
                    assert (skip_one_char (laststart) <= b);
                    
                    if (!zero_times_ok && simple)
@@ -2667,8 +2671,9 @@ regex_compile (pattern, size, syntax, bufp)
                  {
                    boolean emptyp = analyse_first (laststart, b, NULL, 0);
 
-                   /* The non-greedy multiple match looks like a repeat..until:
-                      we only need a conditional jump at the end of the loop */
+                   /* The non-greedy multiple match looks like
+                      a repeat..until: we only need a conditional jump
+                      at the end of the loop.  */
                    if (emptyp) BUF_PUSH (no_op);
                    STORE_JUMP (emptyp ? on_failure_jump_nastyloop
                                : on_failure_jump, b, laststart);
@@ -2677,7 +2682,7 @@ regex_compile (pattern, size, syntax, bufp)
                      {
                        /* The repeat...until naturally matches one or more.
                           To also match zero times, we need to first jump to
-                          the end of the loop (its conditional jump). */
+                          the end of the loop (its conditional jump).  */
                        INSERT_JUMP (jump, laststart, b);
                        b += 3;
                      }
@@ -3214,99 +3219,99 @@ regex_compile (pattern, size, syntax, bufp)
                      goto unfetch_interval;
                  }
 
-                if (upper_bound == 0)
-                  /* If the upper bound is zero, just drop the sub pattern
-                     altogether.  */
-                  b = laststart;
-                else if (lower_bound == 1 && upper_bound == 1)
-                  /* Just match it once: nothing to do here.  */
-                  ;
-
-                /* Otherwise, we have a nontrivial interval.  When
-                   we're all done, the pattern will look like:
-                     set_number_at <jump count> <upper bound>
-                     set_number_at <succeed_n count> <lower bound>
-                     succeed_n <after jump addr> <succeed_n count>
-                     <body of loop>
-                     jump_n <succeed_n addr> <jump count>
-                   (The upper bound and `jump_n' are omitted if
-                   `upper_bound' is 1, though.)  */
-                else
-                  { /* If the upper bound is > 1, we need to insert
-                       more at the end of the loop.  */
-                    unsigned int nbytes = (upper_bound < 0 ? 3
-                                           : upper_bound > 1 ? 5 : 0);
-                    unsigned int startoffset = 0;
-
-                    GET_BUFFER_SPACE (20); /* We might use less.  */
-
-                    if (lower_bound == 0)
-                      {
-                        /* A succeed_n that starts with 0 is really a
-                           a simple on_failure_jump_loop.  */
-                        INSERT_JUMP (on_failure_jump_loop, laststart,
-                                     b + 3 + nbytes);
-                        b += 3;
-                      }
-                    else
-                      {
-                        /* Initialize lower bound of the `succeed_n', even
-                           though it will be set during matching by its
-                           attendant `set_number_at' (inserted next),
-                           because `re_compile_fastmap' needs to know.
-                           Jump to the `jump_n' we might insert below.  */
-                        INSERT_JUMP2 (succeed_n, laststart,
-                                      b + 5 + nbytes,
-                                      lower_bound);
-                        b += 5;
-
-                        /* Code to initialize the lower bound.  Insert
-                           before the `succeed_n'.      The `5' is the last two
-                           bytes of this `set_number_at', plus 3 bytes of
-                           the following `succeed_n'.  */
-                        insert_op2 (set_number_at, laststart, 5, lower_bound, b);
-                        b += 5;
-                        startoffset += 5;
-                      }
-
-                    if (upper_bound < 0)
-                      {
-                        /* A negative upper bound stands for infinity,
-                           in which case it degenerates to a plain jump.  */
-                        STORE_JUMP (jump, b, laststart + startoffset);
-                        b += 3;
-                      }
-                    else if (upper_bound > 1)
-                      { /* More than one repetition is allowed, so
-                           append a backward jump to the `succeed_n'
-                           that starts this interval.
-
-                           When we've reached this during matching,
-                           we'll have matched the interval once, so
-                           jump back only `upper_bound - 1' times.  */
-                        STORE_JUMP2 (jump_n, b, laststart + startoffset,
-                                     upper_bound - 1);
-                        b += 5;
-
-                        /* The location we want to set is the second
-                           parameter of the `jump_n'; that is `b-2' as
-                           an absolute address.  `laststart' will be
-                           the `set_number_at' we're about to insert;
-                           `laststart+3' the number to set, the source
-                           for the relative address.  But we are
-                           inserting into the middle of the pattern --
-                           so everything is getting moved up by 5.
-                           Conclusion: (b - 2) - (laststart + 3) + 5,
-                           i.e., b - laststart.
-
-                           We insert this at the beginning of the loop
-                           so that if we fail during matching, we'll
-                           reinitialize the bounds.  */
-                        insert_op2 (set_number_at, laststart, b - laststart,
-                                    upper_bound - 1, b);
-                        b += 5;
-                      }
-                  }
+               if (upper_bound == 0)
+                 /* If the upper bound is zero, just drop the sub pattern
+                    altogether.  */
+                 b = laststart;
+               else if (lower_bound == 1 && upper_bound == 1)
+                 /* Just match it once: nothing to do here.  */
+                 ;
+
+               /* Otherwise, we have a nontrivial interval.  When
+                  we're all done, the pattern will look like:
+                  set_number_at <jump count> <upper bound>
+                  set_number_at <succeed_n count> <lower bound>
+                  succeed_n <after jump addr> <succeed_n count>
+                  <body of loop>
+                  jump_n <succeed_n addr> <jump count>
+                  (The upper bound and `jump_n' are omitted if
+                  `upper_bound' is 1, though.)  */
+               else
+                 { /* If the upper bound is > 1, we need to insert
+                      more at the end of the loop.  */
+                   unsigned int nbytes = (upper_bound < 0 ? 3
+                                          : upper_bound > 1 ? 5 : 0);
+                   unsigned int startoffset = 0;
+
+                   GET_BUFFER_SPACE (20); /* We might use less.  */
+
+                   if (lower_bound == 0)
+                     {
+                       /* A succeed_n that starts with 0 is really a
+                          a simple on_failure_jump_loop.  */
+                       INSERT_JUMP (on_failure_jump_loop, laststart,
+                                    b + 3 + nbytes);
+                       b += 3;
+                     }
+                   else
+                     {
+                       /* Initialize lower bound of the `succeed_n', even
+                          though it will be set during matching by its
+                          attendant `set_number_at' (inserted next),
+                          because `re_compile_fastmap' needs to know.
+                          Jump to the `jump_n' we might insert below.  */
+                       INSERT_JUMP2 (succeed_n, laststart,
+                                     b + 5 + nbytes,
+                                     lower_bound);
+                       b += 5;
+
+                       /* Code to initialize the lower bound.  Insert
+                          before the `succeed_n'.       The `5' is the last two
+                          bytes of this `set_number_at', plus 3 bytes of
+                          the following `succeed_n'.  */
+                       insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+                       b += 5;
+                       startoffset += 5;
+                     }
+
+                   if (upper_bound < 0)
+                     {
+                       /* A negative upper bound stands for infinity,
+                          in which case it degenerates to a plain jump.  */
+                       STORE_JUMP (jump, b, laststart + startoffset);
+                       b += 3;
+                     }
+                   else if (upper_bound > 1)
+                     { /* More than one repetition is allowed, so
+                          append a backward jump to the `succeed_n'
+                          that starts this interval.
+
+                          When we've reached this during matching,
+                          we'll have matched the interval once, so
+                          jump back only `upper_bound - 1' times.  */
+                       STORE_JUMP2 (jump_n, b, laststart + startoffset,
+                                    upper_bound - 1);
+                       b += 5;
+
+                       /* The location we want to set is the second
+                          parameter of the `jump_n'; that is `b-2' as
+                          an absolute address.  `laststart' will be
+                          the `set_number_at' we're about to insert;
+                          `laststart+3' the number to set, the source
+                          for the relative address.  But we are
+                          inserting into the middle of the pattern --
+                          so everything is getting moved up by 5.
+                          Conclusion: (b - 2) - (laststart + 3) + 5,
+                          i.e., b - laststart.
+
+                          We insert this at the beginning of the loop
+                          so that if we fail during matching, we'll
+                          reinitialize the bounds.  */
+                       insert_op2 (set_number_at, laststart, b - laststart,
+                                   upper_bound - 1, b);
+                       b += 5;
+                     }
+                 }
                pending_exact = 0;
                beg_interval = NULL;
              }
@@ -5491,7 +5496,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
             cycle detection cannot work.  Worse yet, such a detection
             can not only fail to detect a cycle, but it can also wrongly
             detect a cycle (between different instantiations of the same
-            loop.
+            loop).
             So the method used for those nasty loops is a little different:
             We use a special cycle-detection-stack-frame which is pushed
             when the on_failure_jump_nastyloop failure-point is *popped*.
@@ -5505,11 +5510,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                        mcnt, p + mcnt);
 
          assert ((re_opcode_t)p[-4] == no_op);
-         CHECK_INFINITE_LOOP (p - 4, d);
-         PUSH_FAILURE_POINT (p - 3, d);
+         {
+           int cycle = 0;
+           CHECK_INFINITE_LOOP (p - 4, d);
+           if (!cycle)
+             /* If there's a cycle, just continue without pushing
+                this failure point.  The failure point is the "try again"
+                option, which shouldn't be tried.
+                We want (x?)*?y\1z to match both xxyz and xxyxz.  */
+             PUSH_FAILURE_POINT (p - 3, d);
+         }
          break;
 
-
          /* Simple loop detecting on_failure_jump:  just check on the
             failure stack if the same spot was already hit earlier.  */
        case on_failure_jump_loop:
@@ -5517,9 +5529,19 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
          EXTRACT_NUMBER_AND_INCR (mcnt, p);
          DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
                        mcnt, p + mcnt);
-
-         CHECK_INFINITE_LOOP (p - 3, d);
-         PUSH_FAILURE_POINT (p - 3, d);
+         {
+           int cycle = 0;
+           CHECK_INFINITE_LOOP (p - 3, d);
+           if (cycle)
+             /* If there's a cycle, get out of the loop, as if the matching
+                had failed.  We used to just `goto fail' here, but that was
+                aborting the search a bit too early: we want to keep the
+                empty-loop-match and keep matching after the loop.
+                We want (x?)*y\1z to match both xxyz and xxyxz.  */
+             p += mcnt;
+           else
+             PUSH_FAILURE_POINT (p - 3, d);
+         }
          break;