X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=f55cc5aeb614b7249d54680b2c5529ce82ea40c9;hb=92b5babed0ef62fb044a2b9b7e331b743158c304;hp=1f6b8a2c1caad17823e7257d566cecfa51025201;hpb=4c90fe8b1d4564fd3843649c098b2dc78503670f;p=gnulib.git diff --git a/regex.c b/regex.c index 1f6b8a2c1..f55cc5aeb 100644 --- a/regex.c +++ b/regex.c @@ -22,19 +22,17 @@ /* TODO: - structure the opcode space into opcode+flag. - merge with glibc's regex.[ch]. - - replace succeed_n + jump_n with a combined operation so that the counter - can simply be decremented when popping the failure_point without having - to stack up failure_count entries. - */ + - replace (succeed_n + jump_n + set_number_at) with something that doesn't + need to modify the compiled regexp so that re_match can be reentrant. + - get rid of on_failure_jump_smart by doing the optimization in re_comp + rather than at run-time, so that re_match can be reentrant. +*/ /* AIX requires this to be the first thing in the file. */ #if defined _AIX && !defined REGEX_MALLOC #pragma alloca #endif -#undef _GNU_SOURCE -#define _GNU_SOURCE - #ifdef HAVE_CONFIG_H # include #endif @@ -48,8 +46,12 @@ /* Whether to use ISO C Amendment 1 wide char functions. Those should not be used for Emacs since it uses its own. */ +#if defined _LIBC +#define WIDE_CHAR_SUPPORT 1 +#else #define WIDE_CHAR_SUPPORT \ - (defined _LIBC || HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) + (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) +#endif /* For platform which support the ISO C amendement 1 functionality we support user defined character classes. */ @@ -123,8 +125,17 @@ # include "charset.h" # include "category.h" +# ifdef malloc +# undef malloc +# endif # define malloc xmalloc +# ifdef realloc +# undef realloc +# endif # define realloc xrealloc +# ifdef free +# undef free +# endif # define free xfree /* Converts the pointer to the char to BEG-based offset from the start. */ @@ -146,8 +157,9 @@ { \ re_char *dtemp = (p) == (str2) ? (end1) : (p); \ re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ - while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ - c = STRING_CHAR (dtemp, (p) - dtemp); \ + re_char *d0 = dtemp; \ + PREV_CHAR_BOUNDARY (d0, dlimit); \ + c = STRING_CHAR (d0, dtemp - d0); \ } \ else \ (c = ((p) == (str2) ? (end1) : (p))[-1]); \ @@ -224,6 +236,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 }; # define SINGLE_BYTE_CHAR_P(c) (1) # define SAME_CHARSET_P(c1, c2) (1) # define MULTIBYTE_FORM_LENGTH(p, s) (1) +# define PREV_CHAR_BOUNDARY(p, limit) ((p)--) # define STRING_CHAR(p, s) (*(p)) # define RE_STRING_CHAR STRING_CHAR # define CHAR_STRING(c, s) (*(s) = (c), 1) @@ -553,7 +566,7 @@ typedef enum is followed by a range table: 2 bytes of flags for character sets (low 8 bits, high 8 bits) See RANGE_TABLE_WORK_BITS below. - 2 bytes, the number of pairs that follow + 2 bytes, the number of pairs that follow (upto 32767) pairs, each 2 multibyte characters, each multibyte character represented as 3 bytes. */ charset, @@ -700,7 +713,7 @@ static void extract_number _RE_ARGS ((int *dest, re_char *source)); static void extract_number (dest, source) int *dest; - unsigned char *source; + re_char *source; { int temp = SIGN_EXTEND_CHAR (*(source + 1)); *dest = *source & 0377; @@ -729,7 +742,7 @@ static void extract_number_and_incr _RE_ARGS ((int *destination, static void extract_number_and_incr (destination, source) int *destination; - unsigned char **source; + re_char **source; { extract_number (destination, *source); *source += 2; @@ -803,9 +816,9 @@ extract_number_and_incr (destination, source) #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ do \ { \ - int range_start, range_end; \ - unsigned char *p; \ - unsigned char *range_table_end \ + re_wchar_t range_start, range_end; \ + re_char *p; \ + re_char *range_table_end \ = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ \ for (p = (range_table); p < range_table_end; p += 2 * 3) \ @@ -829,8 +842,8 @@ extract_number_and_incr (destination, source) { \ /* Number of ranges in range table. */ \ int count; \ - unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \ - \ + re_char *range_table = CHARSET_RANGE_TABLE (charset); \ + \ EXTRACT_NUMBER_AND_INCR (count, range_table); \ CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ } \ @@ -899,59 +912,58 @@ print_fastmap (fastmap) void print_partial_compiled_pattern (start, end) - unsigned char *start; - unsigned char *end; + re_char *start; + re_char *end; { int mcnt, mcnt2; - unsigned char *p = start; - unsigned char *pend = end; + re_char *p = start; + re_char *pend = end; if (start == NULL) { - printf ("(null)\n"); + fprintf (stderr, "(null)\n"); return; } /* Loop over pattern commands. */ while (p < pend) { - printf ("%d:\t", p - start); + fprintf (stderr, "%d:\t", p - start); switch ((re_opcode_t) *p++) { case no_op: - printf ("/no_op"); + fprintf (stderr, "/no_op"); break; case succeed: - printf ("/succeed"); + fprintf (stderr, "/succeed"); break; case exactn: mcnt = *p++; - printf ("/exactn/%d", mcnt); + fprintf (stderr, "/exactn/%d", mcnt); do { - putchar ('/'); - putchar (*p++); + fprintf (stderr, "/%c", *p++); } while (--mcnt); break; case start_memory: - printf ("/start_memory/%d", *p++); + fprintf (stderr, "/start_memory/%d", *p++); break; case stop_memory: - printf ("/stop_memory/%d", *p++); + fprintf (stderr, "/stop_memory/%d", *p++); break; case duplicate: - printf ("/duplicate/%d", *p++); + fprintf (stderr, "/duplicate/%d", *p++); break; case anychar: - printf ("/anychar"); + fprintf (stderr, "/anychar"); break; case charset: @@ -962,7 +974,7 @@ print_partial_compiled_pattern (start, end) int length = CHARSET_BITMAP_SIZE (p - 1); int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1); - printf ("/charset [%s", + fprintf (stderr, "/charset [%s", (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); assert (p + *p < pend); @@ -974,33 +986,33 @@ print_partial_compiled_pattern (start, end) /* Are we starting a range? */ if (last + 1 == c && ! in_range) { - putchar ('-'); + fprintf (stderr, "-"); in_range = 1; } /* Have we broken a range? */ else if (last + 1 != c && in_range) { - putchar (last); + fprintf (stderr, "%c", last); in_range = 0; } if (! in_range) - putchar (c); + fprintf (stderr, "%c", c); last = c; } if (in_range) - putchar (last); + fprintf (stderr, "%c", last); - putchar (']'); + fprintf (stderr, "]"); p += 1 + length; if (has_range_table) { int count; - printf ("has-range-table"); + fprintf (stderr, "has-range-table"); /* ??? Should print the range table; for now, just skip it. */ p += 2; /* skip range table bits */ @@ -1011,130 +1023,130 @@ print_partial_compiled_pattern (start, end) break; case begline: - printf ("/begline"); + fprintf (stderr, "/begline"); break; case endline: - printf ("/endline"); + fprintf (stderr, "/endline"); break; case on_failure_jump: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_jump to %d", p + mcnt - start); + fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start); break; case on_failure_keep_string_jump: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); + fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start); break; case on_failure_jump_nastyloop: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start); + fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start); break; case on_failure_jump_loop: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_jump_loop to %d", p + mcnt - start); + fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start); break; case on_failure_jump_smart: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_jump_smart to %d", p + mcnt - start); + fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start); break; case jump: extract_number_and_incr (&mcnt, &p); - printf ("/jump to %d", p + mcnt - start); + fprintf (stderr, "/jump to %d", p + mcnt - start); break; case succeed_n: extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); - printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2); + fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2); break; case jump_n: extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); - printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2); + fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2); break; case set_number_at: extract_number_and_incr (&mcnt, &p); extract_number_and_incr (&mcnt2, &p); - printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2); + fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2); break; case wordbound: - printf ("/wordbound"); + fprintf (stderr, "/wordbound"); break; case notwordbound: - printf ("/notwordbound"); + fprintf (stderr, "/notwordbound"); break; case wordbeg: - printf ("/wordbeg"); + fprintf (stderr, "/wordbeg"); break; case wordend: - printf ("/wordend"); + fprintf (stderr, "/wordend"); case syntaxspec: - printf ("/syntaxspec"); + fprintf (stderr, "/syntaxspec"); mcnt = *p++; - printf ("/%d", mcnt); + fprintf (stderr, "/%d", mcnt); break; case notsyntaxspec: - printf ("/notsyntaxspec"); + fprintf (stderr, "/notsyntaxspec"); mcnt = *p++; - printf ("/%d", mcnt); + fprintf (stderr, "/%d", mcnt); break; # ifdef emacs case before_dot: - printf ("/before_dot"); + fprintf (stderr, "/before_dot"); break; case at_dot: - printf ("/at_dot"); + fprintf (stderr, "/at_dot"); break; case after_dot: - printf ("/after_dot"); + fprintf (stderr, "/after_dot"); break; case categoryspec: - printf ("/categoryspec"); + fprintf (stderr, "/categoryspec"); mcnt = *p++; - printf ("/%d", mcnt); + fprintf (stderr, "/%d", mcnt); break; case notcategoryspec: - printf ("/notcategoryspec"); + fprintf (stderr, "/notcategoryspec"); mcnt = *p++; - printf ("/%d", mcnt); + fprintf (stderr, "/%d", mcnt); break; # endif /* emacs */ case begbuf: - printf ("/begbuf"); + fprintf (stderr, "/begbuf"); break; case endbuf: - printf ("/endbuf"); + fprintf (stderr, "/endbuf"); break; default: - printf ("?%d", *(p-1)); + fprintf (stderr, "?%d", *(p-1)); } - putchar ('\n'); + fprintf (stderr, "\n"); } - printf ("%d:\tend of pattern.\n", p - start); + fprintf (stderr, "%d:\tend of pattern.\n", p - start); } @@ -1142,7 +1154,7 @@ void print_compiled_pattern (bufp) struct re_pattern_buffer *bufp; { - unsigned char *buffer = bufp->buffer; + re_char *buffer = bufp->buffer; print_partial_compiled_pattern (buffer, buffer + bufp->used); printf ("%ld bytes used/%ld bytes allocated.\n", @@ -1313,7 +1325,8 @@ static const char *re_error_msgid[] = /* Roughly the maximum number of failure points on the stack. Would be exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. This is a variable only so users of regex can assign to it; we never - change it ourselves. */ + change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE + before using it, so it should probably be a byte-count instead. */ # if defined MATCH_MAY_ALLOCATE /* Note that 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. In order for a larger @@ -1326,7 +1339,7 @@ size_t re_max_failures = 4000; union fail_stack_elt { - const unsigned char *pointer; + re_char *pointer; /* This should be the biggest `int' that's no bigger than a pointer. */ long integer; }; @@ -1341,7 +1354,6 @@ typedef struct size_t frame; /* Offset of the cur constructed frame. */ } fail_stack_type; -#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0) #define FAIL_STACK_EMPTY() (fail_stack.frame == 0) #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) @@ -1413,22 +1425,11 @@ typedef struct 1))) -/* Push pointer POINTER on FAIL_STACK. - Return 1 if was able to do so and 0 if ran out of memory allocating - space to do so. */ -#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ - ((FAIL_STACK_FULL () \ - && !GROW_FAIL_STACK (FAIL_STACK)) \ - ? 0 \ - : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ - 1)) -#define POP_PATTERN_OP() POP_FAILURE_POINTER () - /* Push a pointer value onto the failure stack. Assumes the variable `fail_stack'. Probably should only be called from within `PUSH_FAILURE_POINT'. */ #define PUSH_FAILURE_POINTER(item) \ - fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + fail_stack.stack[fail_stack.avail++].pointer = (item) /* This pushes an integer-valued item onto the failure stack. Assumes the variable `fail_stack'. Probably should only @@ -1478,16 +1479,19 @@ do { \ PUSH_FAILURE_INT (num); \ } while (0) -#define PUSH_FAILURE_COUNT(ptr) \ +/* Change the counter's value to VAL, but make sure that it will + be reset when backtracking. */ +#define PUSH_NUMBER(ptr,val) \ do { \ char *destination; \ int c; \ ENSURE_FAIL_STACK(3); \ EXTRACT_NUMBER (c, ptr); \ - DEBUG_PRINT3 (" Push counter %p = %d\n", ptr, c); \ + DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \ PUSH_FAILURE_INT (c); \ PUSH_FAILURE_POINTER (ptr); \ PUSH_FAILURE_INT (-1); \ + STORE_NUMBER (ptr, val); \ } while (0) /* Pop a saved register off the stack. */ @@ -1497,6 +1501,7 @@ do { \ if (reg == -1) \ { \ /* It's a counter. */ \ + /* Here, we discard `const', making re_match non-reentrant. */ \ unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \ reg = POP_FAILURE_INT (); \ STORE_NUMBER (ptr, reg); \ @@ -1514,22 +1519,25 @@ do { \ /* Check that we are not stuck in an infinite loop. */ #define CHECK_INFINITE_LOOP(pat_cur, string_place) \ do { \ - int failure = TOP_FAILURE_HANDLE(); \ + int failure = TOP_FAILURE_HANDLE (); \ /* Check for infinite matching loops */ \ - while (failure > 0 && \ - (FAILURE_STR (failure) == string_place \ - || FAILURE_STR (failure) == NULL)) \ + while (failure > 0 \ + && (FAILURE_STR (failure) == string_place \ + || FAILURE_STR (failure) == NULL)) \ { \ assert (FAILURE_PAT (failure) >= bufp->buffer \ && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \ if (FAILURE_PAT (failure) == pat_cur) \ - goto fail; \ + { \ + cycle = 1; \ + break; \ + } \ DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \ failure = NEXT_FAILURE_HANDLE(failure); \ } \ DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \ } while (0) - + /* Push the information about the state we will need if we ever fail back to it. @@ -1573,7 +1581,7 @@ do { \ /* Estimate the size of data pushed by a typical failure stack entry. An estimate is all we need, because all we use this for is to choose a limit for how big to make the failure stack. */ - +/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */ #define TYPICAL_FAILURE_SIZE 20 /* How many items can still be added to the stack without overflowing it. */ @@ -1603,14 +1611,14 @@ do { \ while (fail_stack.frame < fail_stack.avail) \ POP_FAILURE_REG_OR_COUNT (); \ \ - pat = (unsigned char *) POP_FAILURE_POINTER (); \ + pat = POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping pattern %p: ", pat); \ DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ \ /* If the saved string location is NULL, it came from an \ on_failure_keep_string_jump opcode, and we want to throw away the \ saved NULL, thus retaining our current position in the string. */ \ - str = (re_char *) POP_FAILURE_POINTER (); \ + str = POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping string %p: `", str); \ DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ DEBUG_PRINT1 ("'\n"); \ @@ -1641,29 +1649,19 @@ static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)); static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)); -static boolean at_begline_loc_p _RE_ARGS ((const unsigned char *pattern, - const unsigned char *p, +static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern, + re_char *p, reg_syntax_t syntax)); -static boolean at_endline_loc_p _RE_ARGS ((const unsigned char *p, - const unsigned char *pend, +static boolean at_endline_loc_p _RE_ARGS ((re_char *p, + re_char *pend, reg_syntax_t syntax)); -static unsigned char *skip_one_char _RE_ARGS ((unsigned char *p)); -static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, +static re_char *skip_one_char _RE_ARGS ((re_char *p)); +static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, char *fastmap, const int multibyte)); -/* Fetch the next character in the uncompiled pattern---translating it - if necessary. Also cast from a signed character in the constant - string passed to us by the user to an unsigned char that we can use - as an array index (in, e.g., `translate'). */ -#define PATFETCH(c) \ - do { \ - PATFETCH_RAW (c); \ - c = TRANSLATE (c); \ - } while (0) - /* Fetch the next character in the uncompiled pattern, with no translation. */ -#define PATFETCH_RAW(c) \ +#define PATFETCH(c) \ do { \ int len; \ if (p == pend) return REG_EEND; \ @@ -1689,7 +1687,7 @@ static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, /* Make sure we have at least N more bytes of space in buffer. */ #define GET_BUFFER_SPACE(n) \ - while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ + while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \ EXTEND_BUFFER () /* Make sure we have one more byte of buffer space and then add C to it. */ @@ -1778,13 +1776,13 @@ static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, #endif #define EXTEND_BUFFER() \ do { \ - unsigned char *old_buffer = bufp->buffer; \ + re_char *old_buffer = bufp->buffer; \ if (bufp->allocated == MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ bufp->allocated = MAX_BUF_SIZE; \ - bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \ if (bufp->buffer == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ @@ -1811,7 +1809,7 @@ static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, /* But patterns can have more than `MAX_REGNUM' registers. We just ignore the excess. */ -typedef unsigned regnum_t; +typedef int regnum_t; /* Macros for the compile stack. */ @@ -1846,7 +1844,17 @@ typedef struct /* The next available element. */ #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) - +/* Explicit quit checking is only used on NTemacs. */ +#if defined WINDOWSNT && defined emacs && defined QUIT +extern int immediate_quit; +# define IMMEDIATE_QUIT_CHECK \ + do { \ + if (immediate_quit) QUIT; \ + } while (0) +#else +# define IMMEDIATE_QUIT_CHECK ((void)0) +#endif + /* Structure to manage work area for range table. */ struct range_table_work_area { @@ -1856,21 +1864,19 @@ struct range_table_work_area int bits; /* flag to record character classes */ }; -/* Make sure that WORK_AREA can hold more N multibyte characters. */ -#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \ - do { \ - if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \ - { \ - (work_area).allocated += 16 * sizeof (int); \ - if ((work_area).table) \ - (work_area).table \ - = (int *) realloc ((work_area).table, (work_area).allocated); \ - else \ - (work_area).table \ - = (int *) malloc ((work_area).allocated); \ - if ((work_area).table == 0) \ - FREE_STACK_RETURN (REG_ESPACE); \ - } \ +/* Make sure that WORK_AREA can hold more N multibyte characters. + This is used only in set_image_of_range and set_image_of_range_1. + It expects WORK_AREA to be a pointer. + If it can't get the space, it returns from the surrounding function. */ + +#define EXTEND_RANGE_TABLE(work_area, n) \ + do { \ + if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \ + { \ + extend_range_table_work_area (work_area); \ + if ((work_area)->table == 0) \ + return (REG_ESPACE); \ + } \ } while (0) #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \ @@ -1885,12 +1891,15 @@ struct range_table_work_area #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 -/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ -#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ +/* Set a range START..END to WORK_AREA. + The range is passed through TRANSLATE, so START and END + should be untranslated. */ +#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \ do { \ - EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ - (work_area).table[(work_area).used++] = (range_start); \ - (work_area).table[(work_area).used++] = (range_end); \ + int tem; \ + tem = set_image_of_range (&work_area, start, end, translate); \ + if (tem > 0) \ + FREE_STACK_RETURN (tem); \ } while (0) /* Free allocated memory for WORK_AREA. */ @@ -1904,12 +1913,10 @@ struct range_table_work_area #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits) #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) - + /* Set the bit for character C in a list. */ -#define SET_LIST_BIT(c) \ - (b[((unsigned char) (c)) / BYTEWIDTH] \ - |= 1 << (((unsigned char) c) % BYTEWIDTH)) +#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH)) /* Get the next unsigned number in the uncompiled pattern. */ @@ -1917,18 +1924,26 @@ struct range_table_work_area do { if (p != pend) \ { \ PATFETCH (c); \ + if (c == ' ') \ + FREE_STACK_RETURN (REG_BADBR); \ while ('0' <= c && c <= '9') \ { \ + int prev; \ if (num < 0) \ - num = 0; \ + num = 0; \ + prev = num; \ num = num * 10 + c - '0'; \ + if (num / 10 != prev) \ + FREE_STACK_RETURN (REG_BADBR); \ if (p == pend) \ - break; \ + break; \ PATFETCH (c); \ } \ + if (c == ' ') \ + FREE_STACK_RETURN (REG_BADBR); \ } \ } while (0) - + #if WIDE_CHAR_SUPPORT /* The GNU C library provides support for user-defined character classes and the functions from ISO C amendement 1. */ @@ -1940,6 +1955,7 @@ struct range_table_work_area # define CHAR_CLASS_MAX_LENGTH 256 # endif typedef wctype_t re_wctype_t; +typedef wchar_t re_wchar_t; # define re_wctype wctype # define re_iswctype iswctype # define re_wctype_to_bit(cc) 0 @@ -1947,7 +1963,7 @@ typedef wctype_t re_wctype_t; # define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ # define btowc(c) c -/* Character classes' indices. */ +/* Character classes. */ typedef enum { RECC_ERROR = 0, RECC_ALNUM, RECC_ALPHA, RECC_WORD, RECC_GRAPH, RECC_PRINT, @@ -1959,11 +1975,14 @@ typedef enum { RECC_ERROR = 0, RECC_ASCII, RECC_UNIBYTE } re_wctype_t; +typedef int re_wchar_t; + /* Map a string to the char class it names (if any). */ static re_wctype_t -re_wctype (string) - unsigned char *string; +re_wctype (str) + re_char *str; { + const char *string = str; if (STREQ (string, "alnum")) return RECC_ALNUM; else if (STREQ (string, "alpha")) return RECC_ALPHA; else if (STREQ (string, "word")) return RECC_WORD; @@ -2010,6 +2029,8 @@ re_iswctype (ch, cc) case RECC_MULTIBYTE: return !ISUNIBYTE (ch); case RECC_WORD: return ISWORD (ch); case RECC_ERROR: return false; + default: + abort(); } } @@ -2030,15 +2051,261 @@ re_wctype_to_bit (cc) case RECC_SPACE: return BIT_SPACE; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; + default: + abort(); } } #endif + +/* Filling in the work area of a range. */ + +/* Actually extend the space in WORK_AREA. */ + +static void +extend_range_table_work_area (work_area) + struct range_table_work_area *work_area; +{ + work_area->allocated += 16 * sizeof (int); + if (work_area->table) + work_area->table + = (int *) realloc (work_area->table, work_area->allocated); + else + work_area->table + = (int *) malloc (work_area->allocated); +} + +#ifdef emacs + +/* Carefully find the ranges of codes that are equivalent + under case conversion to the range start..end when passed through + TRANSLATE. Handle the case where non-letters can come in between + two upper-case letters (which happens in Latin-1). + Also handle the case of groups of more than 2 case-equivalent chars. + + The basic method is to look at consecutive characters and see + if they can form a run that can be handled as one. + + Returns -1 if successful, REG_ESPACE if ran out of space. */ + +static int +set_image_of_range_1 (work_area, start, end, translate) + RE_TRANSLATE_TYPE translate; + struct range_table_work_area *work_area; + re_wchar_t start, end; +{ + /* `one_case' indicates a character, or a run of characters, + each of which is an isolate (no case-equivalents). + This includes all ASCII non-letters. + + `two_case' indicates a character, or a run of characters, + each of which has two case-equivalent forms. + This includes all ASCII letters. + + `strange' indicates a character that has more than one + case-equivalent. */ + + enum case_type {one_case, two_case, strange}; -/* QUIT is only used on NTemacs. */ -#if !defined WINDOWSNT || !defined emacs || !defined QUIT -# undef QUIT -# define QUIT + /* Describe the run that is in progress, + which the next character can try to extend. + If run_type is strange, that means there really is no run. + If run_type is one_case, then run_start...run_end is the run. + If run_type is two_case, then the run is run_start...run_end, + and the case-equivalents end at run_eqv_end. */ + + enum case_type run_type = strange; + int run_start, run_end, run_eqv_end; + + Lisp_Object eqv_table; + + if (!RE_TRANSLATE_P (translate)) + { + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = (start); + work_area->table[work_area->used++] = (end); + return -1; + } + + eqv_table = XCHAR_TABLE (translate)->extras[2]; + + for (; start <= end; start++) + { + enum case_type this_type; + int eqv = RE_TRANSLATE (eqv_table, start); + int minchar, maxchar; + + /* Classify this character */ + if (eqv == start) + this_type = one_case; + else if (RE_TRANSLATE (eqv_table, eqv) == start) + this_type = two_case; + else + this_type = strange; + + if (start < eqv) + minchar = start, maxchar = eqv; + else + minchar = eqv, maxchar = start; + + /* Can this character extend the run in progress? */ + if (this_type == strange || this_type != run_type + || !(minchar == run_end + 1 + && (run_type == two_case + ? maxchar == run_eqv_end + 1 : 1))) + { + /* No, end the run. + Record each of its equivalent ranges. */ + if (run_type == one_case) + { + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = run_start; + work_area->table[work_area->used++] = run_end; + } + else if (run_type == two_case) + { + EXTEND_RANGE_TABLE (work_area, 4); + work_area->table[work_area->used++] = run_start; + work_area->table[work_area->used++] = run_end; + work_area->table[work_area->used++] + = RE_TRANSLATE (eqv_table, run_start); + work_area->table[work_area->used++] + = RE_TRANSLATE (eqv_table, run_end); + } + run_type = strange; + } + + if (this_type == strange) + { + /* For a strange character, add each of its equivalents, one + by one. Don't start a range. */ + do + { + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = eqv; + work_area->table[work_area->used++] = eqv; + eqv = RE_TRANSLATE (eqv_table, eqv); + } + while (eqv != start); + } + + /* Add this char to the run, or start a new run. */ + else if (run_type == strange) + { + /* Initialize a new range. */ + run_type = this_type; + run_start = start; + run_end = start; + run_eqv_end = RE_TRANSLATE (eqv_table, run_end); + } + else + { + /* Extend a running range. */ + run_end = minchar; + run_eqv_end = RE_TRANSLATE (eqv_table, run_end); + } + } + + /* If a run is still in progress at the end, finish it now + by recording its equivalent ranges. */ + if (run_type == one_case) + { + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = run_start; + work_area->table[work_area->used++] = run_end; + } + else if (run_type == two_case) + { + EXTEND_RANGE_TABLE (work_area, 4); + work_area->table[work_area->used++] = run_start; + work_area->table[work_area->used++] = run_end; + work_area->table[work_area->used++] + = RE_TRANSLATE (eqv_table, run_start); + work_area->table[work_area->used++] + = RE_TRANSLATE (eqv_table, run_end); + } + + return -1; +} + +#endif /* emacs */ + +/* Record the the image of the range start..end when passed through + TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end) + and is not even necessarily contiguous. + Normally we approximate it with the smallest contiguous range that contains + all the chars we need. However, for Latin-1 we go to extra effort + to do a better job. + + This function is not called for ASCII ranges. + + Returns -1 if successful, REG_ESPACE if ran out of space. */ + +static int +set_image_of_range (work_area, start, end, translate) + RE_TRANSLATE_TYPE translate; + struct range_table_work_area *work_area; + re_wchar_t start, end; +{ + re_wchar_t cmin, cmax; + +#ifdef emacs + /* For Latin-1 ranges, use set_image_of_range_1 + to get proper handling of ranges that include letters and nonletters. + For a range that includes the whole of Latin-1, this is not necessary. + For other character sets, we don't bother to get this right. */ + if (RE_TRANSLATE_P (translate) && start < 04400 + && !(start < 04200 && end >= 04377)) + { + int newend; + int tem; + newend = end; + if (newend > 04377) + newend = 04377; + tem = set_image_of_range_1 (work_area, start, newend, translate); + if (tem > 0) + return tem; + + start = 04400; + if (end < 04400) + return -1; + } #endif + + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = (start); + work_area->table[work_area->used++] = (end); + + cmin = -1, cmax = -1; + + if (RE_TRANSLATE_P (translate)) + { + int ch; + + for (ch = start; ch <= end; ch++) + { + re_wchar_t c = TRANSLATE (ch); + if (! (start <= c && c <= end)) + { + if (cmin == -1) + cmin = c, cmax = c; + else + { + cmin = MIN (cmin, c); + cmax = MAX (cmax, c); + } + } + } + + if (cmin != -1) + { + EXTEND_RANGE_TABLE (work_area, 2); + work_area->table[work_area->used++] = (cmin); + work_area->table[work_area->used++] = (cmax); + } + } + + return -1; +} #ifndef MATCH_MAY_ALLOCATE @@ -2124,10 +2391,8 @@ regex_compile (pattern, size, syntax, bufp) reg_syntax_t syntax; struct re_pattern_buffer *bufp; { - /* We fetch characters from PATTERN here. Even though PATTERN is - `char *' (i.e., signed), we declare these variables as unsigned, so - they can be reliably used as array indices. */ - register unsigned int c, c1; + /* We fetch characters from PATTERN here. */ + register re_wchar_t c, c1; /* A random temporary spot in PATTERN. */ re_char *p1; @@ -2354,10 +2619,11 @@ regex_compile (pattern, size, syntax, bufp) boolean simple = skip_one_char (laststart) == b; unsigned int startoffset = 0; re_opcode_t ofj = - (simple || !analyse_first (laststart, b, NULL, 0)) ? - on_failure_jump : on_failure_jump_loop; + /* Check if the loop can match the empty string. */ + (simple || !analyse_first (laststart, b, NULL, 0)) + ? on_failure_jump : on_failure_jump_loop; assert (skip_one_char (laststart) <= b); - + if (!zero_times_ok && simple) { /* Since simple * loops can be made faster by using on_failure_keep_string_jump, we turn simple P+ @@ -2403,8 +2669,9 @@ regex_compile (pattern, size, syntax, bufp) { boolean emptyp = analyse_first (laststart, b, NULL, 0); - /* The non-greedy multiple match looks like a repeat..until: - we only need a conditional jump at the end of the loop */ + /* The non-greedy multiple match looks like + a repeat..until: we only need a conditional jump + at the end of the loop. */ if (emptyp) BUF_PUSH (no_op); STORE_JUMP (emptyp ? on_failure_jump_nastyloop : on_failure_jump, b, laststart); @@ -2413,7 +2680,7 @@ regex_compile (pattern, size, syntax, bufp) { /* The repeat...until naturally matches one or more. To also match zero times, we need to first jump to - the end of the loop (its conditional jump). */ + the end of the loop (its conditional jump). */ INSERT_JUMP (jump, laststart, b); b += 3; } @@ -2478,6 +2745,10 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* Don't translate yet. The range TRANSLATE(X..Y) cannot + always be determined from TRANSLATE(X) and TRANSLATE(Y) + So the translation is done later in a loop. Example: + (let ((case-fold-search t)) (string-match "[A-_]" "A")) */ PATFETCH (c); /* \ might escape characters inside [...] and [^...]. */ @@ -2537,7 +2808,7 @@ regex_compile (pattern, size, syntax, bufp) them). */ if (c == ':' && *p == ']') { - int ch; + re_wchar_t ch; re_wctype_t cc; cc = re_wctype (str); @@ -2606,11 +2877,11 @@ regex_compile (pattern, size, syntax, bufp) starting at the smallest character in the charset of C1 and ending at C1. */ int charset = CHAR_CHARSET (c1); - int c2 = MAKE_CHAR (charset, 0, 0); - + re_wchar_t c2 = MAKE_CHAR (charset, 0, 0); + SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1); - c1 = 377; + c1 = 0377; } } else if (!SAME_CHARSET_P (c, c1)) @@ -2624,8 +2895,8 @@ regex_compile (pattern, size, syntax, bufp) if (SINGLE_BYTE_CHAR_P (c)) /* ... into bitmap. */ { - unsigned this_char; - int range_start = c, range_end = c1; + re_wchar_t this_char; + re_wchar_t range_start = c, range_end = c1; /* If the start is after the end, the range is empty. */ if (range_start > range_end) @@ -2722,7 +2993,7 @@ regex_compile (pattern, size, syntax, bufp) /* Do not translate the character after the \, so that we can distinguish, e.g., \B from \b, even if we normally would translate, e.g., B to b. */ - PATFETCH_RAW (c); + PATFETCH (c); switch (c) { @@ -2946,99 +3217,99 @@ regex_compile (pattern, size, syntax, bufp) goto unfetch_interval; } - if (upper_bound == 0) - /* If the upper bound is zero, just drop the sub pattern - altogether. */ - b = laststart; - else if (lower_bound == 1 && upper_bound == 1) - /* Just match it once: nothing to do here. */ - ; - - /* Otherwise, we have a nontrivial interval. When - we're all done, the pattern will look like: - set_number_at - set_number_at - succeed_n - - jump_n - (The upper bound and `jump_n' are omitted if - `upper_bound' is 1, though.) */ - else - { /* If the upper bound is > 1, we need to insert - more at the end of the loop. */ - unsigned int nbytes = (upper_bound < 0 ? 3 - : upper_bound > 1 ? 5 : 0); - unsigned int startoffset = 0; - - GET_BUFFER_SPACE (20); /* We might use less. */ - - if (lower_bound == 0) - { - /* A succeed_n that starts with 0 is really a - a simple on_failure_jump_loop. */ - INSERT_JUMP (on_failure_jump_loop, laststart, - b + 3 + nbytes); - b += 3; - } - else - { - /* Initialize lower bound of the `succeed_n', even - though it will be set during matching by its - attendant `set_number_at' (inserted next), - because `re_compile_fastmap' needs to know. - Jump to the `jump_n' we might insert below. */ - INSERT_JUMP2 (succeed_n, laststart, - b + 5 + nbytes, - lower_bound); - b += 5; - - /* Code to initialize the lower bound. Insert - before the `succeed_n'. The `5' is the last two - bytes of this `set_number_at', plus 3 bytes of - the following `succeed_n'. */ - insert_op2 (set_number_at, laststart, 5, lower_bound, b); - b += 5; - startoffset += 5; - } - - if (upper_bound < 0) - { - /* A negative upper bound stands for infinity, - in which case it degenerates to a plain jump. */ - STORE_JUMP (jump, b, laststart + startoffset); - b += 3; - } - else if (upper_bound > 1) - { /* More than one repetition is allowed, so - append a backward jump to the `succeed_n' - that starts this interval. - - When we've reached this during matching, - we'll have matched the interval once, so - jump back only `upper_bound - 1' times. */ - STORE_JUMP2 (jump_n, b, laststart + startoffset, - upper_bound - 1); - b += 5; - - /* The location we want to set is the second - parameter of the `jump_n'; that is `b-2' as - an absolute address. `laststart' will be - the `set_number_at' we're about to insert; - `laststart+3' the number to set, the source - for the relative address. But we are - inserting into the middle of the pattern -- - so everything is getting moved up by 5. - Conclusion: (b - 2) - (laststart + 3) + 5, - i.e., b - laststart. - - We insert this at the beginning of the loop - so that if we fail during matching, we'll - reinitialize the bounds. */ - insert_op2 (set_number_at, laststart, b - laststart, - upper_bound - 1, b); - b += 5; - } - } + if (upper_bound == 0) + /* If the upper bound is zero, just drop the sub pattern + altogether. */ + b = laststart; + else if (lower_bound == 1 && upper_bound == 1) + /* Just match it once: nothing to do here. */ + ; + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at + set_number_at + succeed_n + + jump_n + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else + { /* If the upper bound is > 1, we need to insert + more at the end of the loop. */ + unsigned int nbytes = (upper_bound < 0 ? 3 + : upper_bound > 1 ? 5 : 0); + unsigned int startoffset = 0; + + GET_BUFFER_SPACE (20); /* We might use less. */ + + if (lower_bound == 0) + { + /* A succeed_n that starts with 0 is really a + a simple on_failure_jump_loop. */ + INSERT_JUMP (on_failure_jump_loop, laststart, + b + 3 + nbytes); + b += 3; + } + else + { + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + nbytes, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + startoffset += 5; + } + + if (upper_bound < 0) + { + /* A negative upper bound stands for infinity, + in which case it degenerates to a plain jump. */ + STORE_JUMP (jump, b, laststart + startoffset); + b += 3; + } + else if (upper_bound > 1) + { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2 (jump_n, b, laststart + startoffset, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2 (set_number_at, laststart, b - laststart, + upper_bound - 1, b); + b += 5; + } + } pending_exact = 0; beg_interval = NULL; } @@ -3082,13 +3353,13 @@ regex_compile (pattern, size, syntax, bufp) case 'c': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (categoryspec, c); break; case 'C': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (notcategoryspec, c); break; #endif /* emacs */ @@ -3148,20 +3419,21 @@ regex_compile (pattern, size, syntax, bufp) case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - if (syntax & RE_NO_BK_REFS) - goto normal_char; + { + regnum_t reg; - c1 = c - '0'; + if (syntax & RE_NO_BK_REFS) + goto normal_backslash; - if (c1 > regnum) - FREE_STACK_RETURN (REG_ESUBREG); + reg = c - '0'; - /* Can't back reference to a subexpression if inside of it. */ - if (group_in_compile_stack (compile_stack, (regnum_t) c1)) - goto normal_char; + /* Can't back reference to a subexpression before its end. */ + if (reg > regnum || group_in_compile_stack (compile_stack, reg)) + FREE_STACK_RETURN (REG_ESUBREG); - laststart = b; - BUF_PUSH_2 (duplicate, c1); + laststart = b; + BUF_PUSH_2 (duplicate, reg); + } break; @@ -3177,7 +3449,6 @@ regex_compile (pattern, size, syntax, bufp) /* You might think it would be useful for \ to mean not to translate; but if we don't translate it it will never match anything. */ - c = TRANSLATE (c); goto normal_char; } break; @@ -3186,7 +3457,7 @@ regex_compile (pattern, size, syntax, bufp) default: /* Expects the character in `c'. */ normal_char: - /* If no exactn currently being built. */ + /* If no exactn currently being built. */ if (!pending_exact /* If last exactn not at current position. */ @@ -3217,6 +3488,7 @@ regex_compile (pattern, size, syntax, bufp) { int len; + c = TRANSLATE (c); if (multibyte) len = CHAR_STRING (c, b); else @@ -3360,10 +3632,10 @@ insert_op2 (op, loc, arg1, arg2, end) static boolean at_begline_loc_p (pattern, p, syntax) - const unsigned char *pattern, *p; + re_char *pattern, *p; reg_syntax_t syntax; { - const unsigned char *prev = p - 2; + re_char *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return @@ -3384,12 +3656,12 @@ at_begline_loc_p (pattern, p, syntax) static boolean at_endline_loc_p (p, pend, syntax) - const unsigned char *p, *pend; + re_char *p, *pend; reg_syntax_t syntax; { - const unsigned char *next = p; + re_char *next = p; boolean next_backslash = *next == '\\'; - const unsigned char *next_next = p + 1 < pend ? p + 1 : 0; + re_char *next_next = p + 1 < pend ? p + 1 : 0; return /* Before a subexpression? */ @@ -3428,36 +3700,16 @@ group_in_compile_stack (compile_stack, regnum) Return 1 if p..pend might match the empty string. Return 0 if p..pend matches at least one char. - Return -1 if p..pend matches at least one char, but fastmap was not - updated accurately. - Return -2 if an error occurred. */ + Return -1 if fastmap was not updated accurately. */ static int analyse_first (p, pend, fastmap, multibyte) - unsigned char *p, *pend; + re_char *p, *pend; char *fastmap; const int multibyte; { int j, k; boolean not; -#ifdef MATCH_MAY_ALLOCATE - fail_stack_type fail_stack; -#endif -#ifndef REGEX_MALLOC - char *destination; -#endif - -#if defined REL_ALLOC && defined REGEX_MALLOC - /* This holds the pointer to the failure stack, when - it is allocated relocatably. */ - fail_stack_elt_t *failure_stack_ptr; -#endif - - /* Assume that each path through the pattern can be null until - proven otherwise. We set this false at the bottom of switch - statement, to which we get only if a particular path doesn't - match the empty string. */ - boolean path_can_be_null = true; /* If all elements for base leading-codes in fastmap is set, this flag is set true. */ @@ -3465,8 +3717,6 @@ analyse_first (p, pend, fastmap, multibyte) assert (p); - INIT_FAIL_STACK (); - /* The loop below works as follows: - It has a working-list kept in the PATTERN_STACK and which basically starts by only containing a pointer to the first operation. @@ -3482,7 +3732,7 @@ analyse_first (p, pend, fastmap, multibyte) so that `p' is monotonically increasing. More to the point, we never set `p' (or push) anything `<= p1'. */ - while (1) + while (p < pend) { /* `p1' is used as a marker of how far back a `on_failure_jump' can go without being ignored. It is normally equal to `p' @@ -3492,29 +3742,12 @@ analyse_first (p, pend, fastmap, multibyte) 3..9: 10: on_failure_jump 3 as used for the *? operator. */ - unsigned char *p1 = p; - - if (p >= pend) - { - if (path_can_be_null) - return (RESET_FAIL_STACK (), 1); - - /* We have reached the (effective) end of pattern. */ - if (PATTERN_STACK_EMPTY ()) - return (RESET_FAIL_STACK (), 0); - - p = (unsigned char*) POP_PATTERN_OP (); - path_can_be_null = true; - continue; - } - - /* We should never be about to go beyond the end of the pattern. */ - assert (p < pend); + re_char *p1 = p; switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) { case succeed: - p = pend; + return 1; continue; case duplicate: @@ -3546,7 +3779,7 @@ analyse_first (p, pend, fastmap, multibyte) /* We could put all the chars except for \n (and maybe \0) but we don't bother since it is generally not worth it. */ if (!fastmap) break; - return (RESET_FAIL_STACK (), -1); + return -1; case charset_not: @@ -3621,7 +3854,7 @@ analyse_first (p, pend, fastmap, multibyte) #else /* emacs */ /* This match depends on text properties. These end with aborting optimizations. */ - return (RESET_FAIL_STACK (), -1); + return -1; case categoryspec: case notcategoryspec: @@ -3688,8 +3921,14 @@ analyse_first (p, pend, fastmap, multibyte) EXTRACT_NUMBER_AND_INCR (j, p); if (p + j <= p1) ; /* Backward jump to be ignored. */ - else if (!PUSH_PATTERN_OP (p + j, fail_stack)) - return (RESET_FAIL_STACK (), -2); + else + { /* We have to look down both arms. + We first go down the "straight" path so as to minimize + stack usage when going through alternatives. */ + int r = analyse_first (p, pend, fastmap, multibyte); + if (r) return r; + p += j; + } continue; @@ -3701,7 +3940,7 @@ analyse_first (p, pend, fastmap, multibyte) case has already been handled, so we only need to look at the fallthrough case. */ continue; - + case succeed_n: /* If N == 0, it should be an on_failure_jump_loop instead. */ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0)); @@ -3729,15 +3968,13 @@ analyse_first (p, pend, fastmap, multibyte) /* Getting here means we have found the possible starting characters for one path of the pattern -- and that the empty - string does not match. We need not follow this path further. - Instead, look at the next alternative (remembered on the - stack), or quit if no more. The test at the top of the loop - does these things. */ - path_can_be_null = false; - p = pend; + string does not match. We need not follow this path further. */ + return 0; } /* while p */ - return (RESET_FAIL_STACK (), 0); + /* We reached the end without matching anything. */ + return 1; + } /* analyse_first */ /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in @@ -3772,8 +4009,6 @@ re_compile_fastmap (bufp) analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, fastmap, RE_MULTIBYTE_P (bufp)); bufp->can_be_null = (analysis != 0); - if (analysis < -1) - return analysis; return 0; } /* re_compile_fastmap */ @@ -3830,6 +4065,10 @@ re_search (bufp, string, size, startpos, range, regs) } WEAK_ALIAS (__re_search, re_search) +/* Head address of virtual concatenation of string. */ +#define HEAD_ADDR_VSTRING(P) \ + (((P) >= size1 ? string2 : string1)) + /* End address of virtual concatenation of string. */ #define STOP_ADDR_VSTRING(P) \ (((P) >= size1 ? string2 + size2 : string1 + size1)) @@ -3916,8 +4155,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) /* Update the fastmap now if not correct already. */ if (fastmap && !bufp->fastmap_accurate) - if (re_compile_fastmap (bufp) == -2) - return -2; + re_compile_fastmap (bufp); /* See whether the pattern is anchored. */ anchored_start = (bufp->buffer[0] == begline); @@ -3953,7 +4191,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) if (fastmap && startpos < total_size && !bufp->can_be_null) { register re_char *d; - register unsigned int buf_ch; + register re_wchar_t buf_ch; d = POS_ADDR_VSTRING (startpos); @@ -4066,26 +4304,17 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) /* Update STARTPOS to the previous character boundary. */ if (multibyte) { - re_char *p = POS_ADDR_VSTRING (startpos); - int len = 0; + re_char *p = POS_ADDR_VSTRING (startpos) + 1; + re_char *p0 = p; + re_char *phead = HEAD_ADDR_VSTRING (startpos); /* Find the head of multibyte form. */ - while (!CHAR_HEAD_P (*p)) - p--, len++; - - /* Adjust it. */ -#if 0 /* XXX */ - if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1)) - ; - else -#endif - { - range += len; - if (range > 0) - break; + PREV_CHAR_BOUNDARY (p, phead); + range += p0 - 1 - p; + if (range > 0) + break; - startpos -= len; - } + startpos -= p0 - 1 - p; } } } @@ -4186,15 +4415,15 @@ static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2, /* If the operation is a match against one or more chars, return a pointer to the next operation, else return NULL. */ -static unsigned char * +static re_char * skip_one_char (p) - unsigned char *p; + re_char *p; { switch (SWITCH_ENUM_CAST (*p++)) { case anychar: break; - + case exactn: p += *p + 1; break; @@ -4211,7 +4440,7 @@ skip_one_char (p) else p += 1 + CHARSET_BITMAP_SIZE (p - 1); break; - + case syntaxspec: case notsyntaxspec: #ifdef emacs @@ -4294,13 +4523,13 @@ mutually_exclusive_p (bufp, p1, p2) return 1; } break; - + case endline: case exactn: { - register unsigned int c + register re_wchar_t c = (re_opcode_t) *p2 == endline ? '\n' - : RE_STRING_CHAR(p2 + 2, pend - p2 - 2); + : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); if ((re_opcode_t) *p1 == exactn) { @@ -4345,13 +4574,11 @@ mutually_exclusive_p (bufp, p1, p2) break; case charset: - case charset_not: { if ((re_opcode_t) *p1 == exactn) /* Reuse the code above. */ return mutually_exclusive_p (bufp, p2, p1); - /* It is hard to list up all the character in charset P2 if it includes multibyte character. Give up in such case. */ @@ -4367,7 +4594,7 @@ mutually_exclusive_p (bufp, p1, p2) P2 is ASCII, it is enough to test only bitmap table of P1. */ - if (*p1 == *p2) + if ((re_opcode_t) *p1 == charset) { int idx; /* We win if the charset inside the loop @@ -4386,8 +4613,7 @@ mutually_exclusive_p (bufp, p1, p2) return 1; } } - else if ((re_opcode_t) *p1 == charset - || (re_opcode_t) *p1 == charset_not) + else if ((re_opcode_t) *p1 == charset_not) { int idx; /* We win if the charset_not inside the loop lists @@ -4406,7 +4632,24 @@ mutually_exclusive_p (bufp, p1, p2) } } } - + break; + + case charset_not: + switch (SWITCH_ENUM_CAST (*p1)) + { + case exactn: + case charset: + /* Reuse the code above. */ + return mutually_exclusive_p (bufp, p2, p1); + case charset_not: + /* When we have two charset_not, it's very unlikely that + they don't overlap. The union of the two sets of excluded + chars should cover all possible chars, which, as a matter of + fact, is virtually impossible in multibyte buffers. */ + break; + } + break; + case wordend: case notsyntaxspec: return ((re_opcode_t) *p1 == syntaxspec @@ -4520,8 +4763,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* General temporaries. */ int mcnt; + size_t reg; boolean not; - unsigned char *p1; /* Just past the end of the corresponding string. */ re_char *end1, *end2; @@ -4540,8 +4783,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) re_char *dfail; /* Where we are in the pattern, and the end of the pattern. */ - unsigned char *p = bufp->buffer; - register unsigned char *pend = p + bufp->used; + re_char *p = bufp->buffer; + re_char *pend = p + bufp->used; /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -4650,8 +4893,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Initialize subexpression text positions to -1 to mark ones that no start_memory/stop_memory has been seen for. Also initialize the register information struct. */ - for (mcnt = 1; mcnt < num_regs; mcnt++) - regstart[mcnt] = regend[mcnt] = NULL; + for (reg = 1; reg < num_regs; reg++) + regstart[reg] = regend[reg] = NULL; /* We move `string1' into `string2' if the latter's empty -- but not if `string1' is null. */ @@ -4753,10 +4996,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); - for (mcnt = 1; mcnt < num_regs; mcnt++) + for (reg = 1; reg < num_regs; reg++) { - best_regstart[mcnt] = regstart[mcnt]; - best_regend[mcnt] = regend[mcnt]; + best_regstart[reg] = regstart[reg]; + best_regend[reg] = regend[reg]; } } goto fail; @@ -4779,10 +5022,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) dend = ((d >= string1 && d <= end1) ? end_match_1 : end_match_2); - for (mcnt = 1; mcnt < num_regs; mcnt++) + for (reg = 1; reg < num_regs; reg++) { - regstart[mcnt] = best_regstart[mcnt]; - regend[mcnt] = best_regend[mcnt]; + regstart[reg] = best_regstart[reg]; + regend[reg] = best_regend[reg]; } } } /* d != end_match_2 */ @@ -4842,16 +5085,16 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Go through the first `min (num_regs, regs->num_regs)' registers, since that is all we initialized. */ - for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++) { - if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) - regs->start[mcnt] = regs->end[mcnt] = -1; + if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg])) + regs->start[reg] = regs->end[reg] = -1; else { - regs->start[mcnt] - = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); - regs->end[mcnt] - = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); + regs->start[reg] + = (regoff_t) POINTER_TO_OFFSET (regstart[reg]); + regs->end[reg] + = (regoff_t) POINTER_TO_OFFSET (regend[reg]); } } @@ -4860,8 +5103,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) we (re)allocated the registers, this is the case, because we always allocate enough to have at least one -1 at the end. */ - for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) - regs->start[mcnt] = regs->end[mcnt] = -1; + for (reg = num_regs; reg < regs->num_regs; reg++) + regs->start[reg] = regs->end[reg] = -1; } /* regs && !bufp->no_sub */ DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", @@ -4959,7 +5202,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case anychar: { int buf_charlen; - unsigned int buf_ch; + re_wchar_t buf_ch; DEBUG_PRINT1 ("EXECUTING anychar.\n"); @@ -4988,7 +5231,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Start of actual range_table, or end of bitmap if there is no range table. */ - unsigned char *range_table; + re_char *range_table; /* Nonzero if there is a range table. */ int range_table_exists; @@ -5074,7 +5317,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) assert (!REG_UNSET (regstart[*p])); /* Strictly speaking, there should be code such as: - + assert (REG_UNSET (regend[*p])); PUSH_FAILURE_REGSTOP ((unsigned int)*p); @@ -5246,7 +5489,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) cycle detection cannot work. Worse yet, such a detection can not only fail to detect a cycle, but it can also wrongly detect a cycle (between different instantiations of the same - loop. + loop). So the method used for those nasty loops is a little different: We use a special cycle-detection-stack-frame which is pushed when the on_failure_jump_nastyloop failure-point is *popped*. @@ -5260,11 +5503,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) mcnt, p + mcnt); assert ((re_opcode_t)p[-4] == no_op); - CHECK_INFINITE_LOOP (p - 4, d); - PUSH_FAILURE_POINT (p - 3, d); + { + int cycle = 0; + CHECK_INFINITE_LOOP (p - 4, d); + if (!cycle) + /* If there's a cycle, just continue without pushing + this failure point. The failure point is the "try again" + option, which shouldn't be tried. + We want (x?)*?y\1z to match both xxyz and xxyxz. */ + PUSH_FAILURE_POINT (p - 3, d); + } break; - /* Simple loop detecting on_failure_jump: just check on the failure stack if the same spot was already hit earlier. */ case on_failure_jump_loop: @@ -5272,9 +5522,19 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n", mcnt, p + mcnt); - - CHECK_INFINITE_LOOP (p - 3, d); - PUSH_FAILURE_POINT (p - 3, d); + { + int cycle = 0; + CHECK_INFINITE_LOOP (p - 3, d); + if (cycle) + /* If there's a cycle, get out of the loop, as if the matching + had failed. We used to just `goto fail' here, but that was + aborting the search a bit too early: we want to keep the + empty-loop-match and keep matching after the loop. + We want (x?)*y\1z to match both xxyz and xxyxz. */ + p += mcnt; + else + PUSH_FAILURE_POINT (p - 3, d); + } break; @@ -5291,7 +5551,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) the repetition text and either the following jump or pop_failure_jump back to this on_failure_jump. */ case on_failure_jump: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n", mcnt, p + mcnt); @@ -5307,13 +5567,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) then we can use a non-backtracking loop based on on_failure_keep_string_jump instead of on_failure_jump. */ case on_failure_jump_smart: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n", mcnt, p + mcnt); { - unsigned char *p1 = p; /* Next operation. */ - unsigned char *p2 = p + mcnt; /* Destination of the jump. */ + re_char *p1 = p; /* Next operation. */ + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */ + unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */ p -= 3; /* Reset so that we will re-execute the instruction once it's been changed. */ @@ -5329,14 +5591,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* Use a fast `on_failure_keep_string_jump' loop. */ DEBUG_PRINT1 (" smart exclusive => fast loop.\n"); - *p = (unsigned char) on_failure_keep_string_jump; + *p3 = (unsigned char) on_failure_keep_string_jump; STORE_NUMBER (p2 - 2, mcnt + 3); } else { /* Default to a safe `on_failure_jump' loop. */ DEBUG_PRINT1 (" smart default => slow loop.\n"); - *p = (unsigned char) on_failure_jump; + *p3 = (unsigned char) on_failure_jump; } DEBUG_STATEMENT (debug -= 2); } @@ -5345,7 +5607,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ @@ -5356,17 +5618,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Have to succeed matching what follows at least n times. After that, handle like `on_failure_jump'. */ case succeed_n: + /* Signedness doesn't matter since we only compare MCNT to 0. */ EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); /* Originally, mcnt is how many times we HAVE to succeed. */ if (mcnt != 0) { + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; - p += 2; - PUSH_FAILURE_COUNT (p); - DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt); - STORE_NUMBER_AND_INCR (p, mcnt); + p += 4; + PUSH_NUMBER (p2, mcnt); } else /* The two bytes encoding mcnt == 0 are two no_op opcodes. */ @@ -5374,15 +5637,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) break; case jump_n: + /* Signedness doesn't matter since we only compare MCNT to 0. */ EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); /* Originally, this is how many times we CAN jump. */ if (mcnt != 0) { + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; - PUSH_FAILURE_COUNT (p + 2); - STORE_NUMBER (p + 2, mcnt); + PUSH_NUMBER (p2, mcnt); goto unconditional_jump; } /* If don't have to jump any more, skip over the rest of command. */ @@ -5392,14 +5657,16 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case set_number_at: { + unsigned char *p2; /* Location of the counter. */ DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); EXTRACT_NUMBER_AND_INCR (mcnt, p); - p1 = p + mcnt; + /* Here, we discard `const', making re_match non-reentrant. */ + p2 = (unsigned char*) p + mcnt; + /* Signedness doesn't matter since we only copy MCNT's bits . */ EXTRACT_NUMBER_AND_INCR (mcnt, p); - DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); - PUSH_FAILURE_COUNT (p1); - STORE_NUMBER (p1, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt); + PUSH_NUMBER (p2, mcnt); break; } @@ -5417,7 +5684,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d - 1); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5456,7 +5724,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5465,7 +5734,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PREFETCH (); c2 = RE_STRING_CHAR (d, dend - d); s2 = SYNTAX (c2); - + /* Case 2: S2 is not Sword. */ if (s2 != Sword) goto fail; @@ -5499,7 +5768,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d) - 1; int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5544,7 +5814,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } #endif { - int c, len; + int len; + re_wchar_t c; c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); @@ -5580,7 +5851,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); PREFETCH (); { - int c, len; + int len; + re_wchar_t c; + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) @@ -5599,11 +5872,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We goto here if a matching operation fails. */ fail: - QUIT; + IMMEDIATE_QUIT_CHECK; if (!FAIL_STACK_EMPTY ()) { - re_char *str; - unsigned char *pat; + re_char *str, *pat; /* A restart point is known. Restore to that state. */ DEBUG_PRINT1 ("\nFAIL:\n"); POP_FAILURE_POINT (str, pat); @@ -5673,7 +5945,7 @@ bcmp_translate (s1, s2, len, translate, multibyte) while (p1 < p1_end && p2 < p2_end) { int p1_charlen, p2_charlen; - int p1_ch, p2_ch; + re_wchar_t p1_ch, p2_ch; p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); @@ -5835,8 +6107,8 @@ re_exec (s) int regcomp (preg, pattern, cflags) - regex_t *preg; - const char *pattern; + regex_t *__restrict preg; + const char *__restrict pattern; int cflags; { reg_errcode_t ret; @@ -5920,10 +6192,10 @@ WEAK_ALIAS (__regcomp, regcomp) int regexec (preg, string, nmatch, pmatch, eflags) - const regex_t *preg; - const char *string; + const regex_t *__restrict preg; + const char *__restrict string; size_t nmatch; - regmatch_t pmatch[]; + regmatch_t pmatch[__restrict_arr]; int eflags; { int ret;