X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=3b4eb502596b4b6a0689620cff984a8da5e0c6d6;hb=871e5b2ce2391c85ae9288ea1e92895005817e21;hp=711f7c7afa9fe847b52e5f6f3442795db00c143d;hpb=7540e1406f34e8083740a2954fd2e97b9878e5e6;p=gnulib.git diff --git a/regex.c b/regex.c index 711f7c7af..3b4eb5025 100644 --- a/regex.c +++ b/regex.c @@ -20,12 +20,9 @@ USA. */ /* TODO: - - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac". - - use `keep_string' more often than just .*\n. - structure the opcode space into opcode+flag. - - merge with glic's regex.[ch] - - That's it for now -sm */ + - merge with glibc's regex.[ch] + */ /* AIX requires this to be the first thing in the file. */ #if defined (_AIX) && !defined (REGEX_MALLOC) @@ -39,8 +36,6 @@ /* Converts the pointer to the char to BEG-based offset from the start. */ #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) -#else -#define PTR_TO_OFFSET(d) 0 #endif #ifdef HAVE_CONFIG_H @@ -81,8 +76,28 @@ #define realloc xrealloc #define free xfree +#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) #define RE_STRING_CHAR(p, s) \ (multibyte ? (STRING_CHAR (p, s)) : (*(p))) +#define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ + (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) + +/* Set C a (possibly multibyte) character before P. P points into a + string which is the virtual concatenation of STR1 (which ends at + END1) or STR2 (which ends at END2). */ +#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ + do { \ + if (multibyte) \ + { \ + re_char *dtemp = (p) == (str2) ? (end1) : (p); \ + re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ + while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ + c = STRING_CHAR (dtemp, (p) - dtemp); \ + } \ + else \ + (c = ((p) == (str2) ? (end1) : (p))[-1]); \ + } while (0) + #else /* not emacs */ @@ -130,11 +145,8 @@ char *realloc (); /* Define the syntax stuff for \<, \>, etc. */ -/* This must be nonzero for the wordchar and notwordchar pattern - commands in re_match_2. */ -#ifndef Sword -#define Sword 1 -#endif +/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */ +enum syntaxcode { Swhitespace = 0, Sword = 1 }; #ifdef SWITCH_ENUM_BUG #define SWITCH_ENUM_CAST(x) ((int)(x)) @@ -184,6 +196,10 @@ init_syntax_once () /* Dummy macros for non-Emacs environments. */ #define BASE_LEADING_CODE_P(c) (0) +#define CHAR_CHARSET(c) 0 +#define CHARSET_LEADING_CODE_BASE(c) 0 +#define MAX_MULTIBYTE_LENGTH 1 +#define RE_MULTIBYTE_P(x) 0 #define WORD_BOUNDARY_P(c1, c2) (0) #define CHAR_HEAD_P(p) (1) #define SINGLE_BYTE_CHAR_P(c) (1) @@ -191,7 +207,9 @@ init_syntax_once () #define MULTIBYTE_FORM_LENGTH(p, s) (1) #define STRING_CHAR(p, s) (*(p)) #define RE_STRING_CHAR STRING_CHAR +#define CHAR_STRING(c, s) (*(s) = (c), 1) #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) +#define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) #endif /* not emacs */ @@ -411,7 +429,7 @@ char *alloca (); #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ REGEX_REALLOCATE (source, osize, nsize) /* No need to explicitly free anything. */ -#define REGEX_FREE_STACK(arg) +#define REGEX_FREE_STACK(arg) ((void)0) #endif /* not REGEX_MALLOC */ #endif /* not using relocating allocator */ @@ -531,6 +549,12 @@ typedef enum indefinitely). */ on_failure_jump_loop, + /* Just like `on_failure_jump_loop', except that it checks for + a different kind of loop (the kind that shows up with non-greedy + operators). This operation has to be immediately preceded + by a `no_op'. */ + on_failure_jump_nastyloop, + /* A smart `on_failure_jump' used for greedy * and + operators. It analyses the loop before which it is put and if the loop does not require backtracking, it changes itself to @@ -540,7 +564,9 @@ typedef enum on_failure_jump_smart, /* Followed by two-byte relative address and two-byte number n. - After matching N times, jump to the address upon failure. */ + After matching N times, jump to the address upon failure. + Does not work if N starts at 0: use on_failure_jump_loop + instead. */ succeed_n, /* Followed by two-byte relative address, and two-byte number n. @@ -552,26 +578,23 @@ typedef enum bytes of number. */ set_number_at, - wordchar, /* Matches any word-constituent character. */ - notwordchar, /* Matches any char that is not a word-constituent. */ - wordbeg, /* Succeeds if at word beginning. */ wordend, /* Succeeds if at word end. */ wordbound, /* Succeeds if at a word boundary. */ - notwordbound /* Succeeds if not at a word boundary. */ - -#ifdef emacs - ,before_dot, /* Succeeds if before point. */ - at_dot, /* Succeeds if at point. */ - after_dot, /* Succeeds if after point. */ + notwordbound, /* Succeeds if not at a word boundary. */ /* Matches any character whose syntax is specified. Followed by a byte which contains a syntax code, e.g., Sword. */ syntaxspec, /* Matches any character whose syntax is not that specified. */ - notsyntaxspec, + notsyntaxspec + +#ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ /* Matches any character whose category-set contains the specified category. The operator is followed by a byte which contains a @@ -944,6 +967,11 @@ print_partial_compiled_pattern (start, end) printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); break; + case on_failure_jump_nastyloop: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start); + break; + case on_failure_jump_loop: extract_number_and_incr (&mcnt, &p); printf ("/on_failure_jump_loop to %d", p + mcnt - start); @@ -992,6 +1020,18 @@ print_partial_compiled_pattern (start, end) case wordend: printf ("/wordend"); + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + #ifdef emacs case before_dot: printf ("/before_dot"); @@ -1005,27 +1045,19 @@ print_partial_compiled_pattern (start, end) printf ("/after_dot"); break; - case syntaxspec: - printf ("/syntaxspec"); + case categoryspec: + printf ("/categoryspec"); mcnt = *p++; printf ("/%d", mcnt); break; - case notsyntaxspec: - printf ("/notsyntaxspec"); + case notcategoryspec: + printf ("/notcategoryspec"); mcnt = *p++; printf ("/%d", mcnt); break; #endif /* emacs */ - case wordchar: - printf ("/wordchar"); - break; - - case notwordchar: - printf ("/notwordchar"); - break; - case begbuf: printf ("/begbuf"); break; @@ -1277,7 +1309,7 @@ typedef struct fail_stack.frame = 0; \ } while (0) -#define RESET_FAIL_STACK() +#define RESET_FAIL_STACK() ((void)0) #endif @@ -1529,29 +1561,29 @@ static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, const unsigned char *pend, reg_syntax_t syntax)); static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); +static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend, + char *fastmap, const int multibyte)); /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use as an array index (in, e.g., `translate'). */ -#ifndef PATFETCH #define PATFETCH(c) \ do { \ PATFETCH_RAW (c); \ - if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ + c = TRANSLATE (c); \ } while (0) -#endif /* Fetch the next character in the uncompiled pattern, with no translation. */ #define PATFETCH_RAW(c) \ - do {if (p == pend) return REG_EEND; \ - c = *p++; \ + do { \ + int len; \ + if (p == pend) return REG_EEND; \ + c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ + p += len; \ } while (0) -/* Go backwards one character in the pattern. */ -#define PATUNFETCH p-- - /* If `translate' is non-null, return translate[D], else just D. We cast the subscript to translate because some data is declared as @@ -1946,6 +1978,9 @@ regex_compile (pattern, size, syntax, bufp) /* Work area for range table of charset. */ struct range_table_work_area range_table_work; + /* If the object matched can contain multibyte characters. */ + const boolean multibyte = RE_MULTIBYTE_P (bufp); + #ifdef DEBUG debug++; DEBUG_PRINT1 ("\nCompiling pattern: "); @@ -1983,14 +2018,6 @@ regex_compile (pattern, size, syntax, bufp) /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0; -#ifdef emacs - /* bufp->multibyte is set before regex_compile is called, so don't alter - it. */ -#else /* not emacs */ - /* Nothing is recognized as a multibyte character. */ - bufp->multibyte = 0; -#endif - #if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once (); @@ -2091,35 +2118,24 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) break; - - PATFETCH (c); - - if (c == '*' - || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + else if (*p == '*' + || (!(syntax & RE_BK_PLUS_QM) + && (*p == '+' || *p == '?'))) ; - - else if (syntax & RE_BK_PLUS_QM && c == '\\') + else if (syntax & RE_BK_PLUS_QM && *p == '\\') { - if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); - - PATFETCH (c1); - if (!(c1 == '+' || c1 == '?')) - { - PATUNFETCH; - PATUNFETCH; - break; - } - - c = c1; + if (p+1 == pend) + FREE_STACK_RETURN (REG_EESCAPE); + if (p[1] == '+' || p[1] == '?') + PATFETCH (c); /* Gobble up the backslash. */ + else + break; } else - { - PATUNFETCH; - break; - } - + break; /* If we get here, we found another repeat character. */ - } + PATFETCH (c); + } /* Star, etc. applied to an empty pattern is equivalent to an empty pattern. */ @@ -2134,6 +2150,9 @@ regex_compile (pattern, size, syntax, bufp) { boolean simple = skip_one_char (laststart) == b; unsigned int startoffset = 0; + re_opcode_t ofj = + (simple || !analyse_first (laststart, b, NULL, 0)) ? + on_failure_jump : on_failure_jump_loop; assert (skip_one_char (laststart) <= b); if (!zero_times_ok && simple) @@ -2152,14 +2171,13 @@ regex_compile (pattern, size, syntax, bufp) GET_BUFFER_SPACE (6); if (!zero_times_ok) /* A + loop. */ - STORE_JUMP (on_failure_jump_loop, b, b + 6); + STORE_JUMP (ofj, b, b + 6); else /* Simple * loops can use on_failure_keep_string_jump depending on what follows. But since we don't know that yet, we leave the decision up to on_failure_jump_smart. */ - INSERT_JUMP (simple ? on_failure_jump_smart - : on_failure_jump_loop, + INSERT_JUMP (simple ? on_failure_jump_smart : ofj, laststart + startoffset, b + 6); b += 3; STORE_JUMP (jump, b, laststart + startoffset); @@ -2177,19 +2195,22 @@ regex_compile (pattern, size, syntax, bufp) else /* not greedy */ { /* I wish the greedy and non-greedy cases could be merged. */ + GET_BUFFER_SPACE (7); /* We might use less. */ if (many_times_ok) { + boolean emptyp = analyse_first (laststart, b, NULL, 0); + /* The non-greedy multiple match looks like a repeat..until: we only need a conditional jump at the end of the loop */ - GET_BUFFER_SPACE (3); - STORE_JUMP (on_failure_jump, b, laststart); + if (emptyp) BUF_PUSH (no_op); + STORE_JUMP (emptyp ? on_failure_jump_nastyloop + : on_failure_jump, b, laststart); b += 3; if (zero_times_ok) { /* The repeat...until naturally matches one or more. To also match zero times, we need to first jump to the end of the loop (its conditional jump). */ - GET_BUFFER_SPACE (3); INSERT_JUMP (jump, laststart, b); b += 3; } @@ -2197,7 +2218,6 @@ regex_compile (pattern, size, syntax, bufp) else { /* non-greedy a?? */ - GET_BUFFER_SPACE (6); INSERT_JUMP (jump, laststart, b + 3); b += 3; INSERT_JUMP (on_failure_jump, laststart, laststart + 6); @@ -2250,8 +2270,8 @@ regex_compile (pattern, size, syntax, bufp) /* Read in characters and ranges, setting map bits. */ for (;;) { - int len; boolean escaped_char = false; + const unsigned char *p2 = p; if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -2270,19 +2290,10 @@ regex_compile (pattern, size, syntax, bufp) /* Could be the end of the bracket expression. If it's not (i.e., when the bracket expression is `[]' so far), the ']' character bit gets set way below. */ - if (c == ']' && p != p1 + 1) + if (c == ']' && p2 != p1) break; } - /* If C indicates start of multibyte char, get the - actual character code in C, and set the pattern - pointer P to the next character boundary. */ - if (bufp->multibyte && BASE_LEADING_CODE_P (c)) - { - PATUNFETCH; - c = STRING_CHAR_AND_LENGTH (p, pend - p, len); - p += len; - } /* What should we do for the character which is greater than 0x7F, but not BASE_LEADING_CODE_P? XXX */ @@ -2290,14 +2301,16 @@ regex_compile (pattern, size, syntax, bufp) /* See if we're at the beginning of a possible character class. */ - else if (!escaped_char && - syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + if (!escaped_char && + syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ char str[CHAR_CLASS_MAX_LENGTH + 1]; + const unsigned char *class_beg; PATFETCH (c); c1 = 0; + class_beg = p; /* If pattern is `[[:'. */ if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -2352,7 +2365,7 @@ regex_compile (pattern, size, syntax, bufp) they can only match ASCII characters. We don't need to handle them for multibyte. */ - if (bufp->multibyte) + if (multibyte) { int bit = 0; @@ -2410,9 +2423,8 @@ regex_compile (pattern, size, syntax, bufp) } else { - c1++; - while (c1--) - PATUNFETCH; + /* Go back to right after the "[:". */ + p = class_beg; SET_LIST_BIT ('['); /* Because the `:' may starts the range, we @@ -2430,12 +2442,6 @@ regex_compile (pattern, size, syntax, bufp) /* Fetch the character which ends the range. */ PATFETCH (c1); - if (bufp->multibyte && BASE_LEADING_CODE_P (c1)) - { - PATUNFETCH; - c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len); - p += len; - } if (SINGLE_BYTE_CHAR_P (c) && ! SINGLE_BYTE_CHAR_P (c1)) @@ -2573,9 +2579,9 @@ regex_compile (pattern, size, syntax, bufp) if (p+1 < pend) { /* Look for a special (?...) construct */ - PATFETCH (c); - if ((syntax & RE_SHY_GROUPS) && c == '?') + if ((syntax & RE_SHY_GROUPS) && *p == '?') { + PATFETCH (c); /* Gobble up the '?'. */ PATFETCH (c); switch (c) { @@ -2585,7 +2591,6 @@ regex_compile (pattern, size, syntax, bufp) FREE_STACK_RETURN (REG_BADPAT); } } - else PATUNFETCH; } if (!shy) @@ -2733,8 +2738,9 @@ regex_compile (pattern, size, syntax, bufp) if (!(syntax & RE_INTERVALS) /* If we're at `\{' and it's not the open-interval operator. */ - || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - || (p - 2 == pattern && p == pend)) + || (syntax & RE_NO_BK_BRACES) + /* What is that? -sm */ + /* || (p - 2 == pattern && p == pend) */) goto normal_backslash; handle_interval: @@ -2744,7 +2750,7 @@ regex_compile (pattern, size, syntax, bufp) /* At least (most) this many matches must be made. */ int lower_bound = 0, upper_bound = -1; - beg_interval = p - 1; + beg_interval = p; if (p == pend) { @@ -2757,16 +2763,13 @@ regex_compile (pattern, size, syntax, bufp) GET_UNSIGNED_NUMBER (lower_bound); if (c == ',') - { - GET_UNSIGNED_NUMBER (upper_bound); - if (upper_bound < 0) upper_bound = RE_DUP_MAX; - } + GET_UNSIGNED_NUMBER (upper_bound); else /* Interval such as `{1}' => match exactly once. */ upper_bound = lower_bound; if (lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound) + || (upper_bound >= 0 && lower_bound > upper_bound)) { if (syntax & RE_NO_BK_BRACES) goto unfetch_interval; @@ -2802,15 +2805,13 @@ regex_compile (pattern, size, syntax, bufp) goto unfetch_interval; } - /* If the upper bound is zero, don't want to succeed at - all; jump from `laststart' to `b + 3', which will be - the end of the buffer after we insert the jump. */ if (upper_bound == 0) - { - GET_BUFFER_SPACE (3); - INSERT_JUMP (jump, laststart, b + 3); - b += 3; - } + /* If the upper bound is zero, just drop the sub pattern + altogether. */ + b = laststart; + else if (lower_bound == 1 && upper_bound == 1) + /* Just match it once: nothing to do here. */ + ; /* Otherwise, we have a nontrivial interval. When we're all done, the pattern will look like: @@ -2824,28 +2825,49 @@ regex_compile (pattern, size, syntax, bufp) else { /* If the upper bound is > 1, we need to insert more at the end of the loop. */ - unsigned nbytes = 10 + (upper_bound > 1) * 10; - - GET_BUFFER_SPACE (nbytes); - - /* Initialize lower bound of the `succeed_n', even - though it will be set during matching by its - attendant `set_number_at' (inserted next), - because `re_compile_fastmap' needs to know. - Jump to the `jump_n' we might insert below. */ - INSERT_JUMP2 (succeed_n, laststart, - b + 5 + (upper_bound > 1) * 5, - lower_bound); - b += 5; - - /* Code to initialize the lower bound. Insert - before the `succeed_n'. The `5' is the last two - bytes of this `set_number_at', plus 3 bytes of - the following `succeed_n'. */ - insert_op2 (set_number_at, laststart, 5, lower_bound, b); - b += 5; - - if (upper_bound > 1) + unsigned int nbytes = (upper_bound < 0 ? 3 + : upper_bound > 1 ? 5 : 0); + unsigned int startoffset = 0; + + GET_BUFFER_SPACE (20); /* We might use less. */ + + if (lower_bound == 0) + { + /* A succeed_n that starts with 0 is really a + a simple on_failure_jump_loop. */ + INSERT_JUMP (on_failure_jump_loop, laststart, + b + 3 + nbytes); + b += 3; + } + else + { + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + nbytes, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + startoffset += 5; + } + + if (upper_bound < 0) + { + /* A negative upper bound stands for infinity, + in which case it degenerates to a plain jump. */ + STORE_JUMP (jump, b, laststart + startoffset); + b += 3; + } + else if (upper_bound > 1) { /* More than one repetition is allowed, so append a backward jump to the `succeed_n' that starts this interval. @@ -2853,7 +2875,7 @@ regex_compile (pattern, size, syntax, bufp) When we've reached this during matching, we'll have matched the interval once, so jump back only `upper_bound - 1' times. */ - STORE_JUMP2 (jump_n, b, laststart + 5, + STORE_JUMP2 (jump_n, b, laststart + startoffset, upper_bound - 1); b += 5; @@ -2888,14 +2910,15 @@ regex_compile (pattern, size, syntax, bufp) beg_interval = NULL; /* normal_char and normal_backslash need `c'. */ - PATFETCH (c); + c = '{'; if (!(syntax & RE_NO_BK_BRACES)) { - if (p > pattern && p[-1] == '\\') - goto normal_backslash; + assert (p > pattern && p[-1] == '\\'); + goto normal_backslash; } - goto normal_char; + else + goto normal_char; #ifdef emacs /* There is no way to specify the before_dot and after_dot @@ -2932,13 +2955,13 @@ regex_compile (pattern, size, syntax, bufp) case 'w': laststart = b; - BUF_PUSH (wordchar); + BUF_PUSH_2 (syntaxspec, Sword); break; case 'W': laststart = b; - BUF_PUSH (notwordchar); + BUF_PUSH_2 (notsyntaxspec, Sword); break; @@ -3006,16 +3029,6 @@ regex_compile (pattern, size, syntax, bufp) default: /* Expects the character in `c'. */ normal_char: - p1 = p - 1; /* P1 points the head of C. */ -#ifdef emacs - if (bufp->multibyte) - { - c = STRING_CHAR (p1, pend - p1); - c = TRANSLATE (c); - /* Set P to the next character boundary. */ - p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; - } -#endif /* If no exactn currently being built. */ if (!pending_exact @@ -3023,7 +3036,7 @@ regex_compile (pattern, size, syntax, bufp) || pending_exact + *pending_exact + 1 != b /* We have only one byte following the exactn for the count. */ - || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) + || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH /* If followed by a repetition operator. */ || (p != pend && (*p == '*' || *p == '^')) @@ -3043,24 +3056,13 @@ regex_compile (pattern, size, syntax, bufp) pending_exact = b - 1; } -#ifdef emacs - if (! SINGLE_BYTE_CHAR_P (c)) - { - unsigned char str[MAX_MULTIBYTE_LENGTH]; - int i = CHAR_STRING (c, str); - int j; - for (j = 0; j < i; j++) - { - BUF_PUSH (str[j]); - (*pending_exact)++; - } - } - else -#endif - { - BUF_PUSH (c); - (*pending_exact)++; - } + GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH); + { + int len = CHAR_STRING (c, b); + b += len; + (*pending_exact) += len; + } + break; } /* switch (c) */ } /* while p != pend */ @@ -3251,28 +3253,26 @@ group_in_compile_stack (compile_stack, regnum) return false; } -/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in - BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible - characters can start a string that matches the pattern. This fastmap - is used by re_search to skip quickly over impossible starting points. +/* analyse_first. + If fastmap is non-NULL, go through the pattern and fill fastmap + with all the possible leading chars. If fastmap is NULL, don't + bother filling it up (obviously) and only return whether the + pattern could potentially match the empty string. + + Return 1 if p..pend might match the empty string. + Return 0 if p..pend matches at least one char. + Return -1 if p..pend matches at least one char, but fastmap was not + updated accurately. + Return -2 if an error occurred. */ - Character codes above (1 << BYTEWIDTH) are not represented in the - fastmap, but the leading codes are represented. Thus, the fastmap - indicates which character sets could start a match. - - The caller must supply the address of a (1 << BYTEWIDTH)-byte data - area as BUFP->fastmap. - - We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in - the pattern buffer. - - Returns 0 if we succeed, -2 if an internal error. */ - -int -re_compile_fastmap (bufp) - struct re_pattern_buffer *bufp; +static int +analyse_first (p, pend, fastmap, multibyte) + unsigned char *p, *pend; + char *fastmap; + const int multibyte; { int j, k; + boolean not; #ifdef MATCH_MAY_ALLOCATE fail_stack_type fail_stack; #endif @@ -3280,12 +3280,6 @@ re_compile_fastmap (bufp) char *destination; #endif - register char *fastmap = bufp->fastmap; - unsigned char *pattern = bufp->buffer; - unsigned long size = bufp->used; - unsigned char *p = pattern; - register unsigned char *pend = pattern + size; - #if defined (REL_ALLOC) && defined (REGEX_MALLOC) /* This holds the pointer to the failure stack, when it is allocated relocatably. */ @@ -3298,22 +3292,13 @@ re_compile_fastmap (bufp) match the empty string. */ boolean path_can_be_null = true; - /* We aren't doing a `succeed_n' to begin with. */ - boolean succeed_n_p = false; - /* If all elements for base leading-codes in fastmap is set, this flag is set true. */ boolean match_any_multibyte_characters = false; - /* Maximum code of simple (single byte) character. */ - int simple_char_max; - - assert (fastmap != NULL && p != NULL); + assert (p); INIT_FAIL_STACK (); - bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ - bufp->fastmap_accurate = 1; /* It will be when we're done. */ - bufp->can_be_null = 0; /* The loop below works as follows: - It has a working-list kept in the PATTERN_STACK and which basically @@ -3331,7 +3316,7 @@ re_compile_fastmap (bufp) never set `p' (or push) anything `<= p1'. */ /* If can_be_null is set, then the fastmap will not be used anyway. */ - while (!bufp->can_be_null) + while (1) { /* `p1' is used as a marker of how far back a `on_failure_jump' can go without being ignored. It is normally equal to `p' @@ -3343,22 +3328,18 @@ re_compile_fastmap (bufp) as used for the *? operator. */ unsigned char *p1 = p; - if (p == pend || *p == succeed) + if (p >= pend) { - /* We have reached the (effective) end of pattern. */ - if (!PATTERN_STACK_EMPTY ()) - { - bufp->can_be_null |= path_can_be_null; - - /* Reset for next path. */ - path_can_be_null = true; + if (path_can_be_null) + return (RESET_FAIL_STACK (), 1); - p = (unsigned char*) POP_PATTERN_OP (); + /* We have reached the (effective) end of pattern. */ + if (PATTERN_STACK_EMPTY ()) + return (RESET_FAIL_STACK (), 0); - continue; - } - else - break; + p = (unsigned char*) POP_PATTERN_OP (); + path_can_be_null = true; + continue; } /* We should never be about to go beyond the end of the pattern. */ @@ -3366,6 +3347,9 @@ re_compile_fastmap (bufp) switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) { + case succeed: + p = pend; + continue; case duplicate: /* If the first character has to match a backreference, that means @@ -3380,83 +3364,15 @@ re_compile_fastmap (bufp) with `break'. */ case exactn: - fastmap[p[1]] = 1; - break; - - -#ifndef emacs - case charset: - { - int length = (*p & 0x7f);; - p++; - - for (j = length * BYTEWIDTH - 1; j >= 0; j--) - if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) - fastmap[j] = 1; - } - break; - - case charset_not: - /* Chars beyond end of map must be allowed. */ - { - int length = (*p & 0x7f);; - p++; - - for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) - fastmap[j] = 1; - - for (j = length * BYTEWIDTH - 1; j >= 0; j--) - if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) - fastmap[j] = 1; - } - break; - - case wordchar: - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX (j) == Sword) - fastmap[j] = 1; + if (fastmap) fastmap[p[1]] = 1; break; - case notwordchar: - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX (j) != Sword) - fastmap[j] = 1; - break; -#else /* emacs */ - case charset: - for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; - j >= 0; j--) - if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) - fastmap[j] = 1; - - /* If we can match a character class, we can match - any character set. */ - if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) - && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) - goto set_fastmap_for_multibyte_characters; - - if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) - && match_any_multibyte_characters == false) - { - /* Set fastmap[I] 1 where I is a base leading code of each - multibyte character in the range table. */ - int c, count; - - /* Make P points the range table. */ - p += CHARSET_BITMAP_SIZE (&p[-2]); - - /* Extract the number of ranges in range table into COUNT. */ - EXTRACT_NUMBER_AND_INCR (count, p); - for (; count > 0; count--, p += 2 * 3) /* XXX */ - { - /* Extract the start of each range. */ - EXTRACT_CHARACTER (c, p); - j = CHAR_CHARSET (c); - fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1; - } - } - break; + case anychar: + /* We could put all the chars except for \n (and maybe \0) + but we don't bother since it is generally not worth it. */ + if (!fastmap) break; + return (RESET_FAIL_STACK (), -1); case charset_not: @@ -3464,19 +3380,26 @@ re_compile_fastmap (bufp) All the single-byte codes can occur in multibyte buffers. So any that are not listed in the charset are possible matches, even in multibyte buffers. */ - simple_char_max = (1 << BYTEWIDTH); + if (!fastmap) break; for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; - j < simple_char_max; j++) + j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; - + /* Fallthrough */ + case charset: + if (!fastmap) break; + not = (re_opcode_t) *(p - 1) == charset_not; for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; j >= 0; j--) - if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) fastmap[j] = 1; - if (bufp->multibyte) - /* Any character set can possibly contain a character - which doesn't match the specified set of characters. */ + if ((not && multibyte) + /* Any character set can possibly contain a character + which doesn't match the specified set of characters. */ + || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) + && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) + /* If we can match a character class, we can match + any character set. */ { set_fastmap_for_multibyte_characters: if (match_any_multibyte_characters == false) @@ -3487,153 +3410,79 @@ re_compile_fastmap (bufp) match_any_multibyte_characters = true; } } - break; - - - case wordchar: - /* All the single-byte codes can occur in multibyte buffers, - and they may have word syntax. So do consider them. */ - simple_char_max = (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (SYNTAX (j) == Sword) - fastmap[j] = 1; - - if (bufp->multibyte) - /* Any character set can possibly contain a character - whose syntax is `Sword'. */ - goto set_fastmap_for_multibyte_characters; - break; + else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) + && match_any_multibyte_characters == false) + { + /* Set fastmap[I] 1 where I is a base leading code of each + multibyte character in the range table. */ + int c, count; - case notwordchar: - /* All the single-byte codes can occur in multibyte buffers, - and they may not have word syntax. So do consider them. */ - simple_char_max = (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (SYNTAX (j) != Sword) - fastmap[j] = 1; + /* Make P points the range table. `+ 2' is to skip flag + bits for a character class. */ + p += CHARSET_BITMAP_SIZE (&p[-2]) + 2; - if (bufp->multibyte) - /* Any character set can possibly contain a character - whose syntax is not `Sword'. */ - goto set_fastmap_for_multibyte_characters; + /* Extract the number of ranges in range table into COUNT. */ + EXTRACT_NUMBER_AND_INCR (count, p); + for (; count > 0; count--, p += 2 * 3) /* XXX */ + { + /* Extract the start of each range. */ + EXTRACT_CHARACTER (c, p); + j = CHAR_CHARSET (c); + fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1; + } + } break; -#endif - - case anychar: - { - int fastmap_newline = fastmap['\n']; - - /* `.' matches anything, except perhaps newline. - Even in a multibyte buffer, it should match any - conceivable byte value for the fastmap. */ - if (bufp->multibyte) - match_any_multibyte_characters = true; - - simple_char_max = (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - fastmap[j] = 1; - - /* ... except perhaps newline. */ - if (!(bufp->syntax & RE_DOT_NEWLINE)) - fastmap['\n'] = fastmap_newline; - - /* Otherwise, have to check alternative paths. */ - break; - } -#ifdef emacs - case wordbound: - case notwordbound: - case wordbeg: - case wordend: - case notsyntaxspec: case syntaxspec: - /* This match depends on text properties. These end with - aborting optimizations. */ - bufp->can_be_null = 1; - continue; -#if 0 - k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (SYNTAX (j) == (enum syntaxcode) k) - fastmap[j] = 1; - - if (bufp->multibyte) - /* Any character set can possibly contain a character - whose syntax is K. */ - goto set_fastmap_for_multibyte_characters; - break; - case notsyntaxspec: + if (!fastmap) break; +#ifndef emacs + not = (re_opcode_t)p[-1] == notsyntaxspec; k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (SYNTAX (j) != (enum syntaxcode) k) + for (j = 0; j < (1 << BYTEWIDTH); j++) + if ((SYNTAX (j) == (enum syntaxcode) k) ^ not) fastmap[j] = 1; - - if (bufp->multibyte) - /* Any character set can possibly contain a character - whose syntax is not K. */ - goto set_fastmap_for_multibyte_characters; break; -#endif - +#else /* emacs */ + /* This match depends on text properties. These end with + aborting optimizations. */ + return (RESET_FAIL_STACK (), -1); case categoryspec: - k = *p++; - simple_char_max = (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (CHAR_HAS_CATEGORY (j, k)) - fastmap[j] = 1; - - if (bufp->multibyte) - /* Any character set can possibly contain a character - whose category is K. */ - goto set_fastmap_for_multibyte_characters; - break; - - case notcategoryspec: + if (!fastmap) break; + not = (re_opcode_t)p[-1] == notcategoryspec; k = *p++; - simple_char_max = (1 << BYTEWIDTH); - for (j = 0; j < simple_char_max; j++) - if (!CHAR_HAS_CATEGORY (j, k)) + for (j = 0; j < (1 << BYTEWIDTH); j++) + if ((CHAR_HAS_CATEGORY (j, k)) ^ not) fastmap[j] = 1; - if (bufp->multibyte) + if (multibyte) /* Any character set can possibly contain a character - whose category is not K. */ + whose category is K (or not). */ goto set_fastmap_for_multibyte_characters; break; /* All cases after this match the empty string. These end with `continue'. */ - case before_dot: case at_dot: case after_dot: - continue; -#endif /* emacs */ - - +#endif /* !emacs */ case no_op: case begline: case endline: case begbuf: case endbuf: -#ifndef emacs case wordbound: case notwordbound: case wordbeg: case wordend: -#endif continue; - case jump_n: case jump: EXTRACT_NUMBER_AND_INCR (j, p); if (j < 0) @@ -3646,6 +3495,7 @@ re_compile_fastmap (bufp) case on_failure_jump: case on_failure_keep_string_jump: case on_failure_jump_loop: + case on_failure_jump_nastyloop: case on_failure_jump_smart: p++; break; @@ -3658,53 +3508,33 @@ re_compile_fastmap (bufp) case on_failure_jump: case on_failure_keep_string_jump: + case on_failure_jump_nastyloop: case on_failure_jump_loop: case on_failure_jump_smart: - handle_on_failure_jump: EXTRACT_NUMBER_AND_INCR (j, p); - - /* For some patterns, e.g., `(a?)?', `p+j' here points to the - end of the pattern. We don't want to push such a point, - since when we restore it above, entering the switch will - increment `p' past the end of the pattern. We don't need - to push such a point since we obviously won't find any more - fastmap entries beyond `pend'. Such a pattern can match - the null string, though. */ if (p + j <= p1) - /* Backward jump to be ignored. */ - ; - else if (p + j < pend) - { - if (!PUSH_PATTERN_OP (p + j, fail_stack)) - { - RESET_FAIL_STACK (); - return -2; - } - } - else - bufp->can_be_null = 1; - - if (succeed_n_p) - { - EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ - succeed_n_p = false; - } - + ; /* Backward jump to be ignored. */ + else if (!PUSH_PATTERN_OP (p + j, fail_stack)) + return (RESET_FAIL_STACK (), -2); continue; + case jump_n: + /* This code simply does not properly handle forward jump_n. */ + DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0)); + p += 4; + /* jump_n can either jump or fall through. The (backward) jump + case has already been handled, so we only need to look at the + fallthrough case. */ + continue; + case succeed_n: - /* Get to the number of times to succeed. */ - p += 2; - - /* Increment p past the n for when k != 0. */ - EXTRACT_NUMBER_AND_INCR (k, p); - if (k == 0) - { - p -= 4; - succeed_n_p = true; /* Spaghetti code alert. */ - goto handle_on_failure_jump; - } + /* If N == 0, it should be an on_failure_jump_loop instead. */ + DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0)); + p += 4; + /* We only care about one iteration of the loop, so we don't + need to consider the case where this behaves like an + on_failure_jump. */ continue; @@ -3733,10 +3563,43 @@ re_compile_fastmap (bufp) p = pend; } /* while p */ - /* Set `can_be_null' for the last path (also the first path, if the - pattern is empty). */ - bufp->can_be_null |= path_can_be_null; - RESET_FAIL_STACK (); + return (RESET_FAIL_STACK (), 0); +} /* analyse_first */ + +/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in + BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible + characters can start a string that matches the pattern. This fastmap + is used by re_search to skip quickly over impossible starting points. + + Character codes above (1 << BYTEWIDTH) are not represented in the + fastmap, but the leading codes are represented. Thus, the fastmap + indicates which character sets could start a match. + + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. + + Returns 0 if we succeed, -2 if an internal error. */ + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ + char *fastmap = bufp->fastmap; + int analysis; + + assert (fastmap && bufp->buffer); + + bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + + analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, + fastmap, RE_MULTIBYTE_P (bufp)); + if (analysis < -1) + return analysis; + bufp->can_be_null = (analysis != 0); return 0; } /* re_compile_fastmap */ @@ -3840,7 +3703,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) int anchored_start = 0; /* Nonzero if we have to concern multibyte character. */ - int multibyte = bufp->multibyte; + const boolean multibyte = RE_MULTIBYTE_P (bufp); /* Check for out-of-range STARTPOS. */ if (startpos < 0 || startpos > total_size) @@ -3967,11 +3830,11 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) } else /* Searching backwards. */ { - buf_ch = STRING_CHAR (d, (startpos >= size1 - ? size2 + size1 - startpos - : size1 - startpos)); - if (RE_TRANSLATE_P (translate)) - buf_ch = RE_TRANSLATE (translate, buf_ch); + int room = (startpos >= size1 + ? size2 + size1 - startpos + : size1 - startpos); + buf_ch = RE_STRING_CHAR (d, room); + buf_ch = TRANSLATE (buf_ch); if (! (buf_ch >= 0400 || fastmap[buf_ch])) @@ -4057,7 +3920,10 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) /* Declarations and macros for re_match_2. */ -static int bcmp_translate (); +static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2, + register int len, + RE_TRANSLATE_TYPE translate, + const int multibyte)); /* This converts PTR, a pointer into one of the search strings `string1' and `string2' into an offset from the beginning of that string. */ @@ -4067,7 +3933,9 @@ static int bcmp_translate (); : ((regoff_t) ((ptr) - string2 + size1))) /* Call before fetching a character with *d. This switches over to - string2 if necessary. */ + string2 if necessary. + Check re_match_2_internal for a discussion of why end_match_2 might + not be within string2 (but be equal to end_match_1 instead). */ #define PREFETCH() \ while (d == dend) \ { \ @@ -4159,9 +4027,9 @@ skip_one_char (p) p += 1 + CHARSET_BITMAP_SIZE (p - 1); break; -#ifdef emacs case syntaxspec: case notsyntaxspec: +#ifdef emacs case categoryspec: case notcategoryspec: #endif /* emacs */ @@ -4210,7 +4078,7 @@ mutually_exclusive_p (bufp, p1, p2) unsigned char *p1, *p2; { re_opcode_t op2; - const boolean multibyte = bufp->multibyte; + const boolean multibyte = RE_MULTIBYTE_P (bufp); unsigned char *pend = bufp->buffer + bufp->used; assert (p1 >= bufp->buffer && p1 < pend @@ -4357,7 +4225,6 @@ mutually_exclusive_p (bufp, p1, p2) } } -#ifdef emacs case wordend: case notsyntaxspec: return ((re_opcode_t) *p1 == syntaxspec @@ -4373,6 +4240,7 @@ mutually_exclusive_p (bufp, p1, p2) || (re_opcode_t) *p1 == syntaxspec) && p1[1] == Sword); +#ifdef emacs case categoryspec: return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]); case notcategoryspec: @@ -4490,7 +4358,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) RE_TRANSLATE_TYPE translate = bufp->translate; /* Nonzero if we have to concern multibyte character. */ - int multibyte = bufp->multibyte; + const boolean multibyte = RE_MULTIBYTE_P (bufp); /* Failure point stack. Each place that can handle a failure further down the line pushes a failure point on this stack. It consists of @@ -4597,15 +4465,6 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) for (mcnt = 1; mcnt < num_regs; mcnt++) regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE; - /* Shorten strings to `stop'. */ - if (stop <= size1) - { - size1 = stop; - size2 = 0; - } - else if (stop <= size1 + size2) - size2 = stop - size1; - /* We move `string1' into `string2' if the latter's empty -- but not if `string1' is null. */ if (size2 == 0 && string1 != NULL) @@ -4618,25 +4477,42 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) end1 = string1 + size1; end2 = string2 + size2; - /* Compute where to stop matching, within the two strings. */ - end_match_1 = end1; - end_match_2 = end2; - /* `p' scans through the pattern as `d' scans through the data. `dend' is the end of the input string that `d' points within. `d' is advanced into the following input string whenever necessary, but this happens before fetching; therefore, at the beginning of the loop, `d' can be pointing at the end of a string, but it cannot equal `string2'. */ - if (size1 > 0 && pos <= size1) + if (pos >= size1) { - d = string1 + pos; - dend = end_match_1; + /* Only match within string2. */ + d = string2 + pos - size1; + dend = end_match_2 = string2 + stop - size1; + end_match_1 = end1; /* Just to give it a value. */ } else { - d = string2 + pos - size1; - dend = end_match_2; + if (stop <= size1) + { + /* Only match within string1. */ + end_match_1 = string1 + stop; + /* BEWARE! + When we reach end_match_1, PREFETCH normally switches to string2. + But in the present case, this means that just doing a PREFETCH + makes us jump from `stop' to `gap' within the string. + What we really want here is for the search to stop as + soon as we hit end_match_1. That's why we set end_match_2 + to end_match_1 (since PREFETCH fails as soon as we hit + end_match_2). */ + end_match_2 = end_match_1; + } + else + { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + d = string1 + pos; + dend = end_match_1; } DEBUG_PRINT1 ("The compiled pattern is: "); @@ -4838,7 +4714,6 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) testing `translate' inside the loop. */ if (RE_TRANSLATE_P (translate)) { -#ifdef emacs if (multibyte) do { @@ -4862,7 +4737,6 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } while (mcnt > 0); else -#endif /* not emacs */ do { PREFETCH (); @@ -4900,17 +4774,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 ("EXECUTING anychar.\n"); PREFETCH (); - -#ifdef emacs - if (multibyte) - buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); - else -#endif /* not emacs */ - { - buf_ch = *d; - buf_charlen = 1; - } - + buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); buf_ch = TRANSLATE (buf_ch); if ((!(bufp->syntax & RE_DOT_NEWLINE) @@ -4945,27 +4809,20 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); - PREFETCH (); - c = *d; - range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); -#ifdef emacs if (range_table_exists) { range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ EXTRACT_NUMBER_AND_INCR (count, range_table); } - if (multibyte && BASE_LEADING_CODE_P (c)) - c = STRING_CHAR_AND_LENGTH (d, dend - d, len); -#endif /* emacs */ + PREFETCH (); + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); + c = TRANSLATE (c); /* The character to match. */ if (SINGLE_BYTE_CHAR_P (c)) { /* Lookup bitmap. */ - c = TRANSLATE (c); /* The character to match. */ - len = 1; - /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) @@ -5111,7 +4968,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ if (RE_TRANSLATE_P (translate) - ? bcmp_translate (d, d2, mcnt, translate) + ? bcmp_translate (d, d2, mcnt, translate, multibyte) : bcmp (d, d2, mcnt)) { d = dfail; @@ -5133,9 +4990,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { if (!bufp->not_bol) break; } - else if (d[-1] == '\n' && bufp->newline_anchor) + else { - break; + unsigned char c; + GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2); + if (c == '\n' && bufp->newline_anchor) + break; } /* In all other cases, we fail. */ goto fail; @@ -5199,6 +5059,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PUSH_FAILURE_POINT (p - 3, NULL); break; + /* A nasty loop is introduced by the non-greedy *? and +?. + With such loops, the stack only ever contains one failure point + at a time, so that a plain on_failure_jump_loop kind of + cycle detection cannot work. Worse yet, such a detection + can not only fail to detect a cycle, but it can also wrongly + detect a cycle (between different instantiations of the same + loop. + So the method used for those nasty loops is a little different: + We use a special cycle-detection-stack-frame which is pushed + when the on_failure_jump_nastyloop failure-point is *popped*. + This special frame thus marks the beginning of one iteration + through the loop and we can hence easily check right here + whether something matched between the beginning and the end of + the loop. */ + case on_failure_jump_nastyloop: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n", + mcnt, p + mcnt); + + assert ((re_opcode_t)p[-4] == no_op); + CHECK_INFINITE_LOOP (p - 4, d); + PUSH_FAILURE_POINT (p - 3, d); + break; + /* Simple loop detecting on_failure_jump: just check on the failure stack if the same spot was already hit earlier. */ @@ -5356,18 +5240,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; #ifdef emacs - int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1)); + int offset = PTR_TO_OFFSET (d - 1); + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); UPDATE_SYNTAX_TABLE (charpos); #endif - /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); #ifdef emacs UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif PREFETCH (); - /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ - c2 = STRING_CHAR (d, dend - d); + c2 = RE_STRING_CHAR (d, dend - d); s2 = SYNTAX (c2); if (/* Case 2: Only one of S1 and S2 is Sword. */ @@ -5396,12 +5279,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; #ifdef emacs - int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); + int offset = PTR_TO_OFFSET (d); + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); UPDATE_SYNTAX_TABLE (charpos); #endif PREFETCH (); - /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ - c2 = STRING_CHAR (d, dend - d); + c2 = RE_STRING_CHAR (d, dend - d); s2 = SYNTAX (c2); /* Case 2: S2 is not Sword. */ @@ -5439,7 +5322,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; #ifdef emacs - int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1); + int offset = PTR_TO_OFFSET (d) - 1; + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); UPDATE_SYNTAX_TABLE (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); @@ -5453,8 +5337,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (!AT_STRINGS_END (d)) { PREFETCH (); - /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ - c2 = STRING_CHAR (d, dend - d); + c2 = RE_STRING_CHAR (d, dend - d); #ifdef emacs UPDATE_SYNTAX_TABLE_FORWARD (charpos); #endif @@ -5468,141 +5351,66 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } break; -#ifdef emacs - case before_dot: - DEBUG_PRINT1 ("EXECUTING before_dot.\n"); - if (PTR_BYTE_POS (d) >= PT_BYTE) - goto fail; - break; - - case at_dot: - DEBUG_PRINT1 ("EXECUTING at_dot.\n"); - if (PTR_BYTE_POS (d) != PT_BYTE) - goto fail; - break; - - case after_dot: - DEBUG_PRINT1 ("EXECUTING after_dot.\n"); - if (PTR_BYTE_POS (d) <= PT_BYTE) - goto fail; - break; - case syntaxspec: - DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); + case notsyntaxspec: + not = (re_opcode_t) *(p - 1) == notsyntaxspec; mcnt = *p++; - goto matchsyntax; - - case wordchar: - DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); - mcnt = (int) Sword; - matchsyntax: + DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt); PREFETCH (); #ifdef emacs { - int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); + int offset = PTR_TO_OFFSET (d); + int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset); UPDATE_SYNTAX_TABLE (pos1); } #endif { int c, len; - if (multibyte) - /* we must concern about multibyte form, ... */ - c = STRING_CHAR_AND_LENGTH (d, dend - d, len); - else - /* everything should be handled as ASCII, even though it - looks like multibyte form. */ - c = *d, len = 1; + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); - if (SYNTAX (c) != (enum syntaxcode) mcnt) - goto fail; + if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) + goto fail; d += len; } break; - case notsyntaxspec: - DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); - mcnt = *p++; - goto matchnotsyntax; - - case notwordchar: - DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); - mcnt = (int) Sword; - matchnotsyntax: - PREFETCH (); #ifdef emacs - { - int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); - UPDATE_SYNTAX_TABLE (pos1); - } -#endif - { - int c, len; - - if (multibyte) - c = STRING_CHAR_AND_LENGTH (d, dend - d, len); - else - c = *d, len = 1; - - if (SYNTAX (c) == (enum syntaxcode) mcnt) + case before_dot: + DEBUG_PRINT1 ("EXECUTING before_dot.\n"); + if (PTR_BYTE_POS (d) >= PT_BYTE) goto fail; - d += len; - } break; - case categoryspec: - DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p); - mcnt = *p++; - PREFETCH (); - { - int c, len; - - if (multibyte) - c = STRING_CHAR_AND_LENGTH (d, dend - d, len); - else - c = *d, len = 1; + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_BYTE_POS (d) != PT_BYTE) + goto fail; + break; - if (!CHAR_HAS_CATEGORY (c, mcnt)) - goto fail; - d += len; - } + case after_dot: + DEBUG_PRINT1 ("EXECUTING after_dot.\n"); + if (PTR_BYTE_POS (d) <= PT_BYTE) + goto fail; break; + case categoryspec: case notcategoryspec: - DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p); + not = (re_opcode_t) *(p - 1) == notcategoryspec; mcnt = *p++; + DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); PREFETCH (); { int c, len; + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); - if (multibyte) - c = STRING_CHAR_AND_LENGTH (d, dend - d, len); - else - c = *d, len = 1; - - if (CHAR_HAS_CATEGORY (c, mcnt)) + if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) goto fail; d += len; } - break; - -#else /* not emacs */ - case wordchar: - DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); - PREFETCH (); - if (!WORDCHAR_P (d)) - goto fail; - d++; break; - case notwordchar: - DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); - PREFETCH (); - if (WORDCHAR_P (d)) - goto fail; - d++; - break; -#endif /* not emacs */ +#endif /* emacs */ default: abort (); @@ -5626,6 +5434,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) assert (str == NULL); goto continue_failure_jump; + case on_failure_jump_nastyloop: + assert ((re_opcode_t)pat[-2] == no_op); + PUSH_FAILURE_POINT (pat - 2, str); + /* Fallthrough */ + case on_failure_jump_loop: case on_failure_jump: case succeed_n: @@ -5635,6 +5448,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) p = pat + mcnt; break; + case no_op: + /* A special frame used for nastyloops. */ + goto fail; + default: abort(); } @@ -5662,23 +5479,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) bytes; nonzero otherwise. */ static int -bcmp_translate (s1, s2, len, translate) - unsigned char *s1, *s2; +bcmp_translate (s1, s2, len, translate, multibyte) + re_char *s1, *s2; register int len; RE_TRANSLATE_TYPE translate; + const int multibyte; { - register unsigned char *p1 = s1, *p2 = s2; - unsigned char *p1_end = s1 + len; - unsigned char *p2_end = s2 + len; + register re_char *p1 = s1, *p2 = s2; + re_char *p1_end = s1 + len; + re_char *p2_end = s2 + len; while (p1 != p1_end && p2 != p2_end) { int p1_charlen, p2_charlen; int p1_ch, p2_ch; - /* FIXME: This assumes `multibyte = true'. */ - p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); - p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); + p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); + p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); if (RE_TRANSLATE (translate, p1_ch) != RE_TRANSLATE (translate, p2_ch))