X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=591d6f14e12bf1094376b66b8943aa351e07b9fc;hb=0ab8a118f1d2e4648c24a10ab0b57f6d07239be4;hp=e10a3565f2fdefc3d8412ef94c260fd6b2e91da3;hpb=7ae0247f5b1d1fcfebca9220862348b33df0d07f;p=gnulib.git diff --git a/regex.c b/regex.c index e10a3565f..591d6f14e 100644 --- a/regex.c +++ b/regex.c @@ -23,7 +23,9 @@ - structure the opcode space into opcode+flag. - merge with glibc's regex.[ch]. - replace (succeed_n + jump_n + set_number_at) with something that doesn't - need to modify the compiled regexp. + need to modify the compiled regexp so that re_match can be reentrant. + - get rid of on_failure_jump_smart by doing the optimization in re_comp + rather than at run-time, so that re_match can be reentrant. */ /* AIX requires this to be the first thing in the file. */ @@ -47,8 +49,12 @@ /* Whether to use ISO C Amendment 1 wide char functions. Those should not be used for Emacs since it uses its own. */ +#if defined _LIBC +#define WIDE_CHAR_SUPPORT 1 +#else #define WIDE_CHAR_SUPPORT \ - (defined _LIBC || HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) + (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) +#endif /* For platform which support the ISO C amendement 1 functionality we support user defined character classes. */ @@ -122,8 +128,17 @@ # include "charset.h" # include "category.h" +# ifdef malloc +# undef malloc +# endif # define malloc xmalloc +# ifdef realloc +# undef realloc +# endif # define realloc xrealloc +# ifdef free +# undef free +# endif # define free xfree /* Converts the pointer to the char to BEG-based offset from the start. */ @@ -1312,7 +1327,8 @@ static const char *re_error_msgid[] = /* Roughly the maximum number of failure points on the stack. Would be exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. This is a variable only so users of regex can assign to it; we never - change it ourselves. */ + change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE + before using it, so it should probably be a byte-count instead. */ # if defined MATCH_MAY_ALLOCATE /* Note that 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. In order for a larger @@ -1487,9 +1503,8 @@ do { \ if (reg == -1) \ { \ /* It's a counter. */ \ - /* Here, we discard `const', which makes re_match non-reentrant. \ - Gcc gives a warning for it, which is good. */ \ - unsigned char *ptr = POP_FAILURE_POINTER (); \ + /* Here, we discard `const', making re_match non-reentrant. */ \ + unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \ reg = POP_FAILURE_INT (); \ STORE_NUMBER (ptr, reg); \ DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \ @@ -1503,19 +1518,43 @@ do { \ } \ } while (0) +/* Discard a saved register off the stack. */ +#define DISCARD_FAILURE_REG_OR_COUNT() \ +do { \ + int reg = POP_FAILURE_INT (); \ + if (reg == -1) \ + { \ + /* It's a counter. */ \ + POP_FAILURE_POINTER (); \ + reg = POP_FAILURE_INT (); \ + DEBUG_PRINT3 (" Discard counter %p = %d\n", ptr, reg); \ + } \ + else \ + { \ + POP_FAILURE_POINTER (); \ + POP_FAILURE_POINTER (); \ + DEBUG_PRINT4 (" Discard reg %d (spanning %p -> %p)\n", \ + reg, regstart[reg], regend[reg]); \ + } \ +} while (0) + /* Check that we are not stuck in an infinite loop. */ #define CHECK_INFINITE_LOOP(pat_cur, string_place) \ do { \ - int failure = TOP_FAILURE_HANDLE(); \ + int failure = TOP_FAILURE_HANDLE (); \ /* Check for infinite matching loops */ \ - while (failure > 0 && \ - (FAILURE_STR (failure) == string_place \ - || FAILURE_STR (failure) == NULL)) \ + while (failure > 0 \ + && (FAILURE_STR (failure) == string_place \ + || FAILURE_STR (failure) == NULL)) \ { \ assert (FAILURE_PAT (failure) >= bufp->buffer \ && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \ if (FAILURE_PAT (failure) == pat_cur) \ - goto fail; \ + { \ + while (fail_stack.frame < fail_stack.avail) \ + DISCARD_FAILURE_REG_OR_COUNT (); \ + goto fail; \ + } \ DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \ failure = NEXT_FAILURE_HANDLE(failure); \ } \ @@ -1565,7 +1604,7 @@ do { \ /* Estimate the size of data pushed by a typical failure stack entry. An estimate is all we need, because all we use this for is to choose a limit for how big to make the failure stack. */ - +/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */ #define TYPICAL_FAILURE_SIZE 20 /* How many items can still be added to the stack without overflowing it. */ @@ -1905,15 +1944,23 @@ struct range_table_work_area do { if (p != pend) \ { \ PATFETCH (c); \ + if (c == ' ') \ + FREE_STACK_RETURN (REG_BADBR); \ while ('0' <= c && c <= '9') \ { \ + int prev; \ if (num < 0) \ - num = 0; \ + num = 0; \ + prev = num; \ num = num * 10 + c - '0'; \ + if (num / 10 != prev) \ + FREE_STACK_RETURN (REG_BADBR); \ if (p == pend) \ - break; \ + break; \ PATFETCH (c); \ } \ + if (c == ' ') \ + FREE_STACK_RETURN (REG_BADBR); \ } \ } while (0) @@ -1952,9 +1999,10 @@ typedef int re_wchar_t; /* Map a string to the char class it names (if any). */ static re_wctype_t -re_wctype (string) - re_char *string; +re_wctype (str) + re_char *str; { + const char *string = str; if (STREQ (string, "alnum")) return RECC_ALNUM; else if (STREQ (string, "alpha")) return RECC_ALPHA; else if (STREQ (string, "word")) return RECC_WORD; @@ -1981,30 +2029,29 @@ re_iswctype (ch, cc) int ch; re_wctype_t cc; { - boolean ret = false; - switch (cc) { - case RECC_ALNUM: ret = ISALNUM (ch); - case RECC_ALPHA: ret = ISALPHA (ch); - case RECC_BLANK: ret = ISBLANK (ch); - case RECC_CNTRL: ret = ISCNTRL (ch); - case RECC_DIGIT: ret = ISDIGIT (ch); - case RECC_GRAPH: ret = ISGRAPH (ch); - case RECC_LOWER: ret = ISLOWER (ch); - case RECC_PRINT: ret = ISPRINT (ch); - case RECC_PUNCT: ret = ISPUNCT (ch); - case RECC_SPACE: ret = ISSPACE (ch); - case RECC_UPPER: ret = ISUPPER (ch); - case RECC_XDIGIT: ret = ISXDIGIT (ch); - case RECC_ASCII: ret = IS_REAL_ASCII (ch); - case RECC_NONASCII: ret = !IS_REAL_ASCII (ch); - case RECC_UNIBYTE: ret = ISUNIBYTE (ch); - case RECC_MULTIBYTE: ret = !ISUNIBYTE (ch); - case RECC_WORD: ret = ISWORD (ch); - case RECC_ERROR: ret = false; + case RECC_ALNUM: return ISALNUM (ch); + case RECC_ALPHA: return ISALPHA (ch); + case RECC_BLANK: return ISBLANK (ch); + case RECC_CNTRL: return ISCNTRL (ch); + case RECC_DIGIT: return ISDIGIT (ch); + case RECC_GRAPH: return ISGRAPH (ch); + case RECC_LOWER: return ISLOWER (ch); + case RECC_PRINT: return ISPRINT (ch); + case RECC_PUNCT: return ISPUNCT (ch); + case RECC_SPACE: return ISSPACE (ch); + case RECC_UPPER: return ISUPPER (ch); + case RECC_XDIGIT: return ISXDIGIT (ch); + case RECC_ASCII: return IS_REAL_ASCII (ch); + case RECC_NONASCII: return !IS_REAL_ASCII (ch); + case RECC_UNIBYTE: return ISUNIBYTE (ch); + case RECC_MULTIBYTE: return !ISUNIBYTE (ch); + case RECC_WORD: return ISWORD (ch); + case RECC_ERROR: return false; + default: + abort(); } - return ret; } /* Return a bit-pattern to use in the range-table bits to match multibyte @@ -2013,21 +2060,20 @@ static int re_wctype_to_bit (cc) re_wctype_t cc; { - int ret = 0; - switch (cc) { case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: - case RECC_MULTIBYTE: ret = BIT_MULTIBYTE; - case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: ret = BIT_WORD; - case RECC_LOWER: ret = BIT_LOWER; - case RECC_UPPER: ret = BIT_UPPER; - case RECC_PUNCT: ret = BIT_PUNCT; - case RECC_SPACE: ret = BIT_SPACE; + case RECC_MULTIBYTE: return BIT_MULTIBYTE; + case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; + case RECC_LOWER: return BIT_LOWER; + case RECC_UPPER: return BIT_UPPER; + case RECC_PUNCT: return BIT_PUNCT; + case RECC_SPACE: return BIT_SPACE; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: - case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: ret = 0; + case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; + default: + abort(); } - return ret; } #endif @@ -3149,20 +3195,21 @@ regex_compile (pattern, size, syntax, bufp) case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - if (syntax & RE_NO_BK_REFS) - goto normal_char; + { + regnum_t reg; - c1 = c - '0'; + if (syntax & RE_NO_BK_REFS) + goto normal_backslash; - if (c1 > regnum) - FREE_STACK_RETURN (REG_ESUBREG); + reg = c - '0'; - /* Can't back reference to a subexpression if inside of it. */ - if (group_in_compile_stack (compile_stack, (regnum_t) c1)) - goto normal_char; + /* Can't back reference to a subexpression before its end. */ + if (reg > regnum || group_in_compile_stack (compile_stack, reg)) + FREE_STACK_RETURN (REG_ESUBREG); - laststart = b; - BUF_PUSH_2 (duplicate, c1); + laststart = b; + BUF_PUSH_2 (duplicate, reg); + } break; @@ -4263,7 +4310,7 @@ mutually_exclusive_p (bufp, p1, p2) { register re_wchar_t c = (re_opcode_t) *p2 == endline ? '\n' - : RE_STRING_CHAR(p2 + 2, pend - p2 - 2); + : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); if ((re_opcode_t) *p1 == exactn) { @@ -4308,13 +4355,11 @@ mutually_exclusive_p (bufp, p1, p2) break; case charset: - case charset_not: { if ((re_opcode_t) *p1 == exactn) /* Reuse the code above. */ return mutually_exclusive_p (bufp, p2, p1); - /* It is hard to list up all the character in charset P2 if it includes multibyte character. Give up in such case. */ @@ -4330,7 +4375,7 @@ mutually_exclusive_p (bufp, p1, p2) P2 is ASCII, it is enough to test only bitmap table of P1. */ - if (*p1 == *p2) + if ((re_opcode_t) *p1 == charset) { int idx; /* We win if the charset inside the loop @@ -4349,8 +4394,7 @@ mutually_exclusive_p (bufp, p1, p2) return 1; } } - else if ((re_opcode_t) *p1 == charset - || (re_opcode_t) *p1 == charset_not) + else if ((re_opcode_t) *p1 == charset_not) { int idx; /* We win if the charset_not inside the loop lists @@ -4369,7 +4413,24 @@ mutually_exclusive_p (bufp, p1, p2) } } } + break; + case charset_not: + switch (SWITCH_ENUM_CAST (*p1)) + { + case exactn: + case charset: + /* Reuse the code above. */ + return mutually_exclusive_p (bufp, p2, p1); + case charset_not: + /* When we have two charset_not, it's very unlikely that + they don't overlap. The union of the two sets of excluded + chars should cover all possible chars, which, as a matter of + fact, is virtually impossible in multibyte buffers. */ + ; + } + break; + case wordend: case notsyntaxspec: return ((re_opcode_t) *p1 == syntaxspec @@ -5276,9 +5337,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) mcnt, p + mcnt); { re_char *p1 = p; /* Next operation. */ - /* Please don't add casts to try and shut up GCC. */ - unsigned char *p2 = p + mcnt; /* Destination of the jump. */ - unsigned char *p3 = p - 3; /* Location of the opcode. */ + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */ + unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */ p -= 3; /* Reset so that we will re-execute the instruction once it's been changed. */ @@ -5328,8 +5389,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Originally, mcnt is how many times we HAVE to succeed. */ if (mcnt != 0) { - /* Please don't add a cast to try and shut up GCC. */ - unsigned char *p2 = p + 2; /* Location of the counter. */ + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; p += 4; PUSH_NUMBER (p2, mcnt); @@ -5347,8 +5408,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Originally, this is how many times we CAN jump. */ if (mcnt != 0) { - /* Please don't add a cast to try and shut up GCC. */ - unsigned char *p2 = p + 2; /* Location of the counter. */ + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; PUSH_NUMBER (p2, mcnt); goto unconditional_jump; @@ -5364,8 +5425,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); EXTRACT_NUMBER_AND_INCR (mcnt, p); - /* Please don't add a cast to try and shut up GCC. */ - p2 = p + mcnt; + /* Here, we discard `const', making re_match non-reentrant. */ + p2 = (unsigned char*) p + mcnt; /* Signedness doesn't matter since we only copy MCNT's bits . */ EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt); @@ -5810,8 +5871,8 @@ re_exec (s) int regcomp (preg, pattern, cflags) - regex_t *preg; - const char *pattern; + regex_t *__restrict preg; + const char *__restrict pattern; int cflags; { reg_errcode_t ret; @@ -5895,8 +5956,8 @@ WEAK_ALIAS (__regcomp, regcomp) int regexec (preg, string, nmatch, pmatch, eflags) - const regex_t *preg; - const char *string; + const regex_t *__restrict preg; + const char *__restrict string; size_t nmatch; regmatch_t pmatch[]; int eflags;