X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=3f951afe637af260675c552be6fcf58b3d4401c9;hb=f979eb0420f34053899f0333b5db42816d294fbc;hp=d30a922abdf2be4c49ea39e9e35d34b162be9107;hpb=3f8c9c6b3e239d83bd7884401cf4efa21050d524;p=gnulib.git diff --git a/regex.c b/regex.c index d30a922ab..3f951afe6 100644 --- a/regex.c +++ b/regex.c @@ -2,7 +2,7 @@ 0.12. (Implements POSIX draft P10003.2/D11.2, except for internationalization features.) - Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,11 +27,15 @@ #undef _GNU_SOURCE #define _GNU_SOURCE +#ifdef emacs /* Converts the pointer to the char to BEG-based offset from the start. */ #define PTR_TO_OFFSET(d) \ POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \ ? (d) - string1 : (d) - (string2 - size1)) -#define POS_AS_IN_BUFFER(p) ((p) + 1) +#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) +#else +#define PTR_TO_OFFSET(d) 0 +#endif #ifdef HAVE_CONFIG_H #include @@ -68,6 +72,7 @@ #include "category.h" #define malloc xmalloc +#define realloc xrealloc #define free xfree #else /* not emacs */ @@ -168,7 +173,7 @@ init_syntax_once () #define SYNTAX(c) re_syntax_table[c] -/* Dummy macro for non emacs environments. */ +/* Dummy macros for non-Emacs environments. */ #define BASE_LEADING_CODE_P(c) (0) #define WORD_BOUNDARY_P(c1, c2) (0) #define CHAR_HEAD_P(p) (1) @@ -1120,23 +1125,25 @@ static const char *re_error_msgid[] = REGEX_ALLOCATE_STACK. */ -/* Number of failure points for which to initially allocate space +/* Approximate number of failure points for which to initially allocate space when matching. If this number is exceeded, we allocate more space, so it is not a hard limit. */ #ifndef INIT_FAILURE_ALLOC -#define INIT_FAILURE_ALLOC 5 +#define INIT_FAILURE_ALLOC 20 #endif /* Roughly the maximum number of failure points on the stack. Would be - exactly that if always used MAX_FAILURE_ITEMS items each time we failed. + exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. This is a variable only so users of regex can assign to it; we never change it ourselves. */ #if defined (MATCH_MAY_ALLOCATE) -/* 4400 was enough to cause a crash on Alpha OSF/1, - whose default stack limit is 2mb. */ -int re_max_failures = 20000; +/* Note that 4400 is enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. In order for a larger + value to work reliably, you have to try to make it accord + with the process stack limit. */ +int re_max_failures = 40000; #else -int re_max_failures = 2000; +int re_max_failures = 4000; #endif union fail_stack_elt @@ -1166,7 +1173,8 @@ typedef struct #define INIT_FAIL_STACK() \ do { \ fail_stack.stack = (fail_stack_elt_t *) \ - REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \ + * sizeof (fail_stack_elt_t)); \ \ if (fail_stack.stack == NULL) \ return -2; \ @@ -1186,24 +1194,40 @@ typedef struct #endif -/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. +/* Double the size of FAIL_STACK, up to a limit + which allows approximately `re_max_failures' items. Return 1 if succeeds, and 0 if either ran out of memory allocating space for it or it was already too large. REGEX_REALLOCATE_STACK requires `destination' be declared. */ -#define DOUBLE_FAIL_STACK(fail_stack) \ - ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ +/* Factor to increase the failure stack size by + when we increase it. + This used to be 2, but 2 was too wasteful + because the old discarded stacks added up to as much space + were as ultimate, maximum-size stack. */ +#define FAIL_STACK_GROWTH_FACTOR 4 + +#define GROW_FAIL_STACK(fail_stack) \ + (((fail_stack).size * sizeof (fail_stack_elt_t) \ + >= re_max_failures * TYPICAL_FAILURE_SIZE) \ ? 0 \ - : ((fail_stack).stack = (fail_stack_elt_t *) \ + : ((fail_stack).stack \ + = (fail_stack_elt_t *) \ REGEX_REALLOCATE_STACK ((fail_stack).stack, \ (fail_stack).size * sizeof (fail_stack_elt_t), \ - ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ + ((fail_stack).size * sizeof (fail_stack_elt_t) \ + * FAIL_STACK_GROWTH_FACTOR))), \ \ (fail_stack).stack == NULL \ ? 0 \ - : ((fail_stack).size <<= 1, \ + : ((fail_stack).size \ + = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ + ((fail_stack).size * sizeof (fail_stack_elt_t) \ + * FAIL_STACK_GROWTH_FACTOR)) \ + / sizeof (fail_stack_elt_t)), \ 1))) @@ -1212,7 +1236,7 @@ typedef struct space to do so. */ #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ ((FAIL_STACK_FULL () \ - && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ + && !GROW_FAIL_STACK (FAIL_STACK)) \ ? 0 \ : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1)) @@ -1255,7 +1279,7 @@ typedef struct if we ever fail back to it. Requires variables fail_stack, regstart, regend, reg_info, and - num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be + num_regs be declared. GROW_FAIL_STACK requires `destination' be declared. Does `return FAILURE_CODE' if runs out of memory. */ @@ -1279,7 +1303,7 @@ typedef struct /* Ensure we have enough space allocated for what we will push. */ \ while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ { \ - if (!DOUBLE_FAIL_STACK (fail_stack)) \ + if (!GROW_FAIL_STACK (fail_stack)) \ return failure_code; \ \ DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ @@ -1346,13 +1370,14 @@ typedef struct #define NUM_NONREG_ITEMS 4 #endif -/* We push at most this many items on the stack. */ -/* We used to use (num_regs - 1), which is the number of registers - this regexp will save; but that was changed to 5 - to avoid stack overflow for a regexp with lots of parens. */ -#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) +/* Estimate the size of data pushed by a typical failure stack entry. + An estimate is all we need, because all we use this for + is to choose a limit for how big to make the failure stack. */ -/* We actually push this many items. */ +#define TYPICAL_FAILURE_SIZE 20 + +/* This is how many items we actually use for a failure point. + It depends on the regexp. */ #define NUM_FAILURE_ITEMS \ (((0 \ ? 0 : highest_active_reg - lowest_active_reg + 1) \ @@ -1519,7 +1544,7 @@ static reg_errcode_t compile_range (); #define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ - if (translate) c = (unsigned char) translate[c]; \ + if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ } while (0) #endif @@ -1540,7 +1565,8 @@ static reg_errcode_t compile_range (); when we use a character as a subscript we must make it unsigned. */ #ifndef TRANSLATE #define TRANSLATE(d) \ - (translate ? (unsigned char) translate[(unsigned char) (d)] : (d)) + (RE_TRANSLATE_P (translate) \ + ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) #endif @@ -1852,7 +1878,12 @@ regex_compile (pattern, size, syntax, bufp) compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ +#ifdef AIX + /* `const' makes AIX compiler fail. */ + char *p = pattern; +#else const char *p = pattern; +#endif const char *pend = pattern + size; /* How to translate the characters in the pattern. */ @@ -2087,9 +2118,10 @@ regex_compile (pattern, size, syntax, bufp) incremented `p', by the way, to be the character after the `*'. Do we have to do something analogous here for null bytes, because of RE_DOT_NOT_NULL? */ - if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.') && zero_times_ok - && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && p < pend + && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP (jump, b, laststart); @@ -2185,11 +2217,11 @@ regex_compile (pattern, size, syntax, bufp) } else { - /* Could be the end of the bracket expression. If it's - not (i.e., when the bracket expression is `[]' so - far), the ']' character bit gets set way below. */ - if (c == ']' && p != p1 + 1) - break; + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; } /* If C indicates start of multibyte char, get the @@ -2210,7 +2242,8 @@ regex_compile (pattern, size, syntax, bufp) else if (!escaped_char && syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') - { /* Leave room for the null. */ + { + /* Leave room for the null. */ char str[CHAR_CLASS_MAX_LENGTH + 1]; PATFETCH (c); @@ -2312,7 +2345,18 @@ regex_compile (pattern, size, syntax, bufp) p += len; } - if (!SAME_CHARSET_P (c, c1)) + if (SINGLE_BYTE_CHAR_P (c) + && ! SINGLE_BYTE_CHAR_P (c1)) + { + /* Handle a range such as \177-\377 in multibyte mode. + Split that into two ranges,, + the low one ending at 0237, and the high one + starting at ...040. */ + int c1_base = (c1 & ~0177) | 040; + SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); + c1 = 0237; + } + else if (!SAME_CHARSET_P (c, c1)) FREE_STACK_RETURN (REG_ERANGE); } else @@ -2338,8 +2382,8 @@ regex_compile (pattern, size, syntax, bufp) for (this_char = range_start; this_char <= range_end; this_char++) SET_LIST_BIT (TRANSLATE (this_char)); + } } - } else /* ... into range table. */ SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); @@ -2859,8 +2903,12 @@ regex_compile (pattern, size, syntax, bufp) p1 = p - 1; /* P1 points the head of C. */ #ifdef emacs if (bufp->multibyte) - /* Set P to the next character boundary. */ - p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; + { + c = STRING_CHAR (p1, pend - p1); + c = TRANSLATE (c); + /* Set P to the next character boundary. */ + p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; + } #endif /* If no exactn currently being built. */ if (!pending_exact @@ -2872,14 +2920,14 @@ regex_compile (pattern, size, syntax, bufp) || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) /* If followed by a repetition operator. */ - || *p == '*' || *p == '^' + || (p != pend && (*p == '*' || *p == '^')) || ((syntax & RE_BK_PLUS_QM) - ? *p == '\\' && (p[1] == '+' || p[1] == '?') - : (*p == '+' || *p == '?')) + ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') + : p != pend && (*p == '+' || *p == '?')) || ((syntax & RE_INTERVALS) && ((syntax & RE_NO_BK_BRACES) - ? *p == '{' - : (p[0] == '\\' && p[1] == '{')))) + ? p != pend && *p == '{' + : p + 1 < pend && p[0] == '\\' && p[1] == '{'))) { /* Start building a new exactn. */ @@ -2889,16 +2937,23 @@ regex_compile (pattern, size, syntax, bufp) pending_exact = b - 1; } - /* Here, C may translated, therefore C may not equal to *P1. */ - while (1) +#ifdef emacs + if (! SINGLE_BYTE_CHAR_P (c)) { - BUF_PUSH (c); - (*pending_exact)++; - if (++p1 == p) - break; - - /* Rest of multibyte form should be copied literally. */ - c = *(unsigned char *)p1; + unsigned char work[4], *str; + int i = CHAR_STRING (c, work, str); + int j; + for (j = 0; j < i; j++) + { + BUF_PUSH (str[j]); + (*pending_exact)++; + } + } + else +#endif + { + BUF_PUSH (c); + (*pending_exact)++; } break; } /* switch (c) */ @@ -2938,12 +2993,9 @@ regex_compile (pattern, size, syntax, bufp) { int num_regs = bufp->re_nsub + 1; - /* Since DOUBLE_FAIL_STACK refuses to double only if the current size - is strictly greater than re_max_failures, the largest possible stack - is 2 * re_max_failures failure points. */ - if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) + if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE) { - fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); + fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE; #ifdef emacs if (! fail_stack.stack) @@ -3103,64 +3155,6 @@ group_in_compile_stack (compile_stack, regnum) return false; } - - -/* Read the ending character of a range (in a bracket expression) from the - uncompiled pattern *P_PTR (which ends at PEND). We assume the - starting character is in `P[-2]'. (`P[-1]' is the character `-'.) - Then we set the translation of all bits between the starting and - ending characters (inclusive) in the compiled pattern B. - - Return an error code. - - We use these short variable names so we can use the same macros as - `regex_compile' itself. */ - -static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) - const char **p_ptr, *pend; - RE_TRANSLATE_TYPE translate; - reg_syntax_t syntax; - unsigned char *b; -{ - unsigned this_char; - - const char *p = *p_ptr; - int range_start, range_end; - - if (p == pend) - return REG_ERANGE; - - /* Even though the pattern is a signed `char *', we need to fetch - with unsigned char *'s; if the high bit of the pattern character - is set, the range endpoints will be negative if we fetch using a - signed char *. - - We also want to fetch the endpoints without translating them; the - appropriate translation is done in the bit-setting loop below. */ - /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ - range_start = ((const unsigned char *) p)[-2]; - range_end = ((const unsigned char *) p)[0]; - - /* Have to increment the pointer into the pattern string, so the - caller isn't still at the ending character. */ - (*p_ptr)++; - - /* If the start is after the end, the range is empty. */ - if (range_start > range_end) - return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; - - /* Here we see why `this_char' has to be larger than an `unsigned - char' -- the range is inclusive, so if `range_end' == 0xff - (assuming 8-bit characters), we would otherwise go into an infinite - loop, since all characters <= 0xff. */ - for (this_char = range_start; this_char <= range_end; this_char++) - { - SET_LIST_BIT (TRANSLATE (this_char)); - } - - return REG_NOERROR; -} /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -3329,9 +3323,11 @@ re_compile_fastmap (bufp) case charset_not: - /* Chars beyond end of map must be allowed. End of map is - `127' if bufp->multibyte is nonzero. */ - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* Chars beyond end of bitmap are possible matches. + All the single-byte codes can occur in multibyte buffers. + So any that are not listed in the charset + are possible matches, even in multibyte buffers. */ + simple_char_max = (1 << BYTEWIDTH); for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; j < simple_char_max; j++) fastmap[j] = 1; @@ -3358,7 +3354,9 @@ re_compile_fastmap (bufp) case wordchar: - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* All the single-byte codes can occur in multibyte buffers, + and they may have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) == Sword) fastmap[j] = 1; @@ -3371,7 +3369,9 @@ re_compile_fastmap (bufp) case notwordchar: - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* All the single-byte codes can occur in multibyte buffers, + and they may not have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) != Sword) fastmap[j] = 1; @@ -3387,21 +3387,13 @@ re_compile_fastmap (bufp) { int fastmap_newline = fastmap['\n']; - /* `.' matches anything (but if bufp->multibyte is - nonzero, matches `\000' .. `\127' and possible multibyte - character) ... */ + /* `.' matches anything, except perhaps newline. + Even in a multibyte buffer, it should match any + conceivable byte value for the fastmap. */ if (bufp->multibyte) - { - simple_char_max = 0x80; - - for (j = 0x80; j < 0xA0; j++) - if (BASE_LEADING_CODE_P (j)) - fastmap[j] = 1; - match_any_multibyte_characters = true; - } - else - simple_char_max = (1 << BYTEWIDTH); + match_any_multibyte_characters = true; + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) fastmap[j] = 1; @@ -3459,7 +3451,7 @@ re_compile_fastmap (bufp) case categoryspec: k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (CHAR_HAS_CATEGORY (j, k)) fastmap[j] = 1; @@ -3473,7 +3465,7 @@ re_compile_fastmap (bufp) case notcategoryspec: k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (!CHAR_HAS_CATEGORY (j, k)) fastmap[j] = 1; @@ -3736,13 +3728,13 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) range = total_size - startpos; /* If the search isn't to be a backwards one, don't waste time in a - search for a pattern that must be anchored. */ + search for a pattern anchored at beginning of buffer. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) { if (startpos > 0) return -1; else - range = 1; + range = 0; } #ifdef emacs @@ -3750,8 +3742,8 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) don't keep searching past point. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) { - range = PT - startpos; - if (range <= 0) + range = PT_BYTE - BEGV_BYTE - startpos; + if (range < 0) return -1; } #endif /* emacs */ @@ -3766,10 +3758,13 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) anchored_start = 1; #ifdef emacs - SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, - POS_AS_IN_BUFFER (startpos > 0 - ? startpos - 1 : startpos), - 1); + gl_state.object = re_match_object; + { + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos); + + SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); + } #endif /* Loop through the string, looking for a place to start matching. */ @@ -3794,37 +3789,69 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) the first null string. */ if (fastmap && startpos < total_size && !bufp->can_be_null) { + register const char *d; + register unsigned int buf_ch; + + d = POS_ADDR_VSTRING (startpos); + if (range > 0) /* Searching forwards. */ { - register const char *d; register int lim = 0; int irange = range; if (startpos < size1 && startpos + range >= size1) lim = range - (size1 - startpos); - d = POS_ADDR_VSTRING (startpos); - /* Written out as an if-else to avoid testing `translate' inside the loop. */ - if (translate) - while (range > lim - && !fastmap[(unsigned char) - translate[(unsigned char) *d++]]) - range--; + if (RE_TRANSLATE_P (translate)) + { + if (multibyte) + while (range > lim) + { + int buf_charlen; + + buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, + buf_charlen); + + buf_ch = RE_TRANSLATE (translate, buf_ch); + if (buf_ch >= 0400 + || fastmap[buf_ch]) + break; + + range -= buf_charlen; + d += buf_charlen; + } + else + while (range > lim + && !fastmap[(unsigned char) + RE_TRANSLATE (translate, (unsigned char) *d)]) + { + d++; + range--; + } + } else - while (range > lim && !fastmap[(unsigned char) *d++]) - range--; + while (range > lim && !fastmap[(unsigned char) *d]) + { + d++; + range--; + } startpos += irange - range; } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); + int room = (size1 == 0 || startpos >= size1 + ? size2 + size1 - startpos + : size1 - startpos); + + buf_ch = STRING_CHAR (d, room); + if (RE_TRANSLATE_P (translate)) + buf_ch = RE_TRANSLATE (translate, buf_ch); - if (!fastmap[(unsigned char) TRANSLATE (c)]) + if (! (buf_ch >= 0400 + || fastmap[buf_ch])) goto advance; } } @@ -3856,8 +3883,10 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) /* Update STARTPOS to the next character boundary. */ if (multibyte) { - const unsigned char *p = POS_ADDR_VSTRING (startpos); - const unsigned char *pend = STOP_ADDR_VSTRING (startpos); + const unsigned char *p + = (const unsigned char *) POS_ADDR_VSTRING (startpos); + const unsigned char *pend + = (const unsigned char *) STOP_ADDR_VSTRING (startpos); int len = MULTIBYTE_FORM_LENGTH (p, pend - p); range -= len; @@ -3867,9 +3896,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) } else { - range--; - startpos++; - } + range--; + startpos++; + } } else { @@ -3879,11 +3908,12 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) /* Update STARTPOS to the previous character boundary. */ if (multibyte) { - const unsigned char *p = POS_ADDR_VSTRING (startpos); + const unsigned char *p + = (const unsigned char *) POS_ADDR_VSTRING (startpos); int len = 0; /* Find the head of multibyte form. */ - while (!CHAR_HEAD_P (p)) + while (!CHAR_HEAD_P (*p)) p--, len++; /* Adjust it. */ @@ -4050,13 +4080,15 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) int result; #ifdef emacs - SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, - POS_AS_IN_BUFFER (pos > 0 ? pos - 1 : pos), - 1); + int charpos; + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); + gl_state.object = re_match_object; + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos); + SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); #endif result = re_match_2_internal (bufp, string1, size1, string2, size2, - pos, regs, stop); + pos, regs, stop); alloca (0); return result; } @@ -4492,16 +4524,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ - if (translate) + if (RE_TRANSLATE_P (translate)) { - do - { - PREFETCH (); - if ((unsigned char) translate[(unsigned char) *d++] - != (unsigned char) *p++) - goto fail; - } - while (--mcnt); +#ifdef emacs + if (multibyte) + do + { + int pat_charlen, buf_charlen; + unsigned int pat_ch, buf_ch; + + PREFETCH (); + pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + + if (RE_TRANSLATE (translate, buf_ch) + != pat_ch) + goto fail; + + p += pat_charlen; + d += buf_charlen; + mcnt -= pat_charlen; + } + while (mcnt > 0); + else +#endif /* not emacs */ + do + { + PREFETCH (); + if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d) + != (unsigned char) *p++) + goto fail; + d++; + } + while (--mcnt); } else { @@ -4518,17 +4573,36 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Match any character except possibly a newline or a null. */ case anychar: - DEBUG_PRINT1 ("EXECUTING anychar.\n"); + { + int buf_charlen; + unsigned int buf_ch; - PREFETCH (); + DEBUG_PRINT1 ("EXECUTING anychar.\n"); - if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') - || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) - goto fail; + PREFETCH (); - SET_REGS_MATCHED (); - DEBUG_PRINT2 (" Matched `%d'.\n", *d); - d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1; +#ifdef emacs + if (multibyte) + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + else +#endif /* not emacs */ + { + buf_ch = (unsigned char) *d; + buf_charlen = 1; + } + + buf_ch = TRANSLATE (buf_ch); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) + && buf_ch == '\n') + || ((bufp->syntax & RE_DOT_NOT_NULL) + && buf_ch == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d += buf_charlen; + } break; @@ -4826,7 +4900,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ - if (translate + if (RE_TRANSLATE_P (translate) ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt)) goto fail; @@ -4933,6 +5007,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) on_failure: DEBUG_PRINT1 ("EXECUTING on_failure_jump"); +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif + EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); @@ -4973,6 +5051,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* A smart repeat ends with `maybe_pop_jump'. We change it to either `pop_failure_jump' or `jump'. */ case maybe_pop_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { @@ -5193,6 +5274,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ @@ -5298,15 +5382,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; int pos1 = PTR_TO_OFFSET (d - 1); + int charpos; GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); #ifdef emacs - UPDATE_SYNTAX_TABLE (pos1 ? pos1 : 1); + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); #endif s1 = SYNTAX (c1); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (pos1 + 1); + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif s2 = SYNTAX (c2); @@ -5333,15 +5419,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; int pos1 = PTR_TO_OFFSET (d - 1); + int charpos; GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); #ifdef emacs - UPDATE_SYNTAX_TABLE (pos1); + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); #endif s1 = SYNTAX (c1); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (pos1 + 1); + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif s2 = SYNTAX (c2); @@ -5368,10 +5456,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; int pos1 = PTR_TO_OFFSET (d); + int charpos; GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); #ifdef emacs - UPDATE_SYNTAX_TABLE (pos1); + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); #endif s2 = SYNTAX (c2); @@ -5384,7 +5474,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); #ifdef emacs - UPDATE_SYNTAX_TABLE_BACKWARD (pos1 - 1); + UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1); #endif s1 = SYNTAX (c1); @@ -5409,8 +5499,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ int c1, c2, s1, s2; + int pos1 = PTR_TO_OFFSET (d); + int charpos; GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); +#ifdef emacs + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1); + UPDATE_SYNTAX_TABLE (charpos); +#endif s1 = SYNTAX (c1); /* Case 2: S1 is not Sword. */ @@ -5421,6 +5517,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (!AT_STRINGS_END (d)) { GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); +#ifdef emacs + UPDATE_SYNTAX_TABLE_FORWARD (charpos); +#endif s2 = SYNTAX (c2); /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2) @@ -5434,19 +5533,19 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) #ifdef emacs case before_dot: DEBUG_PRINT1 ("EXECUTING before_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) >= PT) + if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE) goto fail; break; case at_dot: DEBUG_PRINT1 ("EXECUTING at_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) != PT) + if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE) goto fail; break; case after_dot: DEBUG_PRINT1 ("EXECUTING after_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) <= PT) + if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE) goto fail; break; @@ -5462,7 +5561,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PREFETCH (); #ifdef emacs { - int pos1 = PTR_TO_OFFSET (d); + int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); UPDATE_SYNTAX_TABLE (pos1); } #endif @@ -5496,7 +5595,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PREFETCH (); #ifdef emacs { - int pos1 = PTR_TO_OFFSET (d); + int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); UPDATE_SYNTAX_TABLE (pos1); } #endif @@ -5581,6 +5680,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We goto here if a matching operation fails. */ fail: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif if (!FAIL_STACK_EMPTY ()) { /* A restart point is known. Restore to that state. */ DEBUG_PRINT1 ("\nFAIL:\n"); @@ -5890,11 +5992,27 @@ bcmp_translate (s1, s2, len, translate) RE_TRANSLATE_TYPE translate; { register unsigned char *p1 = s1, *p2 = s2; - while (len) + unsigned char *p1_end = s1 + len; + unsigned char *p2_end = s2 + len; + + while (p1 != p1_end && p2 != p2_end) { - if (translate[*p1++] != translate[*p2++]) return 1; - len--; + int p1_charlen, p2_charlen; + int p1_ch, p2_ch; + + p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); + p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); + + if (RE_TRANSLATE (translate, p1_ch) + != RE_TRANSLATE (translate, p2_ch)) + return 1; + + p1 += p1_charlen, p2 += p2_charlen; } + + if (p1 != p1_end || p2 != p2_end) + return 1; + return 0; }