X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=3f951afe637af260675c552be6fcf58b3d4401c9;hb=2a2630ca8cda9ea5bb9b505d712e883998baeca3;hp=20ea8aebdda8fda11008b6966a07cc22504e3162;hpb=2f64731917c81d726d60c510c82028a0f1c5d72a;p=gnulib.git diff --git a/regex.c b/regex.c index 20ea8aebd..3f951afe6 100644 --- a/regex.c +++ b/regex.c @@ -2,7 +2,7 @@ 0.12. (Implements POSIX draft P10003.2/D11.2, except for internationalization features.) - Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. + Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,6 +27,16 @@ #undef _GNU_SOURCE #define _GNU_SOURCE +#ifdef emacs +/* Converts the pointer to the char to BEG-based offset from the start. */ +#define PTR_TO_OFFSET(d) \ + POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \ + ? (d) - string1 : (d) - (string2 - size1)) +#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) +#else +#define PTR_TO_OFFSET(d) 0 +#endif + #ifdef HAVE_CONFIG_H #include #endif @@ -53,7 +63,17 @@ #include "lisp.h" #include "buffer.h" + +/* Make syntax table lookup grant data in gl_state. */ +#define SYNTAX_ENTRY_VIA_PROPERTY + #include "syntax.h" +#include "charset.h" +#include "category.h" + +#define malloc xmalloc +#define realloc xrealloc +#define free xfree #else /* not emacs */ @@ -153,6 +173,19 @@ init_syntax_once () #define SYNTAX(c) re_syntax_table[c] +/* Dummy macros for non-Emacs environments. */ +#define BASE_LEADING_CODE_P(c) (0) +#define WORD_BOUNDARY_P(c1, c2) (0) +#define CHAR_HEAD_P(p) (1) +#define SINGLE_BYTE_CHAR_P(c) (1) +#define SAME_CHARSET_P(c1, c2) (1) +#define MULTIBYTE_FORM_LENGTH(p, s) (1) +#define STRING_CHAR(p, s) (*(p)) +#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) +#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \ + (c = ((p) == (end1) ? *(str2) : *(p))) +#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ + (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) #endif /* not emacs */ /* Get the interface, including the syntax bits. */ @@ -462,7 +495,17 @@ typedef enum syntaxspec, /* Matches any character whose syntax is not that specified. */ - notsyntaxspec + notsyntaxspec, + + /* Matches any character whose category-set contains the specified + category. The operator is followed by a byte which contains a + category code (mnemonic ASCII character). */ + categoryspec, + + /* Matches any character whose category-set does not contain the + specified category. The operator is followed by a byte which + contains the category code (mnemonic ASCII character). */ + notcategoryspec #endif /* emacs */ } re_opcode_t; @@ -540,6 +583,93 @@ extract_number_and_incr (destination, source) #endif /* DEBUG */ +/* Store a multibyte character in three contiguous bytes starting + DESTINATION, and increment DESTINATION to the byte after where the + character is stored. Therefore, DESTINATION must be an lvalue. */ + +#define STORE_CHARACTER_AND_INCR(destination, character) \ + do { \ + (destination)[0] = (character) & 0377; \ + (destination)[1] = ((character) >> 8) & 0377; \ + (destination)[2] = (character) >> 16; \ + (destination) += 3; \ + } while (0) + +/* Put into DESTINATION a character stored in three contiguous bytes + starting at SOURCE. */ + +#define EXTRACT_CHARACTER(destination, source) \ + do { \ + (destination) = ((source)[0] \ + | ((source)[1] << 8) \ + | ((source)[2] << 16)); \ + } while (0) + + +/* Macros for charset. */ + +/* Size of bitmap of charset P in bytes. P is a start of charset, + i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */ +#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F) + +/* Nonzero if charset P has range table. */ +#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80) + +/* Return the address of range table of charset P. But not the start + of table itself, but the before where the number of ranges is + stored. `2 +' means to skip re_opcode_t and size of bitmap. */ +#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)]) + +/* Test if C is listed in the bitmap of charset P. */ +#define CHARSET_LOOKUP_BITMAP(p, c) \ + ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \ + && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH))) + +/* Return the address of end of RANGE_TABLE. COUNT is number of + ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2' + is start of range and end of range. `* 3' is size of each start + and end. */ +#define CHARSET_RANGE_TABLE_END(range_table, count) \ + ((range_table) + (count) * 2 * 3) + +/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in. + COUNT is number of ranges in RANGE_TABLE. */ +#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ + do \ + { \ + int range_start, range_end; \ + unsigned char *p; \ + unsigned char *range_table_end \ + = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ + \ + for (p = (range_table); p < range_table_end; p += 2 * 3) \ + { \ + EXTRACT_CHARACTER (range_start, p); \ + EXTRACT_CHARACTER (range_end, p + 3); \ + \ + if (range_start <= (c) && (c) <= range_end) \ + { \ + (not) = !(not); \ + break; \ + } \ + } \ + } \ + while (0) + +/* Test if C is in range table of CHARSET. The flag NOT is negated if + C is listed in it. */ +#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \ + do \ + { \ + /* Number of ranges in range table. */ \ + int count; \ + unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \ + \ + EXTRACT_NUMBER_AND_INCR (count, range_table); \ + CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ + } \ + while (0) + /* If DEBUG is defined, Regex prints many voluminous messages about what it is doing (if the variable `debug' is nonzero). If linked with the main program in `iregex.c', you can enter patterns and strings @@ -995,23 +1125,25 @@ static const char *re_error_msgid[] = REGEX_ALLOCATE_STACK. */ -/* Number of failure points for which to initially allocate space +/* Approximate number of failure points for which to initially allocate space when matching. If this number is exceeded, we allocate more space, so it is not a hard limit. */ #ifndef INIT_FAILURE_ALLOC -#define INIT_FAILURE_ALLOC 5 +#define INIT_FAILURE_ALLOC 20 #endif /* Roughly the maximum number of failure points on the stack. Would be - exactly that if always used MAX_FAILURE_ITEMS items each time we failed. + exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. This is a variable only so users of regex can assign to it; we never change it ourselves. */ #if defined (MATCH_MAY_ALLOCATE) -/* 4400 was enough to cause a crash on Alpha OSF/1, - whose default stack limit is 2mb. */ -int re_max_failures = 20000; +/* Note that 4400 is enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. In order for a larger + value to work reliably, you have to try to make it accord + with the process stack limit. */ +int re_max_failures = 40000; #else -int re_max_failures = 2000; +int re_max_failures = 4000; #endif union fail_stack_elt @@ -1041,7 +1173,8 @@ typedef struct #define INIT_FAIL_STACK() \ do { \ fail_stack.stack = (fail_stack_elt_t *) \ - REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \ + * sizeof (fail_stack_elt_t)); \ \ if (fail_stack.stack == NULL) \ return -2; \ @@ -1061,24 +1194,40 @@ typedef struct #endif -/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. +/* Double the size of FAIL_STACK, up to a limit + which allows approximately `re_max_failures' items. Return 1 if succeeds, and 0 if either ran out of memory allocating space for it or it was already too large. REGEX_REALLOCATE_STACK requires `destination' be declared. */ -#define DOUBLE_FAIL_STACK(fail_stack) \ - ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ +/* Factor to increase the failure stack size by + when we increase it. + This used to be 2, but 2 was too wasteful + because the old discarded stacks added up to as much space + were as ultimate, maximum-size stack. */ +#define FAIL_STACK_GROWTH_FACTOR 4 + +#define GROW_FAIL_STACK(fail_stack) \ + (((fail_stack).size * sizeof (fail_stack_elt_t) \ + >= re_max_failures * TYPICAL_FAILURE_SIZE) \ ? 0 \ - : ((fail_stack).stack = (fail_stack_elt_t *) \ + : ((fail_stack).stack \ + = (fail_stack_elt_t *) \ REGEX_REALLOCATE_STACK ((fail_stack).stack, \ (fail_stack).size * sizeof (fail_stack_elt_t), \ - ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ + ((fail_stack).size * sizeof (fail_stack_elt_t) \ + * FAIL_STACK_GROWTH_FACTOR))), \ \ (fail_stack).stack == NULL \ ? 0 \ - : ((fail_stack).size <<= 1, \ + : ((fail_stack).size \ + = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ + ((fail_stack).size * sizeof (fail_stack_elt_t) \ + * FAIL_STACK_GROWTH_FACTOR)) \ + / sizeof (fail_stack_elt_t)), \ 1))) @@ -1087,7 +1236,7 @@ typedef struct space to do so. */ #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ ((FAIL_STACK_FULL () \ - && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ + && !GROW_FAIL_STACK (FAIL_STACK)) \ ? 0 \ : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1)) @@ -1130,7 +1279,7 @@ typedef struct if we ever fail back to it. Requires variables fail_stack, regstart, regend, reg_info, and - num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be + num_regs be declared. GROW_FAIL_STACK requires `destination' be declared. Does `return FAILURE_CODE' if runs out of memory. */ @@ -1154,7 +1303,7 @@ typedef struct /* Ensure we have enough space allocated for what we will push. */ \ while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ { \ - if (!DOUBLE_FAIL_STACK (fail_stack)) \ + if (!GROW_FAIL_STACK (fail_stack)) \ return failure_code; \ \ DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ @@ -1221,13 +1370,14 @@ typedef struct #define NUM_NONREG_ITEMS 4 #endif -/* We push at most this many items on the stack. */ -/* We used to use (num_regs - 1), which is the number of registers - this regexp will save; but that was changed to 5 - to avoid stack overflow for a regexp with lots of parens. */ -#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) +/* Estimate the size of data pushed by a typical failure stack entry. + An estimate is all we need, because all we use this for + is to choose a limit for how big to make the failure stack. */ + +#define TYPICAL_FAILURE_SIZE 20 -/* We actually push this many items. */ +/* This is how many items we actually use for a failure point. + It depends on the regexp. */ #define NUM_FAILURE_ITEMS \ (((0 \ ? 0 : highest_active_reg - lowest_active_reg + 1) \ @@ -1394,7 +1544,7 @@ static reg_errcode_t compile_range (); #define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ - if (translate) c = (unsigned char) translate[c]; \ + if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ } while (0) #endif @@ -1415,7 +1565,8 @@ static reg_errcode_t compile_range (); when we use a character as a subscript we must make it unsigned. */ #ifndef TRANSLATE #define TRANSLATE(d) \ - (translate ? (char) translate[(unsigned char) (d)] : (d)) + (RE_TRANSLATE_P (translate) \ + ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) #endif @@ -1553,6 +1704,51 @@ typedef struct #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) +/* Structure to manage work area for range table. */ +struct range_table_work_area +{ + int *table; /* actual work area. */ + int allocated; /* allocated size for work area in bytes. */ + int used; /* actually used size in words. */ +}; + +/* Make sure that WORK_AREA can hold more N multibyte characters. */ +#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \ + do { \ + if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \ + { \ + (work_area).allocated += 16 * sizeof (int); \ + if ((work_area).table) \ + (work_area).table \ + = (int *) realloc ((work_area).table, (work_area).allocated); \ + else \ + (work_area).table \ + = (int *) malloc ((work_area).allocated); \ + if ((work_area).table == 0) \ + FREE_STACK_RETURN (REG_ESPACE); \ + } \ + } while (0) + +/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ +#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ + do { \ + EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ + (work_area).table[(work_area).used++] = (range_start); \ + (work_area).table[(work_area).used++] = (range_end); \ + } while (0) + +/* Free allocated memory for WORK_AREA. */ +#define FREE_RANGE_TABLE_WORK_AREA(work_area) \ + do { \ + if ((work_area).table) \ + free ((work_area).table); \ + } while (0) + +#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0) +#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) +#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) + + /* Set the bit for character C in a list. */ #define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ @@ -1654,7 +1850,11 @@ regex_grow_registers (num_regs) /* Return, freeing storage we allocated. */ #define FREE_STACK_RETURN(value) \ - return (free (compile_stack.stack), value) + do { \ + FREE_RANGE_TABLE_WORK_AREA (range_table_work); \ + free (compile_stack.stack); \ + return value; \ + } while (0) static reg_errcode_t regex_compile (pattern, size, syntax, bufp) @@ -1666,7 +1866,7 @@ regex_compile (pattern, size, syntax, bufp) /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ - register unsigned char c, c1; + register unsigned int c, c1; /* A random temporary spot in PATTERN. */ const char *p1; @@ -1678,7 +1878,12 @@ regex_compile (pattern, size, syntax, bufp) compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ +#ifdef AIX + /* `const' makes AIX compiler fail. */ + char *p = pattern; +#else const char *p = pattern; +#endif const char *pend = pattern + size; /* How to translate the characters in the pattern. */ @@ -1712,6 +1917,9 @@ regex_compile (pattern, size, syntax, bufp) number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0; + /* Work area for range table of charset. */ + struct range_table_work_area range_table_work; + #ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) @@ -1732,6 +1940,9 @@ regex_compile (pattern, size, syntax, bufp) compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; + range_table_work.table = 0; + range_table_work.allocated = 0; + /* Initialize the pattern buffer. */ bufp->syntax = syntax; bufp->fastmap_accurate = 0; @@ -1745,6 +1956,14 @@ regex_compile (pattern, size, syntax, bufp) /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0; +#ifdef emacs + /* bufp->multibyte is set before regex_compile is called, so don't alter + it. */ +#else /* not emacs */ + /* Nothing is recognized as a multibyte character. */ + bufp->multibyte = 0; +#endif + #if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once (); @@ -1899,9 +2118,10 @@ regex_compile (pattern, size, syntax, bufp) incremented `p', by the way, to be the character after the `*'. Do we have to do something analogous here for null bytes, because of RE_DOT_NOT_NULL? */ - if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.') && zero_times_ok - && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && p < pend + && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP (jump, b, laststart); @@ -1947,7 +2167,7 @@ regex_compile (pattern, size, syntax, bufp) case '[': { - boolean had_char_class = false; + CLEAR_RANGE_TABLE_WORK_USED (range_table_work); if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -1980,6 +2200,9 @@ regex_compile (pattern, size, syntax, bufp) /* Read in characters and ranges, setting map bits. */ for (;;) { + int len; + boolean escaped_char = false; + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); PATFETCH (c); @@ -1989,52 +2212,38 @@ regex_compile (pattern, size, syntax, bufp) { if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); - PATFETCH (c1); - SET_LIST_BIT (c1); - continue; + PATFETCH (c); + escaped_char = true; } - - /* Could be the end of the bracket expression. If it's - not (i.e., when the bracket expression is `[]' so - far), the ']' character bit gets set way below. */ - if (c == ']' && p != p1 + 1) - break; - - /* Look ahead to see if it's a range when the last thing - was a character class. */ - if (had_char_class && c == '-' && *p != ']') - FREE_STACK_RETURN (REG_ERANGE); - - /* Look ahead to see if it's a range when the last thing - was a character: if this is a hyphen not at the - beginning or the end of a list, then it's the range - operator. */ - if (c == '-' - && !(p - 2 >= pattern && p[-2] == '[') - && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') - && *p != ']') + else { - reg_errcode_t ret - = compile_range (&p, pend, translate, syntax, b); - if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; } - else if (p[0] == '-' && p[1] != ']') - { /* This handles ranges made up of characters only. */ - reg_errcode_t ret; - - /* Move past the `-'. */ - PATFETCH (c1); - - ret = compile_range (&p, pend, translate, syntax, b); - if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + /* If C indicates start of multibyte char, get the + actual character code in C, and set the pattern + pointer P to the next character boundary. */ + if (bufp->multibyte && BASE_LEADING_CODE_P (c)) + { + PATUNFETCH; + c = STRING_CHAR_AND_LENGTH (p, pend - p, len); + p += len; } + /* What should we do for the character which is + greater than 0x7F, but not BASE_LEADING_CODE_P? + XXX */ /* See if we're at the beginning of a possible character class. */ - else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') - { /* Leave room for the null. */ + else if (!escaped_char && + syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { + /* Leave room for the null. */ char str[CHAR_CLASS_MAX_LENGTH + 1]; PATFETCH (c); @@ -2053,9 +2262,10 @@ regex_compile (pattern, size, syntax, bufp) } str[c1] = '\0'; - /* If isn't a word bracketed by `[:' and:`]': - undo the ending character, the letters, and leave - the leading `:' and `[' (but set bits for them). */ + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and + leave the leading `:' and `[' (but set bits for + them). */ if (c == ':' && *p == ']') { int ch; @@ -2083,25 +2293,28 @@ regex_compile (pattern, size, syntax, bufp) for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { + int translated = TRANSLATE (ch); /* This was split into 3 if's to avoid an arbitrary limit in some compiler. */ if ( (is_alnum && ISALNUM (ch)) || (is_alpha && ISALPHA (ch)) || (is_blank && ISBLANK (ch)) || (is_cntrl && ISCNTRL (ch))) - SET_LIST_BIT (ch); + SET_LIST_BIT (translated); if ( (is_digit && ISDIGIT (ch)) || (is_graph && ISGRAPH (ch)) || (is_lower && ISLOWER (ch)) || (is_print && ISPRINT (ch))) - SET_LIST_BIT (ch); + SET_LIST_BIT (translated); if ( (is_punct && ISPUNCT (ch)) || (is_space && ISSPACE (ch)) || (is_upper && ISUPPER (ch)) || (is_xdigit && ISXDIGIT (ch))) - SET_LIST_BIT (ch); + SET_LIST_BIT (translated); } - had_char_class = true; + + /* Repeat the loop. */ + continue; } else { @@ -2109,15 +2322,71 @@ regex_compile (pattern, size, syntax, bufp) while (c1--) PATUNFETCH; SET_LIST_BIT ('['); - SET_LIST_BIT (':'); - had_char_class = false; + + /* Because the `:' may starts the range, we + can't simply set bit and repeat the loop. + Instead, just set it to C and handle below. */ + c = ':'; } } + + if (p < pend && p[0] == '-' && p[1] != ']') + { + + /* Discard the `-'. */ + PATFETCH (c1); + + /* Fetch the character which ends the range. */ + PATFETCH (c1); + if (bufp->multibyte && BASE_LEADING_CODE_P (c1)) + { + PATUNFETCH; + c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len); + p += len; + } + + if (SINGLE_BYTE_CHAR_P (c) + && ! SINGLE_BYTE_CHAR_P (c1)) + { + /* Handle a range such as \177-\377 in multibyte mode. + Split that into two ranges,, + the low one ending at 0237, and the high one + starting at ...040. */ + int c1_base = (c1 & ~0177) | 040; + SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); + c1 = 0237; + } + else if (!SAME_CHARSET_P (c, c1)) + FREE_STACK_RETURN (REG_ERANGE); + } else + /* Range from C to C. */ + c1 = c; + + /* Set the range ... */ + if (SINGLE_BYTE_CHAR_P (c)) + /* ... into bitmap. */ { - had_char_class = false; - SET_LIST_BIT (c); + unsigned this_char; + int range_start = c, range_end = c1; + + /* If the start is after the end, the range is empty. */ + if (range_start > range_end) + { + if (syntax & RE_NO_EMPTY_RANGES) + FREE_STACK_RETURN (REG_ERANGE); + /* Else, repeat the loop. */ + } + else + { + for (this_char = range_start; this_char <= range_end; + this_char++) + SET_LIST_BIT (TRANSLATE (this_char)); + } } + else + /* ... into range table. */ + SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); } /* Discard any (non)matching list bytes that are all 0 at the @@ -2125,6 +2394,25 @@ regex_compile (pattern, size, syntax, bufp) while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; b += b[-1]; + + /* Build real range table from work area. */ + if (RANGE_TABLE_WORK_USED (range_table_work)) + { + int i; + int used = RANGE_TABLE_WORK_USED (range_table_work); + + /* Allocate space for COUNT + RANGE_TABLE. Needs two + bytes for COUNT and three bytes for each character. */ + GET_BUFFER_SPACE (2 + used * 3); + + /* Indicate the existence of range table. */ + laststart[1] |= 0x80; + + STORE_NUMBER_AND_INCR (b, used / 2); + for (i = 0; i < used; i++) + STORE_CHARACTER_AND_INCR + (b, RANGE_TABLE_WORK_ELT (range_table_work, i)); + } } break; @@ -2521,6 +2809,18 @@ regex_compile (pattern, size, syntax, bufp) PATFETCH (c); BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); break; + + case 'c': + laststart = b; + PATFETCH_RAW (c); + BUF_PUSH_2 (categoryspec, c); + break; + + case 'C': + laststart = b; + PATFETCH_RAW (c); + BUF_PUSH_2 (notcategoryspec, c); + break; #endif /* emacs */ @@ -2600,6 +2900,16 @@ regex_compile (pattern, size, syntax, bufp) default: /* Expects the character in `c'. */ normal_char: + p1 = p - 1; /* P1 points the head of C. */ +#ifdef emacs + if (bufp->multibyte) + { + c = STRING_CHAR (p1, pend - p1); + c = TRANSLATE (c); + /* Set P to the next character boundary. */ + p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; + } +#endif /* If no exactn currently being built. */ if (!pending_exact @@ -2607,17 +2917,17 @@ regex_compile (pattern, size, syntax, bufp) || pending_exact + *pending_exact + 1 != b /* We have only one byte following the exactn for the count. */ - || *pending_exact == (1 << BYTEWIDTH) - 1 + || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) /* If followed by a repetition operator. */ - || *p == '*' || *p == '^' + || (p != pend && (*p == '*' || *p == '^')) || ((syntax & RE_BK_PLUS_QM) - ? *p == '\\' && (p[1] == '+' || p[1] == '?') - : (*p == '+' || *p == '?')) + ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') + : p != pend && (*p == '+' || *p == '?')) || ((syntax & RE_INTERVALS) && ((syntax & RE_NO_BK_BRACES) - ? *p == '{' - : (p[0] == '\\' && p[1] == '{')))) + ? p != pend && *p == '{' + : p + 1 < pend && p[0] == '\\' && p[1] == '{'))) { /* Start building a new exactn. */ @@ -2627,8 +2937,24 @@ regex_compile (pattern, size, syntax, bufp) pending_exact = b - 1; } - BUF_PUSH (c); - (*pending_exact)++; +#ifdef emacs + if (! SINGLE_BYTE_CHAR_P (c)) + { + unsigned char work[4], *str; + int i = CHAR_STRING (c, work, str); + int j; + for (j = 0; j < i; j++) + { + BUF_PUSH (str[j]); + (*pending_exact)++; + } + } + else +#endif + { + BUF_PUSH (c); + (*pending_exact)++; + } break; } /* switch (c) */ } /* while p != pend */ @@ -2667,12 +2993,9 @@ regex_compile (pattern, size, syntax, bufp) { int num_regs = bufp->re_nsub + 1; - /* Since DOUBLE_FAIL_STACK refuses to double only if the current size - is strictly greater than re_max_failures, the largest possible stack - is 2 * re_max_failures failure points. */ - if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) + if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE) { - fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); + fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE; #ifdef emacs if (! fail_stack.stack) @@ -2832,64 +3155,6 @@ group_in_compile_stack (compile_stack, regnum) return false; } - - -/* Read the ending character of a range (in a bracket expression) from the - uncompiled pattern *P_PTR (which ends at PEND). We assume the - starting character is in `P[-2]'. (`P[-1]' is the character `-'.) - Then we set the translation of all bits between the starting and - ending characters (inclusive) in the compiled pattern B. - - Return an error code. - - We use these short variable names so we can use the same macros as - `regex_compile' itself. */ - -static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) - const char **p_ptr, *pend; - RE_TRANSLATE_TYPE translate; - reg_syntax_t syntax; - unsigned char *b; -{ - unsigned this_char; - - const char *p = *p_ptr; - int range_start, range_end; - - if (p == pend) - return REG_ERANGE; - - /* Even though the pattern is a signed `char *', we need to fetch - with unsigned char *'s; if the high bit of the pattern character - is set, the range endpoints will be negative if we fetch using a - signed char *. - - We also want to fetch the endpoints without translating them; the - appropriate translation is done in the bit-setting loop below. */ - /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ - range_start = ((const unsigned char *) p)[-2]; - range_end = ((const unsigned char *) p)[0]; - - /* Have to increment the pointer into the pattern string, so the - caller isn't still at the ending character. */ - (*p_ptr)++; - - /* If the start is after the end, the range is empty. */ - if (range_start > range_end) - return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; - - /* Here we see why `this_char' has to be larger than an `unsigned - char' -- the range is inclusive, so if `range_end' == 0xff - (assuming 8-bit characters), we would otherwise go into an infinite - loop, since all characters <= 0xff. */ - for (this_char = range_start; this_char <= range_end; this_char++) - { - SET_LIST_BIT (TRANSLATE (this_char)); - } - - return REG_NOERROR; -} /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -2908,7 +3173,7 @@ int re_compile_fastmap (bufp) struct re_pattern_buffer *bufp; { - int j, k; + int i, j, k; #ifdef MATCH_MAY_ALLOCATE fail_stack_type fail_stack; #endif @@ -2937,6 +3202,13 @@ re_compile_fastmap (bufp) /* We aren't doing a `succeed_n' to begin with. */ boolean succeed_n_p = false; + /* If all elements for base leading-codes in fastmap is set, this + flag is set true. */ + boolean match_any_multibyte_characters = false; + + /* Maximum code of simple (single byte) character. */ + int simple_char_max; + assert (fastmap != NULL && p != NULL); INIT_FAIL_STACK (); @@ -2988,6 +3260,7 @@ re_compile_fastmap (bufp) break; +#ifndef emacs case charset: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) @@ -3018,14 +3291,110 @@ re_compile_fastmap (bufp) if (SYNTAX (j) != Sword) fastmap[j] = 1; break; +#else /* emacs */ + case charset: + for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; + j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + + if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) + && match_any_multibyte_characters == false) + { + /* Set fastmap[I] 1 where I is a base leading code of each + multibyte character in the range table. */ + int c, count; + + /* Make P points the range table. */ + p += CHARSET_BITMAP_SIZE (&p[-2]); + + /* Extract the number of ranges in range table into + COUNT. */ + EXTRACT_NUMBER_AND_INCR (count, p); + for (; count > 0; count--, p += 2 * 3) /* XXX */ + { + /* Extract the start of each range. */ + EXTRACT_CHARACTER (c, p); + j = CHAR_CHARSET (c); + fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1; + } + } + break; + + + case charset_not: + /* Chars beyond end of bitmap are possible matches. + All the single-byte codes can occur in multibyte buffers. + So any that are not listed in the charset + are possible matches, even in multibyte buffers. */ + simple_char_max = (1 << BYTEWIDTH); + for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; + j < simple_char_max; j++) + fastmap[j] = 1; + + for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; + j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + + if (bufp->multibyte) + /* Any character set can possibly contain a character + which doesn't match the specified set of characters. */ + { + set_fastmap_for_multibyte_characters: + if (match_any_multibyte_characters == false) + { + for (j = 0x80; j < 0xA0; j++) /* XXX */ + if (BASE_LEADING_CODE_P (j)) + fastmap[j] = 1; + match_any_multibyte_characters = true; + } + } + break; + + + case wordchar: + /* All the single-byte codes can occur in multibyte buffers, + and they may have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) + if (SYNTAX (j) == Sword) + fastmap[j] = 1; + + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose syntax is `Sword'. */ + goto set_fastmap_for_multibyte_characters; + break; + + + case notwordchar: + /* All the single-byte codes can occur in multibyte buffers, + and they may not have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) + if (SYNTAX (j) != Sword) + fastmap[j] = 1; + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose syntax is not `Sword'. */ + goto set_fastmap_for_multibyte_characters; + break; +#endif case anychar: { int fastmap_newline = fastmap['\n']; - /* `.' matches anything ... */ - for (j = 0; j < (1 << BYTEWIDTH); j++) + /* `.' matches anything, except perhaps newline. + Even in a multibyte buffer, it should match any + conceivable byte value for the fastmap. */ + if (bufp->multibyte) + match_any_multibyte_characters = true; + + simple_char_max = (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) fastmap[j] = 1; /* ... except perhaps newline. */ @@ -3042,21 +3411,70 @@ re_compile_fastmap (bufp) } #ifdef emacs + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case notsyntaxspec: case syntaxspec: + /* This match depends on text properties. These end with + aborting optimizations. */ + bufp->can_be_null = 1; + goto done; +#if 0 k = *p++; - for (j = 0; j < (1 << BYTEWIDTH); j++) + simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) == (enum syntaxcode) k) fastmap[j] = 1; - break; + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose syntax is K. */ + goto set_fastmap_for_multibyte_characters; + break; case notsyntaxspec: k = *p++; - for (j = 0; j < (1 << BYTEWIDTH); j++) + simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) != (enum syntaxcode) k) fastmap[j] = 1; + + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose syntax is not K. */ + goto set_fastmap_for_multibyte_characters; break; +#endif + + case categoryspec: + k = *p++; + simple_char_max = (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) + if (CHAR_HAS_CATEGORY (j, k)) + fastmap[j] = 1; + + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose category is K. */ + goto set_fastmap_for_multibyte_characters; + break; + + + case notcategoryspec: + k = *p++; + simple_char_max = (1 << BYTEWIDTH); + for (j = 0; j < simple_char_max; j++) + if (!CHAR_HAS_CATEGORY (j, k)) + fastmap[j] = 1; + + if (bufp->multibyte) + /* Any character set can possibly contain a character + whose category is not K. */ + goto set_fastmap_for_multibyte_characters; + break; /* All cases after this match the empty string. These end with `continue'. */ @@ -3074,10 +3492,12 @@ re_compile_fastmap (bufp) case endline: case begbuf: case endbuf: +#ifndef emacs case wordbound: case notwordbound: case wordbeg: case wordend: +#endif case push_dummy_failure: continue; @@ -3246,6 +3666,13 @@ re_search (bufp, string, size, startpos, range, regs) regs, size); } +/* End address of virtual concatenation of string. */ +#define STOP_ADDR_VSTRING(P) \ + (((P) >= size1 ? string2 + size2 : string1 + size1)) + +/* Address of POS in the concatenation of virtual string. */ +#define POS_ADDR_VSTRING(POS) \ + (((POS) >= size1 ? string2 - size1 : string1) + (POS)) /* Using the compiled pattern in BUFP->buffer, first tries to match the virtual concatenation of STRING1 and STRING2, starting first at index @@ -3285,6 +3712,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) int endpos = startpos + range; int anchored_start = 0; + /* Nonzero if we have to concern multibyte character. */ + int multibyte = bufp->multibyte; + /* Check for out-of-range STARTPOS. */ if (startpos < 0 || startpos > total_size) return -1; @@ -3298,13 +3728,13 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) range = total_size - startpos; /* If the search isn't to be a backwards one, don't waste time in a - search for a pattern that must be anchored. */ + search for a pattern anchored at beginning of buffer. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) { if (startpos > 0) return -1; else - range = 1; + range = 0; } #ifdef emacs @@ -3312,8 +3742,8 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) don't keep searching past point. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) { - range = PT - startpos; - if (range <= 0) + range = PT_BYTE - BEGV_BYTE - startpos; + if (range < 0) return -1; } #endif /* emacs */ @@ -3327,6 +3757,16 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) if (bufp->buffer[0] == begline) anchored_start = 1; +#ifdef emacs + gl_state.object = re_match_object; + { + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos); + + SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); + } +#endif + /* Loop through the string, looking for a place to start matching. */ for (;;) { @@ -3349,37 +3789,69 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) the first null string. */ if (fastmap && startpos < total_size && !bufp->can_be_null) { + register const char *d; + register unsigned int buf_ch; + + d = POS_ADDR_VSTRING (startpos); + if (range > 0) /* Searching forwards. */ { - register const char *d; register int lim = 0; int irange = range; if (startpos < size1 && startpos + range >= size1) lim = range - (size1 - startpos); - d = (startpos >= size1 ? string2 - size1 : string1) + startpos; - /* Written out as an if-else to avoid testing `translate' inside the loop. */ - if (translate) - while (range > lim - && !fastmap[(unsigned char) - translate[(unsigned char) *d++]]) - range--; + if (RE_TRANSLATE_P (translate)) + { + if (multibyte) + while (range > lim) + { + int buf_charlen; + + buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, + buf_charlen); + + buf_ch = RE_TRANSLATE (translate, buf_ch); + if (buf_ch >= 0400 + || fastmap[buf_ch]) + break; + + range -= buf_charlen; + d += buf_charlen; + } + else + while (range > lim + && !fastmap[(unsigned char) + RE_TRANSLATE (translate, (unsigned char) *d)]) + { + d++; + range--; + } + } else - while (range > lim && !fastmap[(unsigned char) *d++]) - range--; + while (range > lim && !fastmap[(unsigned char) *d]) + { + d++; + range--; + } startpos += irange - range; } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); + int room = (size1 == 0 || startpos >= size1 + ? size2 + size1 - startpos + : size1 - startpos); - if (!fastmap[(unsigned char) TRANSLATE (c)]) + buf_ch = STRING_CHAR (d, room); + if (RE_TRANSLATE_P (translate)) + buf_ch = RE_TRANSLATE (translate, buf_ch); + + if (! (buf_ch >= 0400 + || fastmap[buf_ch])) goto advance; } } @@ -3408,13 +3880,56 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) break; else if (range > 0) { - range--; - startpos++; + /* Update STARTPOS to the next character boundary. */ + if (multibyte) + { + const unsigned char *p + = (const unsigned char *) POS_ADDR_VSTRING (startpos); + const unsigned char *pend + = (const unsigned char *) STOP_ADDR_VSTRING (startpos); + int len = MULTIBYTE_FORM_LENGTH (p, pend - p); + + range -= len; + if (range < 0) + break; + startpos += len; + } + else + { + range--; + startpos++; + } } else { range++; startpos--; + + /* Update STARTPOS to the previous character boundary. */ + if (multibyte) + { + const unsigned char *p + = (const unsigned char *) POS_ADDR_VSTRING (startpos); + int len = 0; + + /* Find the head of multibyte form. */ + while (!CHAR_HEAD_P (*p)) + p--, len++; + + /* Adjust it. */ +#if 0 /* XXX */ + if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1)) + ; + else +#endif + { + range += len; + if (range > 0) + break; + + startpos -= len; + } + } } } return -1; @@ -3468,6 +3983,15 @@ static boolean alt_match_null_string_p (), == Sword) /* Disabled due to a compiler bug -- see comment at case wordbound */ + +/* The comment at case wordbound is following one, but we don't use + AT_WORD_BOUNDARY anymore to support multibyte form. + + The DEC Alpha C compiler 3.x generates incorrect code for the + test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of + AT_WORD_BOUNDARY, so this code is disabled. Expanding the + macro and introducing temporary variables works around the bug. */ + #if 0 /* Test if the character before D and the one at D differ with respect to being word-constituent. */ @@ -3525,6 +4049,11 @@ re_match (bufp, string, size, pos, regs) } #endif /* not emacs */ +#ifdef emacs +/* In Emacs, this is the string or buffer in which we + are matching. It is used for looking up syntax properties. */ +Lisp_Object re_match_object; +#endif /* re_match_2 matches the compiled pattern in BUFP against the the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 @@ -3548,8 +4077,18 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) struct re_registers *regs; int stop; { - int result = re_match_2_internal (bufp, string1, size1, string2, size2, - pos, regs, stop); + int result; + +#ifdef emacs + int charpos; + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); + gl_state.object = re_match_object; + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos); + SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); +#endif + + result = re_match_2_internal (bufp, string1, size1, string2, size2, + pos, regs, stop); alloca (0); return result; } @@ -3590,6 +4129,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; + /* Nonzero if we have to concern multibyte character. */ + int multibyte = bufp->multibyte; + /* Failure point stack. Each place that can handle a failure further down the line pushes a failure point on this stack. It consists of restart, regend, and reg_info for all registers corresponding to @@ -3982,16 +4524,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ - if (translate) + if (RE_TRANSLATE_P (translate)) { - do - { - PREFETCH (); - if ((unsigned char) translate[(unsigned char) *d++] - != (unsigned char) *p++) - goto fail; - } - while (--mcnt); +#ifdef emacs + if (multibyte) + do + { + int pat_charlen, buf_charlen; + unsigned int pat_ch, buf_ch; + + PREFETCH (); + pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + + if (RE_TRANSLATE (translate, buf_ch) + != pat_ch) + goto fail; + + p += pat_charlen; + d += buf_charlen; + mcnt -= pat_charlen; + } + while (mcnt > 0); + else +#endif /* not emacs */ + do + { + PREFETCH (); + if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d) + != (unsigned char) *p++) + goto fail; + d++; + } + while (--mcnt); } else { @@ -4008,43 +4573,91 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Match any character except possibly a newline or a null. */ case anychar: - DEBUG_PRINT1 ("EXECUTING anychar.\n"); + { + int buf_charlen; + unsigned int buf_ch; - PREFETCH (); + DEBUG_PRINT1 ("EXECUTING anychar.\n"); - if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') - || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) - goto fail; + PREFETCH (); - SET_REGS_MATCHED (); - DEBUG_PRINT2 (" Matched `%d'.\n", *d); - d++; +#ifdef emacs + if (multibyte) + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + else +#endif /* not emacs */ + { + buf_ch = (unsigned char) *d; + buf_charlen = 1; + } + + buf_ch = TRANSLATE (buf_ch); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) + && buf_ch == '\n') + || ((bufp->syntax & RE_DOT_NOT_NULL) + && buf_ch == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d += buf_charlen; + } break; case charset: case charset_not: { - register unsigned char c; + register unsigned int c; boolean not = (re_opcode_t) *(p - 1) == charset_not; + int len; + + /* Start of actual range_table, or end of bitmap if there is no + range table. */ + unsigned char *range_table; + + /* Nonzero if there is range table. */ + int range_table_exists; + + /* Number of ranges of range table. Not in bytes. */ + int count; DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); PREFETCH (); - c = TRANSLATE (*d); /* The character to match. */ + c = (unsigned char) *d; + + range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ + range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); + if (range_table_exists) + EXTRACT_NUMBER_AND_INCR (count, range_table); + else + count = 0; + + if (multibyte && BASE_LEADING_CODE_P (c)) + c = STRING_CHAR_AND_LENGTH (d, dend - d, len); - /* Cast to `unsigned' instead of `unsigned char' in case the - bit list is a full 32 bytes long. */ - if (c < (unsigned) (*p * BYTEWIDTH) + if (SINGLE_BYTE_CHAR_P (c)) + { /* Lookup bitmap. */ + c = TRANSLATE (c); /* The character to match. */ + len = 1; + + /* Cast to `unsigned' instead of `unsigned char' in + case the bit list is a full 32 bytes long. */ + if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) not = !not; + } + else if (range_table_exists) + CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); - p += 1 + *p; + p = CHARSET_RANGE_TABLE_END (range_table, count); if (!not) goto fail; SET_REGS_MATCHED (); - d++; + d += len; break; } @@ -4287,7 +4900,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ - if (translate + if (RE_TRANSLATE_P (translate) ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt)) goto fail; @@ -4394,6 +5007,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) on_failure: DEBUG_PRINT1 ("EXECUTING on_failure_jump"); +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif + EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); @@ -4434,6 +5051,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* A smart repeat ends with `maybe_pop_jump'. We change it to either `pop_failure_jump' or `jump'. */ case maybe_pop_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { @@ -4488,24 +5108,42 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) else if ((re_opcode_t) *p2 == exactn || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) { - register unsigned char c + register unsigned int c = *p2 == (unsigned char) endline ? '\n' : p2[2]; - if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + if ((re_opcode_t) p1[3] == exactn) + { + if (!(multibyte /* && (c != '\n') */ + && BASE_LEADING_CODE_P (c)) + ? c != p1[5] + : (STRING_CHAR (&p2[2], pend - &p2[2]) + != STRING_CHAR (&p1[5], pend - &p1[5]))) { p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", c, p1[5]); } + } else if ((re_opcode_t) p1[3] == charset || (re_opcode_t) p1[3] == charset_not) { int not = (re_opcode_t) p1[3] == charset_not; - if (c < (unsigned char) (p1[4] * BYTEWIDTH) + if (multibyte /* && (c != '\n') */ + && BASE_LEADING_CODE_P (c)) + c = STRING_CHAR (&p2[2], pend - &p2[2]); + + /* Test if C is listed in charset (or charset_not) + at `&p1[3]'. */ + if (SINGLE_BYTE_CHAR_P (c)) + { + if (c < CHARSET_BITMAP_SIZE (&p1[3]) * BYTEWIDTH && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) not = !not; + } + else if (CHARSET_RANGE_TABLE_EXISTS_P (&p1[3])) + CHARSET_LOOKUP_RANGE_TABLE (not, c, &p1[3]); /* `not' is equal to 1 if c would match, which means that we can't change to pop_failure_jump. */ @@ -4518,29 +5156,55 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } else if ((re_opcode_t) *p2 == charset) { -#ifdef DEBUG - register unsigned char c - = *p2 == (unsigned char) endline ? '\n' : p2[2]; -#endif + if ((re_opcode_t) p1[3] == exactn) + { + register unsigned int c = p1[5]; + int not = 0; + + if (multibyte && BASE_LEADING_CODE_P (c)) + c = STRING_CHAR (&p1[5], pend - &p1[5]); + + /* Test if C is listed in charset at `p2'. */ + if (SINGLE_BYTE_CHAR_P (c)) + { + if (c < CHARSET_BITMAP_SIZE (p2) * BYTEWIDTH + && (p2[2 + c / BYTEWIDTH] + & (1 << (c % BYTEWIDTH)))) + not = !not; + } + else if (CHARSET_RANGE_TABLE_EXISTS_P (p2)) + CHARSET_LOOKUP_RANGE_TABLE (not, c, p2); - if ((re_opcode_t) p1[3] == exactn - && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] - && (p2[2 + p1[5] / BYTEWIDTH] - & (1 << (p1[5] % BYTEWIDTH))))) + if (!not) { p[-3] = (unsigned char) pop_failure_jump; - DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", - c, p1[5]); + DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); + } } - else if ((re_opcode_t) p1[3] == charset_not) + /* It is hard to list up all the character in charset + P2 if it includes multibyte character. Give up in + such case. */ + else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2)) + { + /* Now, we are sure that P2 has no range table. + So, for the size of bitmap in P2, `p2[1]' is + enough. But P1 may have range table, so the + size of bitmap table of P1 is extracted by + using macro `CHARSET_BITMAP_SIZE'. + + Since we know that all the character listed in + P2 is ASCII, it is enough to test only bitmap + table of P1. */ + + if ((re_opcode_t) p1[3] == charset_not) { int idx; - /* We win if the charset_not inside the loop - lists every character listed in the charset after. */ + /* We win if the charset_not inside the loop lists + every character listed in the charset after. */ for (idx = 0; idx < (int) p2[1]; idx++) if (! (p2[2 + idx] == 0 - || (idx < (int) p1[4] + || (idx < CHARSET_BITMAP_SIZE (&p1[3]) && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) break; @@ -4556,12 +5220,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We win if the charset inside the loop has no overlap with the one after the loop. */ for (idx = 0; - idx < (int) p2[1] && idx < (int) p1[4]; + (idx < (int) p2[1] + && idx < CHARSET_BITMAP_SIZE (&p1[3])); idx++) if ((p2[2 + idx] & p1[5 + idx]) != 0) break; - if (idx == p2[1] || idx == p1[4]) + if (idx == p2[1] + || idx == CHARSET_BITMAP_SIZE (&p1[3])) { p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); @@ -4569,6 +5235,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } } } + } p -= 2; /* Point at relative address again. */ if ((re_opcode_t) p[-1] != pop_failure_jump) { @@ -4607,6 +5274,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ @@ -4698,84 +5368,184 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) break; } -#if 0 - /* The DEC Alpha C compiler 3.x generates incorrect code for the - test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of - AT_WORD_BOUNDARY, so this code is disabled. Expanding the - macro and introducing temporary variables works around the bug. */ - case wordbound: DEBUG_PRINT1 ("EXECUTING wordbound.\n"); - if (AT_WORD_BOUNDARY (d)) - break; - goto fail; - case notwordbound: - DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); - if (AT_WORD_BOUNDARY (d)) - goto fail; - break; -#else - case wordbound: - { - boolean prevchar, thischar; + /* We SUCCEED in one of the following cases: */ - DEBUG_PRINT1 ("EXECUTING wordbound.\n"); + /* Case 1: D is at the beginning or the end of string. */ if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) break; + else + { + /* C1 is the character before D, S1 is the syntax of C1, C2 + is the character at D, and S2 is the syntax of C2. */ + int c1, c2, s1, s2; + int pos1 = PTR_TO_OFFSET (d - 1); + int charpos; + + GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); + GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); +#ifdef emacs + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); +#endif + s1 = SYNTAX (c1); +#ifdef emacs + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); +#endif + s2 = SYNTAX (c2); - prevchar = WORDCHAR_P (d - 1); - thischar = WORDCHAR_P (d); - if (prevchar != thischar) + if (/* Case 2: Only one of S1 and S2 is Sword. */ + ((s1 == Sword) != (s2 == Sword)) + /* Case 3: Both of S1 and S2 are Sword, and macro + WORD_BOUNDARY_P (C1, C2) returns nonzero. */ + || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2))) break; - goto fail; } + goto fail; case notwordbound: - { - boolean prevchar, thischar; - DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); + + /* We FAIL in one of the following cases: */ + + /* Case 1: D is at the beginning or the end of string. */ if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) goto fail; + else + { + /* C1 is the character before D, S1 is the syntax of C1, C2 + is the character at D, and S2 is the syntax of C2. */ + int c1, c2, s1, s2; + int pos1 = PTR_TO_OFFSET (d - 1); + int charpos; + + GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); + GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); +#ifdef emacs + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); +#endif + s1 = SYNTAX (c1); +#ifdef emacs + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); +#endif + s2 = SYNTAX (c2); - prevchar = WORDCHAR_P (d - 1); - thischar = WORDCHAR_P (d); - if (prevchar != thischar) + if (/* Case 2: Only one of S1 and S2 is Sword. */ + ((s1 == Sword) != (s2 == Sword)) + /* Case 3: Both of S1 and S2 are Sword, and macro + WORD_BOUNDARY_P (C1, C2) returns nonzero. */ + || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2))) goto fail; - break; } -#endif + break; case wordbeg: DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); - if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) - break; + + /* We FAIL in one of the following cases: */ + + /* Case 1: D is at the end of string. */ + if (AT_STRINGS_END (d)) goto fail; + else + { + /* C1 is the character before D, S1 is the syntax of C1, C2 + is the character at D, and S2 is the syntax of C2. */ + int c1, c2, s1, s2; + int pos1 = PTR_TO_OFFSET (d); + int charpos; + + GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); +#ifdef emacs + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); + UPDATE_SYNTAX_TABLE (charpos); +#endif + s2 = SYNTAX (c2); + + /* Case 2: S2 is not Sword. */ + if (s2 != Sword) + goto fail; + + /* Case 3: D is not at the beginning of string ... */ + if (!AT_STRINGS_BEG (d)) + { + GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); +#ifdef emacs + UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1); +#endif + s1 = SYNTAX (c1); + + /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2) + returns 0. */ + if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2)) + goto fail; + } + } + break; case wordend: DEBUG_PRINT1 ("EXECUTING wordend.\n"); - if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) - && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) - break; + + /* We FAIL in one of the following cases: */ + + /* Case 1: D is at the beginning of string. */ + if (AT_STRINGS_BEG (d)) + goto fail; + else + { + /* C1 is the character before D, S1 is the syntax of C1, C2 + is the character at D, and S2 is the syntax of C2. */ + int c1, c2, s1, s2; + int pos1 = PTR_TO_OFFSET (d); + int charpos; + + GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); +#ifdef emacs + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1); + UPDATE_SYNTAX_TABLE (charpos); +#endif + s1 = SYNTAX (c1); + + /* Case 2: S1 is not Sword. */ + if (s1 != Sword) + goto fail; + + /* Case 3: D is not at the end of string ... */ + if (!AT_STRINGS_END (d)) + { + GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); +#ifdef emacs + UPDATE_SYNTAX_TABLE_FORWARD (charpos); +#endif + s2 = SYNTAX (c2); + + /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2) + returns 0. */ + if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2)) goto fail; + } + } + break; #ifdef emacs case before_dot: DEBUG_PRINT1 ("EXECUTING before_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) >= point) + if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE) goto fail; break; case at_dot: DEBUG_PRINT1 ("EXECUTING at_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) != point) + if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE) goto fail; break; case after_dot: DEBUG_PRINT1 ("EXECUTING after_dot.\n"); - if (PTR_CHAR_POS ((unsigned char *) d) <= point) + if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE) goto fail; break; @@ -4789,10 +5559,27 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) mcnt = (int) Sword; matchsyntax: PREFETCH (); - /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ - d++; - if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) +#ifdef emacs + { + int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); + UPDATE_SYNTAX_TABLE (pos1); + } +#endif + { + int c, len; + + if (multibyte) + /* we must concern about multibyte form, ... */ + c = STRING_CHAR_AND_LENGTH (d, dend - d, len); + else + /* everything should be handled as ASCII, even though it + looks like multibyte form. */ + c = *d, len = 1; + + if (SYNTAX (c) != (enum syntaxcode) mcnt) goto fail; + d += len; + } SET_REGS_MATCHED (); break; @@ -4806,86 +5593,141 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) mcnt = (int) Sword; matchnotsyntax: PREFETCH (); - /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ - d++; - if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) +#ifdef emacs + { + int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); + UPDATE_SYNTAX_TABLE (pos1); + } +#endif + { + int c, len; + + if (multibyte) + c = STRING_CHAR_AND_LENGTH (d, dend - d, len); + else + c = *d, len = 1; + + if (SYNTAX (c) == (enum syntaxcode) mcnt) goto fail; + d += len; + } + SET_REGS_MATCHED (); + break; + + case categoryspec: + DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p); + mcnt = *p++; + PREFETCH (); + { + int c, len; + + if (multibyte) + c = STRING_CHAR_AND_LENGTH (d, dend - d, len); + else + c = *d, len = 1; + + if (!CHAR_HAS_CATEGORY (c, mcnt)) + goto fail; + d += len; + } SET_REGS_MATCHED (); break; + case notcategoryspec: + DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p); + mcnt = *p++; + PREFETCH (); + { + int c, len; + + if (multibyte) + c = STRING_CHAR_AND_LENGTH (d, dend - d, len); + else + c = *d, len = 1; + + if (CHAR_HAS_CATEGORY (c, mcnt)) + goto fail; + d += len; + } + SET_REGS_MATCHED (); + break; + #else /* not emacs */ case wordchar: - DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); + DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); PREFETCH (); - if (!WORDCHAR_P (d)) - goto fail; + if (!WORDCHAR_P (d)) + goto fail; SET_REGS_MATCHED (); - d++; + d++; break; case notwordchar: - DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); + DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); PREFETCH (); if (WORDCHAR_P (d)) - goto fail; - SET_REGS_MATCHED (); - d++; + goto fail; + SET_REGS_MATCHED (); + d++; break; #endif /* not emacs */ - default: - abort (); + default: + abort (); } - continue; /* Successfully executed one pattern command; keep going. */ + continue; /* Successfully executed one pattern command; keep going. */ /* We goto here if a matching operation fails. */ fail: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif if (!FAIL_STACK_EMPTY ()) - { /* A restart point is known. Restore to that state. */ - DEBUG_PRINT1 ("\nFAIL:\n"); - POP_FAILURE_POINT (d, p, - lowest_active_reg, highest_active_reg, - regstart, regend, reg_info); - - /* If this failure point is a dummy, try the next one. */ - if (!p) + { /* A restart point is known. Restore to that state. */ + DEBUG_PRINT1 ("\nFAIL:\n"); + POP_FAILURE_POINT (d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) goto fail; - /* If we failed to the end of the pattern, don't examine *p. */ + /* If we failed to the end of the pattern, don't examine *p. */ assert (p <= pend); - if (p < pend) - { - boolean is_a_jump_n = false; - - /* If failed to a backwards jump that's part of a repetition - loop, need to pop this failure point and use the next one. */ - switch ((re_opcode_t) *p) - { - case jump_n: - is_a_jump_n = true; - case maybe_pop_jump: - case pop_failure_jump: - case jump: - p1 = p + 1; - EXTRACT_NUMBER_AND_INCR (mcnt, p1); - p1 += mcnt; - - if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) - || (!is_a_jump_n - && (re_opcode_t) *p1 == on_failure_jump)) - goto fail; - break; - default: - /* do nothing */ ; - } - } - - if (d >= string1 && d <= end1) + if (p < pend) + { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) *p) + { + case jump_n: + is_a_jump_n = true; + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) dend = end_match_1; - } + } else - break; /* Matching at this starting point really fails. */ + break; /* Matching at this starting point really fails. */ } /* for (;;) */ if (best_regs_set) @@ -4893,7 +5735,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) FREE_VARIABLES (); - return -1; /* Failure to match. */ + return -1; /* Failure to match. */ } /* re_match_2 */ /* Subroutine definitions for re_match_2. */ @@ -4922,92 +5764,92 @@ group_match_null_string_p (p, end, reg_info) { /* Skip over opcodes that can match nothing, and return true or false, as appropriate, when we get to one that can't, or to the - matching stop_memory. */ + matching stop_memory. */ switch ((re_opcode_t) *p1) - { - /* Could be either a loop or a series of alternatives. */ - case on_failure_jump: - p1++; - EXTRACT_NUMBER_AND_INCR (mcnt, p1); + { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); - /* If the next operation is not a jump backwards in the + /* If the next operation is not a jump backwards in the pattern. */ if (mcnt >= 0) { - /* Go through the on_failure_jumps of the alternatives, - seeing if any of the alternatives cannot match nothing. - The last alternative starts with only a jump, - whereas the rest start with on_failure_jump and end - with a jump, e.g., here is the pattern for `a|b|c': + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': - /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 - /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 - /exactn/1/c + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c - So, we have to first go through the first (n-1) - alternatives and then deal with the last one separately. */ + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ - /* Deal with the first (n-1) alternatives, which start - with an on_failure_jump (see above) that jumps to right - past a jump_past_alt. */ + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ - while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) - { - /* `mcnt' holds how many bytes long the alternative - is, including the ending `jump_past_alt' and - its number. */ + while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. */ - if (!alt_match_null_string_p (p1, p1 + mcnt - 3, - reg_info)) - return false; + if (!alt_match_null_string_p (p1, p1 + mcnt - 3, + reg_info)) + return false; - /* Move to right after this alternative, including the + /* Move to right after this alternative, including the jump_past_alt. */ - p1 += mcnt; + p1 += mcnt; - /* Break if it's the beginning of an n-th alternative - that doesn't begin with an on_failure_jump. */ - if ((re_opcode_t) *p1 != on_failure_jump) - break; + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((re_opcode_t) *p1 != on_failure_jump) + break; /* Still have to check that it's not an n-th alternative that starts with an on_failure_jump. */ p1++; - EXTRACT_NUMBER_AND_INCR (mcnt, p1); - if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) - { - /* Get to the beginning of the n-th alternative. */ - p1 -= 3; - break; - } - } + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + { + /* Get to the beginning of the n-th alternative. */ + p1 -= 3; + break; + } + } - /* Deal with the last alternative: go back and get number - of the `jump_past_alt' just before it. `mcnt' contains - the length of the alternative. */ - EXTRACT_NUMBER (mcnt, p1 - 2); + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. */ + EXTRACT_NUMBER (mcnt, p1 - 2); - if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) - return false; + if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + return false; - p1 += mcnt; /* Get past the n-th alternative. */ - } /* if mcnt > 0 */ - break; + p1 += mcnt; /* Get past the n-th alternative. */ + } /* if mcnt > 0 */ + break; - case stop_memory: + case stop_memory: assert (p1[1] == **p); - *p = p1 + 2; - return true; + *p = p1 + 2; + return true; - default: - if (!common_op_match_null_string_p (&p1, end, reg_info)) - return false; - } + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } } /* while p1 < end */ return false; @@ -5029,21 +5871,21 @@ alt_match_null_string_p (p, end, reg_info) while (p1 < end) { /* Skip over opcodes that can match nothing, and break when we get - to one that can't. */ + to one that can't. */ switch ((re_opcode_t) *p1) - { - /* It's a loop. */ - case on_failure_jump: - p1++; - EXTRACT_NUMBER_AND_INCR (mcnt, p1); - p1 += mcnt; - break; + { + /* It's a loop. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + break; default: - if (!common_op_match_null_string_p (&p1, end, reg_info)) - return false; - } + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } } /* while p1 < end */ return true; @@ -5089,42 +5931,42 @@ common_op_match_null_string_p (p, end, reg_info) ret = group_match_null_string_p (&p1, end, reg_info); /* Have to set this here in case we're checking a group which - contains a group and a back reference to it. */ + contains a group and a back reference to it. */ if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) - REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; + REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; if (!ret) - return false; + return false; break; - /* If this is an optimized succeed_n for zero times, make the jump. */ + /* If this is an optimized succeed_n for zero times, make the jump. */ case jump: EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (mcnt >= 0) - p1 += mcnt; + p1 += mcnt; else - return false; + return false; break; case succeed_n: - /* Get to the number of times to succeed. */ + /* Get to the number of times to succeed. */ p1 += 2; EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (mcnt == 0) - { - p1 -= 4; - EXTRACT_NUMBER_AND_INCR (mcnt, p1); - p1 += mcnt; - } + { + p1 -= 4; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + } else - return false; + return false; break; case duplicate: if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) - return false; + return false; break; case set_number_at: @@ -5150,11 +5992,27 @@ bcmp_translate (s1, s2, len, translate) RE_TRANSLATE_TYPE translate; { register unsigned char *p1 = s1, *p2 = s2; - while (len) + unsigned char *p1_end = s1 + len; + unsigned char *p2_end = s2 + len; + + while (p1 != p1_end && p2 != p2_end) { - if (translate[*p1++] != translate[*p2++]) return 1; - len--; + int p1_charlen, p2_charlen; + int p1_ch, p2_ch; + + p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); + p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); + + if (RE_TRANSLATE (translate, p1_ch) + != RE_TRANSLATE (translate, p2_ch)) + return 1; + + p1 += p1_charlen, p2 += p2_charlen; } + + if (p1 != p1_end || p2 != p2_end) + return 1; + return 0; } @@ -5167,7 +6025,7 @@ bcmp_translate (s1, s2, len, translate) Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. - We call regex_compile to do the actual compilation. */ + We call regex_compile to do the actual compilation. */ const char * re_compile_pattern (pattern, length, bufp) @@ -5186,7 +6044,7 @@ re_compile_pattern (pattern, length, bufp) setting no_sub. */ bufp->no_sub = 0; - /* Match anchors at newline. */ + /* Match anchors at newline. */ bufp->newline_anchor = 1; ret = regex_compile (pattern, length, re_syntax_options, bufp); @@ -5196,8 +6054,8 @@ re_compile_pattern (pattern, length, bufp) return gettext (re_error_msgid[(int) ret]); } -/* Entry points compatible with 4.2 BSD regex library. We don't define - them unless specifically requested. */ +/* Entry points compatible with 4.2 BSD regex library. We don't define + them unless specifically requested. */ #if defined (_REGEX_RE_COMP) || defined (_LIBC) @@ -5227,7 +6085,7 @@ re_comp (s) { re_comp_buf.buffer = (unsigned char *) malloc (200); if (re_comp_buf.buffer == NULL) - return gettext (re_error_msgid[(int) REG_ESPACE]); + return gettext (re_error_msgid[(int) REG_ESPACE]); re_comp_buf.allocated = 200; re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); @@ -5238,7 +6096,7 @@ re_comp (s) /* Since `re_exec' always passes NULL for the `regs' argument, we don't need to initialize the pattern buffer fields which affect it. */ - /* Match anchors at newlines. */ + /* Match anchors at newlines. */ re_comp_buf.newline_anchor = 1; ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); @@ -5270,7 +6128,7 @@ re_exec (s) /* regcomp takes a regular expression as a string and compiles it. - PREG is a regex_t *. We do not expect any fields to be initialized, + PREG is a regex_t *. We do not expect any fields to be initialized, since POSIX says we shouldn't. Thus, we set `buffer' to the compiled pattern; @@ -5299,7 +6157,7 @@ re_exec (s) routine will report only success or failure, and nothing about the registers. - It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for the return codes and their meanings.) */ int @@ -5332,11 +6190,11 @@ regcomp (preg, pattern, cflags) = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE * sizeof (*(RE_TRANSLATE_TYPE)0)); if (preg->translate == NULL) - return (int) REG_ESPACE; + return (int) REG_ESPACE; /* Map uppercase characters to corresponding lowercase ones. */ for (i = 0; i < CHAR_SET_SIZE; i++) - preg->translate[i] = ISUPPER (i) ? tolower (i) : i; + preg->translate[i] = ISUPPER (i) ? tolower (i) : i; } else preg->translate = NULL; @@ -5346,7 +6204,7 @@ regcomp (preg, pattern, cflags) { /* REG_NEWLINE implies neither . nor [^...] match newline. */ syntax &= ~RE_DOT_NEWLINE; syntax |= RE_HAT_LISTS_NOT_NEWLINE; - /* It also changes the matching behavior. */ + /* It also changes the matching behavior. */ preg->newline_anchor = 1; } else @@ -5370,7 +6228,7 @@ regcomp (preg, pattern, cflags) string STRING. If NMATCH is zero or REG_NOSUB was set in the cflags argument to - `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at least NMATCH elements, and we set them to the offsets of the corresponding matched substrings. @@ -5401,7 +6259,7 @@ regexec (preg, string, nmatch, pmatch, eflags) /* The user has told us exactly how many registers to return information about, via `nmatch'. We have to pass that on to the - matching routines. */ + matching routines. */ private_preg.regs_allocated = REGS_FIXED; if (want_reg_info) @@ -5410,29 +6268,29 @@ regexec (preg, string, nmatch, pmatch, eflags) regs.start = TALLOC (nmatch, regoff_t); regs.end = TALLOC (nmatch, regoff_t); if (regs.start == NULL || regs.end == NULL) - return (int) REG_NOMATCH; + return (int) REG_NOMATCH; } /* Perform the searching operation. */ ret = re_search (&private_preg, string, len, - /* start: */ 0, /* range: */ len, - want_reg_info ? ®s : (struct re_registers *) 0); + /* start: */ 0, /* range: */ len, + want_reg_info ? ®s : (struct re_registers *) 0); /* Copy the register information to the POSIX structure. */ if (want_reg_info) { if (ret >= 0) - { - unsigned r; + { + unsigned r; - for (r = 0; r < nmatch; r++) - { - pmatch[r].rm_so = regs.start[r]; - pmatch[r].rm_eo = regs.end[r]; - } - } + for (r = 0; r < nmatch; r++) + { + pmatch[r].rm_so = regs.start[r]; + pmatch[r].rm_eo = regs.end[r]; + } + } - /* If we needed the temporary register info, free the space now. */ + /* If we needed the temporary register info, free the space now. */ free (regs.start); free (regs.end); } @@ -5458,7 +6316,7 @@ regerror (errcode, preg, errbuf, errbuf_size) if (errcode < 0 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0]))) /* Only error codes returned by the rest of the code should be passed - to this routine. If we are given anything else, or if other regex + to this routine. If we are given anything else, or if other regex code generates an invalid error code, then the program has a bug. Dump core so we can fix it. */ abort (); @@ -5470,12 +6328,12 @@ regerror (errcode, preg, errbuf, errbuf_size) if (errbuf_size != 0) { if (msg_size > errbuf_size) - { - strncpy (errbuf, msg, errbuf_size - 1); - errbuf[errbuf_size - 1] = 0; - } + { + strncpy (errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; + } else - strcpy (errbuf, msg); + strcpy (errbuf, msg); } return msg_size;