X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=regex.c;h=c9219c66c83624e8f2273ed11577ef22967a558b;hb=040e6d8836a3ca85e1d2d9b76a2503864e89dc66;hp=f8e2dcb27f5fc4b11753f8142dd20b4f147d016a;hpb=7a847ad3827234497a9ad1172109d457d21e707f;p=gnulib.git diff --git a/regex.c b/regex.c index f8e2dcb27..c9219c66c 100644 --- a/regex.c +++ b/regex.c @@ -2,7 +2,7 @@ 0.12. (Implements POSIX draft P10003.2/D11.2, except for internationalization features.) - Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1993, 1994-1998, 1999 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,11 +27,15 @@ #undef _GNU_SOURCE #define _GNU_SOURCE +#ifdef emacs /* Converts the pointer to the char to BEG-based offset from the start. */ #define PTR_TO_OFFSET(d) \ POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \ ? (d) - string1 : (d) - (string2 - size1)) -#define POS_AS_IN_BUFFER(p) ((p) + 1) +#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) +#else +#define PTR_TO_OFFSET(d) 0 +#endif #ifdef HAVE_CONFIG_H #include @@ -190,6 +194,64 @@ init_syntax_once () /* isalpha etc. are used for the character classes. */ #include +#ifdef emacs + +/* 1 if C is an ASCII character. */ +#define IS_REAL_ASCII(c) ((c) < 0200) + +/* 1 if C is a unibyte character. */ +#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c))) + +/* The Emacs definitions should not be directly affected by locales. */ + +/* In Emacs, these are only used for single-byte characters. */ +#define ISDIGIT(c) ((c) >= '0' && (c) <= '9') +#define ISCNTRL(c) ((c) < ' ') +#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \ + || ((c) >= 'a' && (c) <= 'f') \ + || ((c) >= 'A' && (c) <= 'F')) + +/* This is only used for single-byte characters. */ +#define ISBLANK(c) ((c) == ' ' || (c) == '\t') + +/* The rest must handle multibyte characters. */ + +#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ + ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ + : 1) + +#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ + ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ + : 1) + +#define ISALNUM(c) (IS_REAL_ASCII (c) \ + ? (((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z') \ + || ((c) >= '0' && (c) <= '9')) \ + : SYNTAX (c) == Sword) + +#define ISALPHA(c) (IS_REAL_ASCII (c) \ + ? (((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z')) \ + : SYNTAX (c) == Sword) + +#define ISLOWER(c) (LOWERCASEP (c)) + +#define ISPUNCT(c) (IS_REAL_ASCII (c) \ + ? ((c) > ' ' && (c) < 0177 \ + && !(((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z') \ + || ((c) >= '0' && (c) <= '9'))) \ + : SYNTAX (c) != Sword) + +#define ISSPACE(c) (SYNTAX (c) == Swhitespace) + +#define ISUPPER(c) (UPPERCASEP (c)) + +#define ISWORD(c) (SYNTAX (c) == Sword) + +#else /* not emacs */ + /* Jim Meyering writes: "... Some ctype macros are valid only for character codes that @@ -207,6 +269,16 @@ init_syntax_once () #define ISASCII(c) isascii(c) #endif +/* 1 if C is an ASCII character. */ +#define IS_REAL_ASCII(c) ((c) < 0200) + +/* This distinction is not meaningful, except in Emacs. */ +#define ISUNIBYTE(c) 1 + +#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) +#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) +#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) + #ifdef isblank #define ISBLANK(c) (ISASCII (c) && isblank (c)) #else @@ -229,6 +301,10 @@ init_syntax_once () #define ISUPPER(c) (ISASCII (c) && isupper (c)) #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) +#define ISWORD(c) ISALPHA(c) + +#endif /* not emacs */ + #ifndef NULL #define NULL (void *)0 #endif @@ -379,7 +455,15 @@ typedef enum for a bitmap saying which chars are in. Bits in each byte are ordered low-bit-first. A character is in the set if its bit is 1. A character too large to have a bit in the map is - automatically not in the set. */ + automatically not in the set. + + If the length byte has the 0x80 bit set, then that stuff + is followed by a range table: + 2 bytes of flags for character sets (low 8 bits, high 8 bits) + See RANGE_TABLE_WORK_BITS below. + 2 bytes, the number of pairs that follow + pairs, each 2 multibyte characters, + each multibyte character represented as 3 bytes. */ charset, /* Same parameters as charset, but match any character that is @@ -613,8 +697,14 @@ extract_number_and_incr (destination, source) /* Return the address of range table of charset P. But not the start of table itself, but the before where the number of ranges is - stored. `2 +' means to skip re_opcode_t and size of bitmap. */ -#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)]) + stored. `2 +' means to skip re_opcode_t and size of bitmap, + and the 2 bytes of flags at the start of the range table. */ +#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)]) + +/* Extract the bit flags that start a range table. */ +#define CHARSET_RANGE_TABLE_BITS(p) \ + ((p)[2 + CHARSET_BITMAP_SIZE (p)] \ + + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100) /* Test if C is listed in the bitmap of charset P. */ #define CHARSET_LOOKUP_BITMAP(p, c) \ @@ -787,6 +877,9 @@ print_partial_compiled_pattern (start, end) { register int c, last = -100; register int in_range = 0; + int length = *p & 0x7f; + int has_range_table = *p & 0x80; + int range_length = p[length + 2] + p[length + 3] * 0x100; printf ("/charset [%s", (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); @@ -794,7 +887,7 @@ print_partial_compiled_pattern (start, end) assert (p + *p < pend); for (c = 0; c < 256; c++) - if (c / 8 < *p + if (c / 8 < length && (p[1 + (c/8)] & (1 << (c % 8)))) { /* Are we starting a range? */ @@ -805,7 +898,7 @@ print_partial_compiled_pattern (start, end) } /* Have we broken a range? */ else if (last + 1 != c && in_range) - { + { putchar (last); in_range = 0; } @@ -816,12 +909,20 @@ print_partial_compiled_pattern (start, end) last = c; } + p += 1 + length; + if (in_range) putchar (last); putchar (']'); - p += 1 + *p; + if (has_range_table) + printf ("has-range-table"); + + /* ??? Should print the range table; for now, + just skip it. */ + if (has_range_table) + p += 4 + 6 * range_length; } break; @@ -1411,8 +1512,8 @@ typedef struct \ assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ \ - DEBUG_POP (&failure_id); \ - DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + DEBUG_POP (&failure_id.integer); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id.integer); \ \ /* If the saved string location is NULL, it came from an \ on_failure_keep_string_jump opcode, and we want to throw away the \ @@ -1540,7 +1641,7 @@ static reg_errcode_t compile_range (); #define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ - if (translate) c = RE_TRANSLATE (translate, c); \ + if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ } while (0) #endif @@ -1561,7 +1662,8 @@ static reg_errcode_t compile_range (); when we use a character as a subscript we must make it unsigned. */ #ifndef TRANSLATE #define TRANSLATE(d) \ - (translate ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) + (RE_TRANSLATE_P (translate) \ + ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) #endif @@ -1705,6 +1807,7 @@ struct range_table_work_area int *table; /* actual work area. */ int allocated; /* allocated size for work area in bytes. */ int used; /* actually used size in words. */ + int bits; /* flag to record character classes */ }; /* Make sure that WORK_AREA can hold more N multibyte characters. */ @@ -1724,6 +1827,25 @@ struct range_table_work_area } \ } while (0) +#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \ + (work_area).bits |= (bit) + +/* These bits represent the various character classes such as [:alnum:] + in a charset's range table. */ +#define BIT_ALNUM 0x1 +#define BIT_ALPHA 0x2 +#define BIT_WORD 0x4 +#define BIT_ASCII 0x8 +#define BIT_NONASCII 0x10 +#define BIT_GRAPH 0x20 +#define BIT_LOWER 0x40 +#define BIT_PRINT 0x80 +#define BIT_PUNCT 0x100 +#define BIT_SPACE 0x200 +#define BIT_UPPER 0x400 +#define BIT_UNIBYTE 0x800 +#define BIT_MULTIBYTE 0x1000 + /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ do { \ @@ -1739,8 +1861,9 @@ struct range_table_work_area free ((work_area).table); \ } while (0) -#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0) +#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0) #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) +#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits) #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) @@ -1775,7 +1898,10 @@ struct range_table_work_area || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ - || STREQ (string, "cntrl") || STREQ (string, "blank")) + || STREQ (string, "cntrl") || STREQ (string, "blank") \ + || STREQ (string, "word") \ + || STREQ (string, "ascii") || STREQ (string, "nonascii") \ + || STREQ (string, "unibyte") || STREQ (string, "multibyte")) #ifndef MATCH_MAY_ALLOCATE @@ -1873,7 +1999,12 @@ regex_compile (pattern, size, syntax, bufp) compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ +#ifdef AIX + /* `const' makes AIX compiler fail. */ + char *p = pattern; +#else const char *p = pattern; +#endif const char *pend = pattern + size; /* How to translate the characters in the pattern. */ @@ -2037,6 +2168,7 @@ regex_compile (pattern, size, syntax, bufp) /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; + char greedy = 1; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine @@ -2045,8 +2177,14 @@ regex_compile (pattern, size, syntax, bufp) for (;;) { - zero_times_ok |= c != '+'; - many_times_ok |= c != '?'; + if (!(syntax & RE_ALL_GREEDY) + && c == '?' && (zero_times_ok || many_times_ok)) + greedy = 0; + else + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + } if (p == pend) break; @@ -2087,6 +2225,8 @@ regex_compile (pattern, size, syntax, bufp) /* Now we know whether or not zero matches is allowed and also whether or not two or more matches is allowed. */ + if (greedy) + { if (many_times_ok) { /* More than one repetition is allowed, so put in at the end a backward relative jump from `b' to before the next @@ -2145,7 +2285,39 @@ regex_compile (pattern, size, syntax, bufp) INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); b += 3; } - } + + } + else /* not greedy */ + { /* I wish the greedy and non-greedy cases could be merged. */ + + if (many_times_ok) + { + /* The greedy multiple match looks like a repeat..until: + we only need a conditional jump at the end of the loop */ + GET_BUFFER_SPACE (3); + STORE_JUMP (on_failure_jump, b, laststart); + b += 3; + if (zero_times_ok) + { + /* The repeat...until naturally matches one or more. + To also match zero times, we need to first jump to + the end of the loop (its conditional jump). */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b); + b += 3; + } + } + else + { + /* non-greedy a?? */ + GET_BUFFER_SPACE (6); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + INSERT_JUMP (on_failure_jump, laststart, laststart + 6); + b += 3; + } + } + } break; @@ -2261,15 +2433,20 @@ regex_compile (pattern, size, syntax, bufp) int ch; boolean is_alnum = STREQ (str, "alnum"); boolean is_alpha = STREQ (str, "alpha"); + boolean is_ascii = STREQ (str, "ascii"); boolean is_blank = STREQ (str, "blank"); boolean is_cntrl = STREQ (str, "cntrl"); boolean is_digit = STREQ (str, "digit"); boolean is_graph = STREQ (str, "graph"); boolean is_lower = STREQ (str, "lower"); + boolean is_multibyte = STREQ (str, "multibyte"); + boolean is_nonascii = STREQ (str, "nonascii"); boolean is_print = STREQ (str, "print"); boolean is_punct = STREQ (str, "punct"); boolean is_space = STREQ (str, "space"); + boolean is_unibyte = STREQ (str, "unibyte"); boolean is_upper = STREQ (str, "upper"); + boolean is_word = STREQ (str, "word"); boolean is_xdigit = STREQ (str, "xdigit"); if (!IS_CHAR_CLASS (str)) @@ -2281,6 +2458,35 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* Most character classes in a multibyte match + just set a flag. Exceptions are is_blank, + is_digit, is_cntrl, and is_xdigit, since + they can only match ASCII characters. We + don't need to handle them for multibyte. */ + + if (bufp->multibyte) + { + int bit = 0; + + if (is_alnum) bit = BIT_ALNUM; + if (is_alpha) bit = BIT_ALPHA; + if (is_ascii) bit = BIT_ASCII; + if (is_graph) bit = BIT_GRAPH; + if (is_lower) bit = BIT_LOWER; + if (is_multibyte) bit = BIT_MULTIBYTE; + if (is_nonascii) bit = BIT_NONASCII; + if (is_print) bit = BIT_PRINT; + if (is_punct) bit = BIT_PUNCT; + if (is_space) bit = BIT_SPACE; + if (is_unibyte) bit = BIT_UNIBYTE; + if (is_upper) bit = BIT_UPPER; + if (is_word) bit = BIT_WORD; + if (bit) + SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, + bit); + } + + /* Handle character classes for ASCII characters. */ for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { int translated = TRANSLATE (ch); @@ -2301,6 +2507,14 @@ regex_compile (pattern, size, syntax, bufp) || (is_upper && ISUPPER (ch)) || (is_xdigit && ISXDIGIT (ch))) SET_LIST_BIT (translated); + if ( (is_ascii && IS_REAL_ASCII (ch)) + || (is_nonascii && !IS_REAL_ASCII (ch)) + || (is_unibyte && ISUNIBYTE (ch)) + || (is_multibyte && !ISUNIBYTE (ch))) + SET_LIST_BIT (translated); + + if ( (is_word && ISWORD (ch))) + SET_LIST_BIT (translated); } /* Repeat the loop. */ @@ -2385,19 +2599,26 @@ regex_compile (pattern, size, syntax, bufp) b[-1]--; b += b[-1]; - /* Build real range table from work area. */ - if (RANGE_TABLE_WORK_USED (range_table_work)) + /* Build real range table from work area. */ + if (RANGE_TABLE_WORK_USED (range_table_work) + || RANGE_TABLE_WORK_BITS (range_table_work)) { int i; int used = RANGE_TABLE_WORK_USED (range_table_work); /* Allocate space for COUNT + RANGE_TABLE. Needs two - bytes for COUNT and three bytes for each character. */ - GET_BUFFER_SPACE (2 + used * 3); + bytes for flags, two for COUNT, and three bytes for + each character. */ + GET_BUFFER_SPACE (4 + used * 3); /* Indicate the existence of range table. */ laststart[1] |= 0x80; + /* Store the character class flag bits into the range table. + If not in emacs, these flag bits are always 0. */ + *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff; + *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8; + STORE_NUMBER_AND_INCR (b, used / 2); for (i = 0; i < used; i++) STORE_CHARACTER_AND_INCR @@ -2893,8 +3114,12 @@ regex_compile (pattern, size, syntax, bufp) p1 = p - 1; /* P1 points the head of C. */ #ifdef emacs if (bufp->multibyte) - /* Set P to the next character boundary. */ - p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; + { + c = STRING_CHAR (p1, pend - p1); + c = TRANSLATE (c); + /* Set P to the next character boundary. */ + p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; + } #endif /* If no exactn currently being built. */ if (!pending_exact @@ -2906,14 +3131,14 @@ regex_compile (pattern, size, syntax, bufp) || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) /* If followed by a repetition operator. */ - || *p == '*' || *p == '^' + || (p != pend && (*p == '*' || *p == '^')) || ((syntax & RE_BK_PLUS_QM) - ? *p == '\\' && (p[1] == '+' || p[1] == '?') - : (*p == '+' || *p == '?')) + ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') + : p != pend && (*p == '+' || *p == '?')) || ((syntax & RE_INTERVALS) && ((syntax & RE_NO_BK_BRACES) - ? *p == '{' - : (p[0] == '\\' && p[1] == '{')))) + ? p != pend && *p == '{' + : p + 1 < pend && p[0] == '\\' && p[1] == '{'))) { /* Start building a new exactn. */ @@ -2923,16 +3148,23 @@ regex_compile (pattern, size, syntax, bufp) pending_exact = b - 1; } - /* Here, C may translated, therefore C may not equal to *P1. */ - while (1) +#ifdef emacs + if (! SINGLE_BYTE_CHAR_P (c)) + { + unsigned char str[MAX_MULTIBYTE_LENGTH]; + int i = CHAR_STRING (c, str); + int j; + for (j = 0; j < i; j++) + { + BUF_PUSH (str[j]); + (*pending_exact)++; + } + } + else +#endif { BUF_PUSH (c); (*pending_exact)++; - if (++p1 == p) - break; - - /* Rest of multibyte form should be copied literally. */ - c = *(unsigned char *)p1; } break; } /* switch (c) */ @@ -3140,6 +3372,10 @@ group_in_compile_stack (compile_stack, regnum) characters can start a string that matches the pattern. This fastmap is used by re_search to skip quickly over impossible starting points. + Character codes above (1 << BYTEWIDTH) are not represented in the + fastmap, but the leading codes are represented. Thus, the fastmap + indicates which character sets could start a match. + The caller must supply the address of a (1 << BYTEWIDTH)-byte data area as BUFP->fastmap. @@ -3241,22 +3477,30 @@ re_compile_fastmap (bufp) #ifndef emacs case charset: - for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) - if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) - fastmap[j] = 1; - break; + { + int length = (*p & 0x7f);; + p++; + for (j = length * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + } + break; case charset_not: /* Chars beyond end of map must be allowed. */ - for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) - fastmap[j] = 1; + { + int length = (*p & 0x7f);; + p++; - for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) - if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; - break; + for (j = length * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + } + break; case wordchar: for (j = 0; j < (1 << BYTEWIDTH); j++) @@ -3277,6 +3521,12 @@ re_compile_fastmap (bufp) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) fastmap[j] = 1; + /* If we can match a character class, we can match + any character set. */ + if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) + && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) + goto set_fastmap_for_multibyte_characters; + if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) && match_any_multibyte_characters == false) { @@ -3287,8 +3537,7 @@ re_compile_fastmap (bufp) /* Make P points the range table. */ p += CHARSET_BITMAP_SIZE (&p[-2]); - /* Extract the number of ranges in range table into - COUNT. */ + /* Extract the number of ranges in range table into COUNT. */ EXTRACT_NUMBER_AND_INCR (count, p); for (; count > 0; count--, p += 2 * 3) /* XXX */ { @@ -3302,9 +3551,11 @@ re_compile_fastmap (bufp) case charset_not: - /* Chars beyond end of map must be allowed. End of map is - `127' if bufp->multibyte is nonzero. */ - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* Chars beyond end of bitmap are possible matches. + All the single-byte codes can occur in multibyte buffers. + So any that are not listed in the charset + are possible matches, even in multibyte buffers. */ + simple_char_max = (1 << BYTEWIDTH); for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; j < simple_char_max; j++) fastmap[j] = 1; @@ -3331,7 +3582,9 @@ re_compile_fastmap (bufp) case wordchar: - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* All the single-byte codes can occur in multibyte buffers, + and they may have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) == Sword) fastmap[j] = 1; @@ -3344,7 +3597,9 @@ re_compile_fastmap (bufp) case notwordchar: - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + /* All the single-byte codes can occur in multibyte buffers, + and they may not have word syntax. So do consider them. */ + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (SYNTAX (j) != Sword) fastmap[j] = 1; @@ -3360,21 +3615,13 @@ re_compile_fastmap (bufp) { int fastmap_newline = fastmap['\n']; - /* `.' matches anything (but if bufp->multibyte is - nonzero, matches `\000' .. `\127' and possible multibyte - character) ... */ + /* `.' matches anything, except perhaps newline. + Even in a multibyte buffer, it should match any + conceivable byte value for the fastmap. */ if (bufp->multibyte) - { - simple_char_max = 0x80; - - for (j = 0x80; j < 0xA0; j++) - if (BASE_LEADING_CODE_P (j)) - fastmap[j] = 1; - match_any_multibyte_characters = true; - } - else - simple_char_max = (1 << BYTEWIDTH); + match_any_multibyte_characters = true; + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) fastmap[j] = 1; @@ -3432,7 +3679,7 @@ re_compile_fastmap (bufp) case categoryspec: k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (CHAR_HAS_CATEGORY (j, k)) fastmap[j] = 1; @@ -3446,7 +3693,7 @@ re_compile_fastmap (bufp) case notcategoryspec: k = *p++; - simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH); + simple_char_max = (1 << BYTEWIDTH); for (j = 0; j < simple_char_max; j++) if (!CHAR_HAS_CATEGORY (j, k)) fastmap[j] = 1; @@ -3709,13 +3956,13 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) range = total_size - startpos; /* If the search isn't to be a backwards one, don't waste time in a - search for a pattern that must be anchored. */ + search for a pattern anchored at beginning of buffer. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) { if (startpos > 0) return -1; else - range = 1; + range = 0; } #ifdef emacs @@ -3723,8 +3970,8 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) don't keep searching past point. */ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) { - range = PT - startpos; - if (range <= 0) + range = PT_BYTE - BEGV_BYTE - startpos; + if (range < 0) return -1; } #endif /* emacs */ @@ -3741,8 +3988,8 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) #ifdef emacs gl_state.object = re_match_object; { - int charpos - = SYNTAX_TABLE_BYTE_TO_CHAR (startpos > 0 ? startpos : startpos + 1); + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); + int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos); SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); } @@ -3785,7 +4032,7 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) /* Written out as an if-else to avoid testing `translate' inside the loop. */ - if (translate) + if (RE_TRANSLATE_P (translate)) { if (multibyte) while (range > lim) @@ -3806,12 +4053,18 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) else while (range > lim && !fastmap[(unsigned char) - RE_TRANSLATE (translate, (unsigned char) *d++)]) - range--; + RE_TRANSLATE (translate, (unsigned char) *d)]) + { + d++; + range--; + } } else - while (range > lim && !fastmap[(unsigned char) *d++]) - range--; + while (range > lim && !fastmap[(unsigned char) *d]) + { + d++; + range--; + } startpos += irange - range; } @@ -3822,7 +4075,7 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) : size1 - startpos); buf_ch = STRING_CHAR (d, room); - if (translate) + if (RE_TRANSLATE_P (translate)) buf_ch = RE_TRANSLATE (translate, buf_ch); if (! (buf_ch >= 0400 @@ -4056,8 +4309,9 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) #ifdef emacs int charpos; + int adjpos = NILP (re_match_object) || BUFFERP (re_match_object); gl_state.object = re_match_object; - charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos)); + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos); SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1); #endif @@ -4498,7 +4752,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ - if (translate) + if (RE_TRANSLATE_P (translate)) { #ifdef emacs if (multibyte) @@ -4525,9 +4779,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) do { PREFETCH (); - if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++) + if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d) != (unsigned char) *p++) goto fail; + d++; } while (--mcnt); } @@ -4590,26 +4845,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) range table. */ unsigned char *range_table; - /* Nonzero if there is range table. */ + /* Nonzero if there is a range table. */ int range_table_exists; - /* Number of ranges of range table. Not in bytes. */ - int count; + /* Number of ranges of range table. This is not included + in the initial byte-length of the command. */ + int count = 0; DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); PREFETCH (); c = (unsigned char) *d; - range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); + +#ifdef emacs if (range_table_exists) - EXTRACT_NUMBER_AND_INCR (count, range_table); - else - count = 0; + { + range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ + EXTRACT_NUMBER_AND_INCR (count, range_table); + } if (multibyte && BASE_LEADING_CODE_P (c)) c = STRING_CHAR_AND_LENGTH (d, dend - d, len); +#endif /* emacs */ if (SINGLE_BYTE_CHAR_P (c)) { /* Lookup bitmap. */ @@ -4619,13 +4878,37 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) - && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) - not = !not; + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; } +#ifdef emacs else if (range_table_exists) - CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); + { + int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); + + if ( (class_bits & BIT_ALNUM && ISALNUM (c)) + | (class_bits & BIT_ALPHA && ISALPHA (c)) + | (class_bits & BIT_ASCII && IS_REAL_ASCII (c)) + | (class_bits & BIT_GRAPH && ISGRAPH (c)) + | (class_bits & BIT_LOWER && ISLOWER (c)) + | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c)) + | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c)) + | (class_bits & BIT_PRINT && ISPRINT (c)) + | (class_bits & BIT_PUNCT && ISPUNCT (c)) + | (class_bits & BIT_SPACE && ISSPACE (c)) + | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c)) + | (class_bits & BIT_UPPER && ISUPPER (c)) + | (class_bits & BIT_WORD && ISWORD (c))) + not = !not; + else + CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); + } +#endif /* emacs */ - p = CHARSET_RANGE_TABLE_END (range_table, count); + if (range_table_exists) + p = CHARSET_RANGE_TABLE_END (range_table, count); + else + p += CHARSET_BITMAP_SIZE (&p[-1]) + 1; if (!not) goto fail; @@ -4873,7 +5156,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ - if (translate + if (RE_TRANSLATE_P (translate) ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt)) goto fail; @@ -4980,6 +5263,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) on_failure: DEBUG_PRINT1 ("EXECUTING on_failure_jump"); +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif + EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); @@ -5020,6 +5307,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* A smart repeat ends with `maybe_pop_jump'. We change it to either `pop_failure_jump' or `jump'. */ case maybe_pop_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { @@ -5240,6 +5530,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ @@ -5350,7 +5643,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2); #ifdef emacs - charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 ? pos1 : 1); + charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1); UPDATE_SYNTAX_TABLE (charpos); #endif s1 = SYNTAX (c1); @@ -5643,6 +5936,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We goto here if a matching operation fails. */ fail: +#if defined (WINDOWSNT) && defined (emacs) + QUIT; +#endif if (!FAIL_STACK_EMPTY ()) { /* A restart point is known. Restore to that state. */ DEBUG_PRINT1 ("\nFAIL:\n");