0.12. (Implements POSIX draft P10003.2/D11.2, except for
internationalization features.)
- Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#undef _GNU_SOURCE
#define _GNU_SOURCE
+#ifdef emacs
/* Converts the pointer to the char to BEG-based offset from the start. */
#define PTR_TO_OFFSET(d) \
POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \
? (d) - string1 : (d) - (string2 - size1))
-#define POS_AS_IN_BUFFER(p) ((p) + 1)
+#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
+#else
+#define PTR_TO_OFFSET(d) 0
+#endif
#ifdef HAVE_CONFIG_H
#include <config.h>
#include "category.h"
#define malloc xmalloc
+#define realloc xrealloc
#define free xfree
#else /* not emacs */
#define SYNTAX(c) re_syntax_table[c]
-/* Dummy macro for non emacs environments. */
+/* Dummy macros for non-Emacs environments. */
#define BASE_LEADING_CODE_P(c) (0)
#define WORD_BOUNDARY_P(c1, c2) (0)
#define CHAR_HEAD_P(p) (1)
/* isalpha etc. are used for the character classes. */
#include <ctype.h>
+#ifdef emacs
+
+/* 1 if C is an ASCII character. */
+#define IS_REAL_ASCII(c) ((c) < 0200)
+
+/* 1 if C is a unibyte character. */
+#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
+
+/* The Emacs definitions should not be directly affected by locales. */
+
+/* In Emacs, these are only used for single-byte characters. */
+#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
+#define ISCNTRL(c) ((c) < ' ')
+#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
+ || ((c) >= 'a' && (c) <= 'f') \
+ || ((c) >= 'A' && (c) <= 'F'))
+
+/* This is only used for single-byte characters. */
+#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
+
+/* The rest must handle multibyte characters. */
+
+#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
+ ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
+ : 1)
+
+#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
+ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
+ : 1)
+
+#define ISALNUM(c) (IS_REAL_ASCII (c) \
+ ? (((c) >= 'a' && (c) <= 'z') \
+ || ((c) >= 'A' && (c) <= 'Z') \
+ || ((c) >= '0' && (c) <= '9')) \
+ : SYNTAX (c) == Sword)
+
+#define ISALPHA(c) (IS_REAL_ASCII (c) \
+ ? (((c) >= 'a' && (c) <= 'z') \
+ || ((c) >= 'A' && (c) <= 'Z')) \
+ : SYNTAX (c) == Sword)
+
+#define ISLOWER(c) (LOWERCASEP (c))
+
+#define ISPUNCT(c) (IS_REAL_ASCII (c) \
+ ? ((c) > ' ' && (c) < 0177 \
+ && !(((c) >= 'a' && (c) <= 'z') \
+ || ((c) >= 'A' && (c) <= 'Z') \
+ || ((c) >= '0' && (c) <= '9'))) \
+ : SYNTAX (c) != Sword)
+
+#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
+
+#define ISUPPER(c) (UPPERCASEP (c))
+
+#define ISWORD(c) (SYNTAX (c) == Sword)
+
+#else /* not emacs */
+
/* Jim Meyering writes:
"... Some ctype macros are valid only for character codes that
#define ISASCII(c) isascii(c)
#endif
+/* 1 if C is an ASCII character. */
+#define IS_REAL_ASCII(c) ((c) < 0200)
+
+/* This distinction is not meaningful, except in Emacs. */
+#define ISUNIBYTE(c) 1
+
+#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
+#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
+#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
+
#ifdef isblank
#define ISBLANK(c) (ISASCII (c) && isblank (c))
#else
#define ISUPPER(c) (ISASCII (c) && isupper (c))
#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
+#define ISWORD(c) ISALPHA(c)
+
+#endif /* not emacs */
+\f
#ifndef NULL
#define NULL (void *)0
#endif
for a bitmap saying which chars are in. Bits in each byte
are ordered low-bit-first. A character is in the set if its
bit is 1. A character too large to have a bit in the map is
- automatically not in the set. */
+ automatically not in the set.
+
+ If the length byte has the 0x80 bit set, then that stuff
+ is followed by a range table:
+ 2 bytes of flags for character sets (low 8 bits, high 8 bits)
+ See RANGE_TABLE_WORK_BITS below.
+ 2 bytes, the number of pairs that follow
+ pairs, each 2 multibyte characters,
+ each multibyte character represented as 3 bytes. */
charset,
/* Same parameters as charset, but match any character that is
/* Return the address of range table of charset P. But not the start
of table itself, but the before where the number of ranges is
- stored. `2 +' means to skip re_opcode_t and size of bitmap. */
-#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)])
+ stored. `4 +' means to skip re_opcode_t and size of bitmap,
+ and the 2 bytes of flags at the start of the range table. */
+#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
+
+/* Extract the bit flags that start a range table. */
+#define CHARSET_RANGE_TABLE_BITS(p) \
+ ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
+ + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
/* Test if C is listed in the bitmap of charset P. */
#define CHARSET_LOOKUP_BITMAP(p, c) \
{
register int c, last = -100;
register int in_range = 0;
+ int length = *p & 0x7f;
+ int has_range_table = *p & 0x80;
+ int range_length = p[length + 2] + p[length + 3] * 0x100;
printf ("/charset [%s",
(re_opcode_t) *(p - 1) == charset_not ? "^" : "");
assert (p + *p < pend);
for (c = 0; c < 256; c++)
- if (c / 8 < *p
+ if (c / 8 < length
&& (p[1 + (c/8)] & (1 << (c % 8))))
{
/* Are we starting a range? */
}
/* Have we broken a range? */
else if (last + 1 != c && in_range)
- {
+ {
putchar (last);
in_range = 0;
}
last = c;
}
+ p += 1 + length;
+
if (in_range)
putchar (last);
putchar (']');
- p += 1 + *p;
+ if (has_range_table)
+ printf ("has-range-table");
+
+ /* ??? Should print the range table; for now,
+ just skip it. */
+ if (has_range_table)
+ p += 4 + 6 * range_length;
}
break;
#define FAIL_STACK_GROWTH_FACTOR 4
#define GROW_FAIL_STACK(fail_stack) \
- ((fail_stack).size >= re_max_failures * TYPICAL_FAILURE_SIZE \
+ (((fail_stack).size * sizeof (fail_stack_elt_t) \
+ >= re_max_failures * TYPICAL_FAILURE_SIZE) \
? 0 \
: ((fail_stack).stack \
= (fail_stack_elt_t *) \
\
(fail_stack).stack == NULL \
? 0 \
- : (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
- ((fail_stack).size * sizeof (fail_stack_elt_t) \
- * FAIL_STACK_GROWTH_FACTOR)), \
+ : ((fail_stack).size \
+ = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
+ ((fail_stack).size * sizeof (fail_stack_elt_t) \
+ * FAIL_STACK_GROWTH_FACTOR)) \
+ / sizeof (fail_stack_elt_t)), \
1)))
\
assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
\
- DEBUG_POP (&failure_id); \
- DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
+ DEBUG_POP (&failure_id.integer); \
+ DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id.integer); \
\
/* If the saved string location is NULL, it came from an \
on_failure_keep_string_jump opcode, and we want to throw away the \
#define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
- if (translate) c = (unsigned char) translate[c]; \
+ if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
} while (0)
#endif
when we use a character as a subscript we must make it unsigned. */
#ifndef TRANSLATE
#define TRANSLATE(d) \
- (translate ? (unsigned char) RE_TRANSLATE (translate, (unsigned char) (d)) : (d))
+ (RE_TRANSLATE_P (translate) \
+ ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
#endif
int *table; /* actual work area. */
int allocated; /* allocated size for work area in bytes. */
int used; /* actually used size in words. */
+ int bits; /* flag to record character classes */
};
/* Make sure that WORK_AREA can hold more N multibyte characters. */
} \
} while (0)
+#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
+ (work_area).bits |= (bit)
+
+/* These bits represent the various character classes such as [:alnum:]
+ in a charset's range table. */
+#define BIT_ALNUM 0x1
+#define BIT_ALPHA 0x2
+#define BIT_WORD 0x4
+#define BIT_ASCII 0x8
+#define BIT_NONASCII 0x10
+#define BIT_GRAPH 0x20
+#define BIT_LOWER 0x40
+#define BIT_PRINT 0x80
+#define BIT_PUNCT 0x100
+#define BIT_SPACE 0x200
+#define BIT_UPPER 0x400
+#define BIT_UNIBYTE 0x800
+#define BIT_MULTIBYTE 0x1000
+
/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
do { \
free ((work_area).table); \
} while (0)
-#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0)
+#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
+#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
|| STREQ (string, "alnum") || STREQ (string, "xdigit") \
|| STREQ (string, "space") || STREQ (string, "print") \
|| STREQ (string, "punct") || STREQ (string, "graph") \
- || STREQ (string, "cntrl") || STREQ (string, "blank"))
+ || STREQ (string, "cntrl") || STREQ (string, "blank") \
+ || STREQ (string, "word") \
+ || STREQ (string, "ascii") || STREQ (string, "nonascii") \
+ || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
\f
#ifndef MATCH_MAY_ALLOCATE
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
+#ifdef AIX
+ /* `const' makes AIX compiler fail. */
+ char *p = pattern;
+#else
const char *p = pattern;
+#endif
const char *pend = pattern + size;
/* How to translate the characters in the pattern. */
/* 1 means zero (many) matches is allowed. */
char zero_times_ok = 0, many_times_ok = 0;
+ char greedy = 1;
/* If there is a sequence of repetition chars, collapse it
down to just one (the right one). We can't combine
for (;;)
{
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
+ if (!(syntax & RE_ALL_GREEDY)
+ && c == '?' && (zero_times_ok || many_times_ok))
+ greedy = 0;
+ else
+ {
+ zero_times_ok |= c != '+';
+ many_times_ok |= c != '?';
+ }
if (p == pend)
break;
/* Now we know whether or not zero matches is allowed
and also whether or not two or more matches is allowed. */
+ if (greedy)
+ {
if (many_times_ok)
{ /* More than one repetition is allowed, so put in at the
end a backward relative jump from `b' to before the next
incremented `p', by the way, to be the character after
the `*'. Do we have to do something analogous here
for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
+ if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
&& zero_times_ok
- && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
+ && p < pend
+ && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
&& !(syntax & RE_DOT_NEWLINE))
{ /* We have .*\n. */
STORE_JUMP (jump, b, laststart);
INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
b += 3;
}
- }
+
+ }
+ else /* not greedy */
+ { /* I wish the greedy and non-greedy cases could be merged. */
+
+ if (many_times_ok)
+ {
+ /* The non-greedy multiple match looks like a repeat..until:
+ we only need a conditional jump at the end of the loop. */
+ GET_BUFFER_SPACE (3);
+ STORE_JUMP (on_failure_jump, b, laststart);
+ b += 3;
+ if (zero_times_ok)
+ {
+ /* The repeat...until naturally matches one or more.
+ To also match zero times, we need to first jump to
+ the end of the loop (its conditional jump). */
+ GET_BUFFER_SPACE (3);
+ INSERT_JUMP (jump, laststart, b);
+ b += 3;
+ }
+ }
+ else
+ {
+ /* non-greedy a?? */
+ GET_BUFFER_SPACE (6);
+ INSERT_JUMP (jump, laststart, b + 3);
+ b += 3;
+ INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
+ b += 3;
+ }
+ }
+ }
break;
int ch;
boolean is_alnum = STREQ (str, "alnum");
boolean is_alpha = STREQ (str, "alpha");
+ boolean is_ascii = STREQ (str, "ascii");
boolean is_blank = STREQ (str, "blank");
boolean is_cntrl = STREQ (str, "cntrl");
boolean is_digit = STREQ (str, "digit");
boolean is_graph = STREQ (str, "graph");
boolean is_lower = STREQ (str, "lower");
+ boolean is_multibyte = STREQ (str, "multibyte");
+ boolean is_nonascii = STREQ (str, "nonascii");
boolean is_print = STREQ (str, "print");
boolean is_punct = STREQ (str, "punct");
boolean is_space = STREQ (str, "space");
+ boolean is_unibyte = STREQ (str, "unibyte");
boolean is_upper = STREQ (str, "upper");
+ boolean is_word = STREQ (str, "word");
boolean is_xdigit = STREQ (str, "xdigit");
if (!IS_CHAR_CLASS (str))
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+ /* Most character classes in a multibyte match
+ just set a flag. Exceptions are is_blank,
+ is_digit, is_cntrl, and is_xdigit, since
+ they can only match ASCII characters. We
+ don't need to handle them for multibyte. */
+
+ if (bufp->multibyte)
+ {
+ int bit = 0;
+
+ if (is_alnum) bit = BIT_ALNUM;
+ if (is_alpha) bit = BIT_ALPHA;
+ if (is_ascii) bit = BIT_ASCII;
+ if (is_graph) bit = BIT_GRAPH;
+ if (is_lower) bit = BIT_LOWER;
+ if (is_multibyte) bit = BIT_MULTIBYTE;
+ if (is_nonascii) bit = BIT_NONASCII;
+ if (is_print) bit = BIT_PRINT;
+ if (is_punct) bit = BIT_PUNCT;
+ if (is_space) bit = BIT_SPACE;
+ if (is_unibyte) bit = BIT_UNIBYTE;
+ if (is_upper) bit = BIT_UPPER;
+ if (is_word) bit = BIT_WORD;
+ if (bit)
+ SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
+ bit);
+ }
+
+ /* Handle character classes for ASCII characters. */
for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
{
int translated = TRANSLATE (ch);
|| (is_upper && ISUPPER (ch))
|| (is_xdigit && ISXDIGIT (ch)))
SET_LIST_BIT (translated);
+ if ( (is_ascii && IS_REAL_ASCII (ch))
+ || (is_nonascii && !IS_REAL_ASCII (ch))
+ || (is_unibyte && ISUNIBYTE (ch))
+ || (is_multibyte && !ISUNIBYTE (ch)))
+ SET_LIST_BIT (translated);
+
+ if ( (is_word && ISWORD (ch)))
+ SET_LIST_BIT (translated);
}
/* Repeat the loop. */
p += len;
}
- if (!SAME_CHARSET_P (c, c1))
+ if (SINGLE_BYTE_CHAR_P (c)
+ && ! SINGLE_BYTE_CHAR_P (c1))
+ {
+ /* Handle a range such as \177-\377 in multibyte mode.
+ Split that into two ranges,
+ the low one ending at 0237, and the high one
+ starting at ...040. */
+ int c1_base = (c1 & ~0177) | 040;
+ SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
+ c1 = 0237;
+ }
+ else if (!SAME_CHARSET_P (c, c1))
FREE_STACK_RETURN (REG_ERANGE);
}
else
for (this_char = range_start; this_char <= range_end;
this_char++)
SET_LIST_BIT (TRANSLATE (this_char));
+ }
}
- }
else
/* ... into range table. */
SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
b[-1]--;
b += b[-1];
- /* Build real range table from work area. */
- if (RANGE_TABLE_WORK_USED (range_table_work))
+ /* Build real range table from work area. */
+ if (RANGE_TABLE_WORK_USED (range_table_work)
+ || RANGE_TABLE_WORK_BITS (range_table_work))
{
int i;
int used = RANGE_TABLE_WORK_USED (range_table_work);
/* Allocate space for COUNT + RANGE_TABLE. Needs two
- bytes for COUNT and three bytes for each character. */
- GET_BUFFER_SPACE (2 + used * 3);
+ bytes for flags, two for COUNT, and three bytes for
+ each character. */
+ GET_BUFFER_SPACE (4 + used * 3);
/* Indicate the existence of range table. */
laststart[1] |= 0x80;
+ /* Store the character class flag bits into the range table.
+ If not in emacs, these flag bits are always 0. */
+ *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
+ *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
+
STORE_NUMBER_AND_INCR (b, used / 2);
for (i = 0; i < used; i++)
STORE_CHARACTER_AND_INCR
p1 = p - 1; /* P1 points the head of C. */
#ifdef emacs
if (bufp->multibyte)
- /* Set P to the next character boundary. */
- p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
+ {
+ c = STRING_CHAR (p1, pend - p1);
+ c = TRANSLATE (c);
+ /* Set P to the next character boundary. */
+ p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
+ }
#endif
/* If no exactn currently being built. */
if (!pending_exact
|| *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
/* If followed by a repetition operator. */
- || *p == '*' || *p == '^'
+ || (p != pend && (*p == '*' || *p == '^'))
|| ((syntax & RE_BK_PLUS_QM)
- ? *p == '\\' && (p[1] == '+' || p[1] == '?')
- : (*p == '+' || *p == '?'))
+ ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
+ : p != pend && (*p == '+' || *p == '?'))
|| ((syntax & RE_INTERVALS)
&& ((syntax & RE_NO_BK_BRACES)
- ? *p == '{'
- : (p[0] == '\\' && p[1] == '{'))))
+ ? p != pend && *p == '{'
+ : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
{
/* Start building a new exactn. */
pending_exact = b - 1;
}
- /* Here, C may translated, therefore C may not equal to *P1. */
- while (1)
+#ifdef emacs
+ if (! SINGLE_BYTE_CHAR_P (c))
{
- BUF_PUSH (c);
- (*pending_exact)++;
- if (++p1 == p)
- break;
-
- /* Rest of multibyte form should be copied literally. */
- c = *(unsigned char *)p1;
+ unsigned char str[MAX_MULTIBYTE_LENGTH];
+ int i = CHAR_STRING (c, str);
+ int j;
+ for (j = 0; j < i; j++)
+ {
+ BUF_PUSH (str[j]);
+ (*pending_exact)++;
+ }
+ }
+ else
+#endif
+ {
+ BUF_PUSH (c);
+ (*pending_exact)++;
}
break;
} /* switch (c) */
if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
{
- fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE);
+ fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
#ifdef emacs
if (! fail_stack.stack)
return false;
}
-
-
-/* Read the ending character of a range (in a bracket expression) from the
- uncompiled pattern *P_PTR (which ends at PEND). We assume the
- starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
- Then we set the translation of all bits between the starting and
- ending characters (inclusive) in the compiled pattern B.
-
- Return an error code.
-
- We use these short variable names so we can use the same macros as
- `regex_compile' itself. */
-
-static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
-{
- unsigned this_char;
-
- const char *p = *p_ptr;
- int range_start, range_end;
-
- if (p == pend)
- return REG_ERANGE;
-
- /* Even though the pattern is a signed `char *', we need to fetch
- with unsigned char *'s; if the high bit of the pattern character
- is set, the range endpoints will be negative if we fetch using a
- signed char *.
-
- We also want to fetch the endpoints without translating them; the
- appropriate translation is done in the bit-setting loop below. */
- /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
- range_start = ((const unsigned char *) p)[-2];
- range_end = ((const unsigned char *) p)[0];
-
- /* Have to increment the pointer into the pattern string, so the
- caller isn't still at the ending character. */
- (*p_ptr)++;
-
- /* If the start is after the end, the range is empty. */
- if (range_start > range_end)
- return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-
- /* Here we see why `this_char' has to be larger than an `unsigned
- char' -- the range is inclusive, so if `range_end' == 0xff
- (assuming 8-bit characters), we would otherwise go into an infinite
- loop, since all characters <= 0xff. */
- for (this_char = range_start; this_char <= range_end; this_char++)
- {
- SET_LIST_BIT (TRANSLATE (this_char));
- }
-
- return REG_NOERROR;
-}
\f
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
characters can start a string that matches the pattern. This fastmap
is used by re_search to skip quickly over impossible starting points.
+ Character codes above (1 << BYTEWIDTH) are not represented in the
+ fastmap, but the leading codes are represented. Thus, the fastmap
+ indicates which character sets could start a match.
+
The caller must supply the address of a (1 << BYTEWIDTH)-byte data
area as BUFP->fastmap.
#ifndef emacs
case charset:
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
- if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- fastmap[j] = 1;
- break;
+ {
+ int length = (*p & 0x7f);;
+ p++;
+ for (j = length * BYTEWIDTH - 1; j >= 0; j--)
+ if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
+ fastmap[j] = 1;
+ }
+ break;
case charset_not:
/* Chars beyond end of map must be allowed. */
- for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
+ {
+ int length = (*p & 0x7f);;
+ p++;
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
- if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
+ for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
fastmap[j] = 1;
- break;
+ for (j = length * BYTEWIDTH - 1; j >= 0; j--)
+ if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
+ fastmap[j] = 1;
+ }
+ break;
case wordchar:
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
fastmap[j] = 1;
+ /* If we can match a character class, we can match
+ any character set. */
+ if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
+ goto set_fastmap_for_multibyte_characters;
+
if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& match_any_multibyte_characters == false)
{
multibyte character in the range table. */
int c, count;
- /* Make P points the range table. */
- p += CHARSET_BITMAP_SIZE (&p[-2]);
+ /* Make P point to the range table. `+ 2' is to skip the flag
+ bits for a character class. */
+ p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
- /* Extract the number of ranges in range table into
- COUNT. */
+ /* Extract the number of ranges in range table into COUNT. */
EXTRACT_NUMBER_AND_INCR (count, p);
for (; count > 0; count--, p += 2 * 3) /* XXX */
{
case charset_not:
- /* Chars beyond end of map must be allowed. End of map is
- `127' if bufp->multibyte is nonzero. */
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ /* Chars beyond end of bitmap are possible matches.
+ All the single-byte codes can occur in multibyte buffers.
+ So any that are not listed in the charset
+ are possible matches, even in multibyte buffers. */
+ simple_char_max = (1 << BYTEWIDTH);
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
j < simple_char_max; j++)
fastmap[j] = 1;
case wordchar:
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ /* All the single-byte codes can occur in multibyte buffers,
+ and they may have word syntax. So do consider them. */
+ simple_char_max = (1 << BYTEWIDTH);
for (j = 0; j < simple_char_max; j++)
if (SYNTAX (j) == Sword)
fastmap[j] = 1;
case notwordchar:
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ /* All the single-byte codes can occur in multibyte buffers,
+ and they may not have word syntax. So do consider them. */
+ simple_char_max = (1 << BYTEWIDTH);
for (j = 0; j < simple_char_max; j++)
if (SYNTAX (j) != Sword)
fastmap[j] = 1;
{
int fastmap_newline = fastmap['\n'];
- /* `.' matches anything (but if bufp->multibyte is
- nonzero, matches `\000' .. `\127' and possible multibyte
- character) ... */
+ /* `.' matches anything, except perhaps newline.
+ Even in a multibyte buffer, it should match any
+ conceivable byte value for the fastmap. */
if (bufp->multibyte)
- {
- simple_char_max = 0x80;
-
- for (j = 0x80; j < 0xA0; j++)
- if (BASE_LEADING_CODE_P (j))
- fastmap[j] = 1;
- match_any_multibyte_characters = true;
- }
- else
- simple_char_max = (1 << BYTEWIDTH);
+ match_any_multibyte_characters = true;
+ simple_char_max = (1 << BYTEWIDTH);
for (j = 0; j < simple_char_max; j++)
fastmap[j] = 1;
case categoryspec:
k = *p++;
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ simple_char_max = (1 << BYTEWIDTH);
for (j = 0; j < simple_char_max; j++)
if (CHAR_HAS_CATEGORY (j, k))
fastmap[j] = 1;
case notcategoryspec:
k = *p++;
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ simple_char_max = (1 << BYTEWIDTH);
for (j = 0; j < simple_char_max; j++)
if (!CHAR_HAS_CATEGORY (j, k))
fastmap[j] = 1;
range = total_size - startpos;
/* If the search isn't to be a backwards one, don't waste time in a
- search for a pattern that must be anchored. */
+ search for a pattern anchored at beginning of buffer. */
if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
{
if (startpos > 0)
return -1;
else
- range = 1;
+ range = 0;
}
#ifdef emacs
don't keep searching past point. */
if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
{
- range = PT - startpos;
- if (range <= 0)
+ range = PT_BYTE - BEGV_BYTE - startpos;
+ if (range < 0)
return -1;
}
#endif /* emacs */
anchored_start = 1;
#ifdef emacs
- SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object,
- POS_AS_IN_BUFFER (startpos > 0
- ? startpos - 1 : startpos),
- 1);
+ gl_state.object = re_match_object;
+ {
+ int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos);
+
+ SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
+ }
#endif
/* Loop through the string, looking for a place to start matching. */
the first null string. */
if (fastmap && startpos < total_size && !bufp->can_be_null)
{
+ register const char *d;
+ register unsigned int buf_ch;
+
+ d = POS_ADDR_VSTRING (startpos);
+
if (range > 0) /* Searching forwards. */
{
- register const char *d;
register int lim = 0;
int irange = range;
if (startpos < size1 && startpos + range >= size1)
lim = range - (size1 - startpos);
- d = POS_ADDR_VSTRING (startpos);
-
/* Written out as an if-else to avoid testing `translate'
inside the loop. */
- if (translate)
- while (range > lim
- && !fastmap[(unsigned char)
- RE_TRANSLATE (translate, (unsigned char) *d++)])
- range--;
+ if (RE_TRANSLATE_P (translate))
+ {
+ if (multibyte)
+ while (range > lim)
+ {
+ int buf_charlen;
+
+ buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+ buf_charlen);
+
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
+ if (buf_ch >= 0400
+ || fastmap[buf_ch])
+ break;
+
+ range -= buf_charlen;
+ d += buf_charlen;
+ }
+ else
+ while (range > lim
+ && !fastmap[(unsigned char)
+ RE_TRANSLATE (translate, (unsigned char) *d)])
+ {
+ d++;
+ range--;
+ }
+ }
else
- while (range > lim && !fastmap[(unsigned char) *d++])
- range--;
+ while (range > lim && !fastmap[(unsigned char) *d])
+ {
+ d++;
+ range--;
+ }
startpos += irange - range;
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ int room = (size1 == 0 || startpos >= size1
+ ? size2 + size1 - startpos
+ : size1 - startpos);
- if (!fastmap[(unsigned char) TRANSLATE (c)])
+ buf_ch = STRING_CHAR (d, room);
+ if (RE_TRANSLATE_P (translate))
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
+
+ if (! (buf_ch >= 0400
+ || fastmap[buf_ch]))
goto advance;
}
}
int len = 0;
/* Find the head of multibyte form. */
- while (!CHAR_HEAD_P (p))
+ while (!CHAR_HEAD_P (*p))
p--, len++;
/* Adjust it. */
int result;
#ifdef emacs
- SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object,
- POS_AS_IN_BUFFER (pos > 0 ? pos - 1 : pos),
- 1);
+ int charpos;
+ int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
+ gl_state.object = re_match_object;
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos);
+ SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
#endif
result = re_match_2_internal (bufp, string1, size1, string2, size2,
- pos, regs, stop);
+ pos, regs, stop);
alloca (0);
return result;
}
/* This is written out as an if-else so we don't waste time
testing `translate' inside the loop. */
- if (translate)
+ if (RE_TRANSLATE_P (translate))
{
- do
- {
- PREFETCH ();
- if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++)
- != (unsigned char) *p++)
- goto fail;
- }
- while (--mcnt);
+#ifdef emacs
+ if (multibyte)
+ do
+ {
+ int pat_charlen, buf_charlen;
+ unsigned int pat_ch, buf_ch;
+
+ PREFETCH ();
+ pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+
+ if (RE_TRANSLATE (translate, buf_ch)
+ != pat_ch)
+ goto fail;
+
+ p += pat_charlen;
+ d += buf_charlen;
+ mcnt -= pat_charlen;
+ }
+ while (mcnt > 0);
+ else
+#endif /* emacs */
+ do
+ {
+ PREFETCH ();
+ if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d)
+ != (unsigned char) *p++)
+ goto fail;
+ d++;
+ }
+ while (--mcnt);
}
else
{
/* Match any character except possibly a newline or a null. */
case anychar:
- DEBUG_PRINT1 ("EXECUTING anychar.\n");
+ {
+ int buf_charlen;
+ unsigned int buf_ch;
- PREFETCH ();
+ DEBUG_PRINT1 ("EXECUTING anychar.\n");
- if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
- || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
- goto fail;
+ PREFETCH ();
- SET_REGS_MATCHED ();
- DEBUG_PRINT2 (" Matched `%d'.\n", *d);
- d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1;
+#ifdef emacs
+ if (multibyte)
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+ else
+#endif /* emacs */
+ {
+ buf_ch = (unsigned char) *d;
+ buf_charlen = 1;
+ }
+
+ buf_ch = TRANSLATE (buf_ch);
+
+ if ((!(bufp->syntax & RE_DOT_NEWLINE)
+ && buf_ch == '\n')
+ || ((bufp->syntax & RE_DOT_NOT_NULL)
+ && buf_ch == '\000'))
+ goto fail;
+
+ SET_REGS_MATCHED ();
+ DEBUG_PRINT2 (" Matched `%d'.\n", *d);
+ d += buf_charlen;
+ }
break;
range table. */
unsigned char *range_table;
- /* Nonzero if there is range table. */
+ /* Nonzero if there is a range table. */
int range_table_exists;
- /* Number of ranges of range table. Not in bytes. */
- int count;
+ /* Number of ranges of range table. This is not included
+ in the initial byte-length of the command. */
+ int count = 0;
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
PREFETCH ();
c = (unsigned char) *d;
- range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
+
+#ifdef emacs
if (range_table_exists)
- EXTRACT_NUMBER_AND_INCR (count, range_table);
- else
- count = 0;
+ {
+ range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
+ EXTRACT_NUMBER_AND_INCR (count, range_table);
+ }
if (multibyte && BASE_LEADING_CODE_P (c))
c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
+#endif /* emacs */
if (SINGLE_BYTE_CHAR_P (c))
{ /* Lookup bitmap. */
/* Cast to `unsigned' instead of `unsigned char' in
case the bit list is a full 32 bytes long. */
if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
- && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
+ && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
+ not = !not;
}
+#ifdef emacs
else if (range_table_exists)
- CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
+ {
+ int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
+
+ if ( (class_bits & BIT_ALNUM && ISALNUM (c))
+ | (class_bits & BIT_ALPHA && ISALPHA (c))
+ | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
+ | (class_bits & BIT_GRAPH && ISGRAPH (c))
+ | (class_bits & BIT_LOWER && ISLOWER (c))
+ | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
+ | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
+ | (class_bits & BIT_PRINT && ISPRINT (c))
+ | (class_bits & BIT_PUNCT && ISPUNCT (c))
+ | (class_bits & BIT_SPACE && ISSPACE (c))
+ | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
+ | (class_bits & BIT_UPPER && ISUPPER (c))
+ | (class_bits & BIT_WORD && ISWORD (c)))
+ not = !not;
+ else
+ CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
+ }
+#endif /* emacs */
- p = CHARSET_RANGE_TABLE_END (range_table, count);
+ if (range_table_exists)
+ p = CHARSET_RANGE_TABLE_END (range_table, count);
+ else
+ p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
if (!not) goto fail;
/* Compare that many; failure if mismatch, else move
past them. */
- if (translate
+ if (RE_TRANSLATE_P (translate)
? bcmp_translate (d, d2, mcnt, translate)
: bcmp (d, d2, mcnt))
goto fail;
on_failure:
DEBUG_PRINT1 ("EXECUTING on_failure_jump");
+#if defined (WINDOWSNT) && defined (emacs)
+ QUIT;
+#endif
+
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
/* A smart repeat ends with `maybe_pop_jump'.
We change it to either `pop_failure_jump' or `jump'. */
case maybe_pop_jump:
+#if defined (WINDOWSNT) && defined (emacs)
+ QUIT;
+#endif
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
{
/* Unconditionally jump (without popping any failure points). */
case jump:
unconditional_jump:
+#if defined (WINDOWSNT) && defined (emacs)
+ QUIT;
+#endif
EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
p += mcnt; /* Do the jump. */
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
int pos1 = PTR_TO_OFFSET (d - 1);
+ int charpos;
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
#ifdef emacs
- UPDATE_SYNTAX_TABLE (pos1 ? pos1 : 1);
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
#endif
s1 = SYNTAX (c1);
#ifdef emacs
- UPDATE_SYNTAX_TABLE_FORWARD (pos1 + 1);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
s2 = SYNTAX (c2);
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
int pos1 = PTR_TO_OFFSET (d - 1);
+ int charpos;
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
#ifdef emacs
- UPDATE_SYNTAX_TABLE (pos1);
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
#endif
s1 = SYNTAX (c1);
#ifdef emacs
- UPDATE_SYNTAX_TABLE_FORWARD (pos1 + 1);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
s2 = SYNTAX (c2);
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
int pos1 = PTR_TO_OFFSET (d);
+ int charpos;
GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
#ifdef emacs
- UPDATE_SYNTAX_TABLE (pos1);
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
#endif
s2 = SYNTAX (c2);
{
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
#ifdef emacs
- UPDATE_SYNTAX_TABLE_BACKWARD (pos1 - 1);
+ UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
#endif
s1 = SYNTAX (c1);
/* C1 is the character before D, S1 is the syntax of C1, C2
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
+ int pos1 = PTR_TO_OFFSET (d);
+ int charpos;
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+#ifdef emacs
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1);
+ UPDATE_SYNTAX_TABLE (charpos);
+#endif
s1 = SYNTAX (c1);
/* Case 2: S1 is not Sword. */
if (!AT_STRINGS_END (d))
{
GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
+#ifdef emacs
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+#endif
s2 = SYNTAX (c2);
/* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
#ifdef emacs
case before_dot:
DEBUG_PRINT1 ("EXECUTING before_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) >= PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE)
goto fail;
break;
case at_dot:
DEBUG_PRINT1 ("EXECUTING at_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) != PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE)
goto fail;
break;
case after_dot:
DEBUG_PRINT1 ("EXECUTING after_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) <= PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE)
goto fail;
break;
PREFETCH ();
#ifdef emacs
{
- int pos1 = PTR_TO_OFFSET (d);
+ int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
UPDATE_SYNTAX_TABLE (pos1);
}
#endif
PREFETCH ();
#ifdef emacs
{
- int pos1 = PTR_TO_OFFSET (d);
+ int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
UPDATE_SYNTAX_TABLE (pos1);
}
#endif
/* We goto here if a matching operation fails. */
fail:
+#if defined (WINDOWSNT) && defined (emacs)
+ QUIT;
+#endif
if (!FAIL_STACK_EMPTY ())
{ /* A restart point is known. Restore to that state. */
DEBUG_PRINT1 ("\nFAIL:\n");
RE_TRANSLATE_TYPE translate;
{
register unsigned char *p1 = s1, *p2 = s2;
- while (len)
+ unsigned char *p1_end = s1 + len;
+ unsigned char *p2_end = s2 + len;
+
+ while (p1 != p1_end && p2 != p2_end)
{
- if (RE_TRANSLATE (translate, *p1++) != RE_TRANSLATE (translate, *p2++))
+ int p1_charlen, p2_charlen;
+ int p1_ch, p2_ch;
+
+ p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+ p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+
+ if (RE_TRANSLATE (translate, p1_ch)
+ != RE_TRANSLATE (translate, p2_ch))
return 1;
- len--;
+
+ p1 += p1_charlen, p2 += p2_charlen;
}
+
+ if (p1 != p1_end || p2 != p2_end)
+ return 1;
+
return 0;
}
\f