USA. */
/* TODO:
- - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac".
- - use `keep_string' more often than just .*\n.
- structure the opcode space into opcode+flag.
- - merge with glic's regex.[ch]
-
- That's it for now -sm */
+ - merge with glibc's regex.[ch]
+ */
/* AIX requires this to be the first thing in the file. */
#if defined (_AIX) && !defined (REGEX_MALLOC)
/* Converts the pointer to the char to BEG-based offset from the start. */
#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
-#else
-#define PTR_TO_OFFSET(d) 0
#endif
#ifdef HAVE_CONFIG_H
#define realloc xrealloc
#define free xfree
+#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_STRING_CHAR(p, s) \
(multibyte ? (STRING_CHAR (p, s)) : (*(p)))
+#define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
+ (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
+
+/* Set C a (possibly multibyte) character before P. P points into a
+ string which is the virtual concatenation of STR1 (which ends at
+ END1) or STR2 (which ends at END2). */
+#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
+ do { \
+ if (multibyte) \
+ { \
+ re_char *dtemp = (p) == (str2) ? (end1) : (p); \
+ re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
+ while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
+ c = STRING_CHAR (dtemp, (p) - dtemp); \
+ } \
+ else \
+ (c = ((p) == (str2) ? (end1) : (p))[-1]); \
+ } while (0)
+
#else /* not emacs */
/* Define the syntax stuff for \<, \>, etc. */
-/* This must be nonzero for the wordchar and notwordchar pattern
- commands in re_match_2. */
-#ifndef Sword
-#define Sword 1
-#endif
+/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
+enum syntaxcode { Swhitespace = 0, Sword = 1 };
#ifdef SWITCH_ENUM_BUG
#define SWITCH_ENUM_CAST(x) ((int)(x))
/* Dummy macros for non-Emacs environments. */
#define BASE_LEADING_CODE_P(c) (0)
+#define CHAR_CHARSET(c) 0
+#define CHARSET_LEADING_CODE_BASE(c) 0
+#define MAX_MULTIBYTE_LENGTH 1
+#define RE_MULTIBYTE_P(x) 0
#define WORD_BOUNDARY_P(c1, c2) (0)
#define CHAR_HEAD_P(p) (1)
#define SINGLE_BYTE_CHAR_P(c) (1)
#define MULTIBYTE_FORM_LENGTH(p, s) (1)
#define STRING_CHAR(p, s) (*(p))
#define RE_STRING_CHAR STRING_CHAR
+#define CHAR_STRING(c, s) (*(s) = (c), 1)
#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
+#define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
(c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
#endif /* not emacs */
#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything. */
-#define REGEX_FREE_STACK(arg)
+#define REGEX_FREE_STACK(arg) ((void)0)
#endif /* not REGEX_MALLOC */
#endif /* not using relocating allocator */
indefinitely). */
on_failure_jump_loop,
+ /* Just like `on_failure_jump_loop', except that it checks for
+ a different kind of loop (the kind that shows up with non-greedy
+ operators). This operation has to be immediately preceded
+ by a `no_op'. */
+ on_failure_jump_nastyloop,
+
/* A smart `on_failure_jump' used for greedy * and + operators.
It analyses the loop before which it is put and if the
loop does not require backtracking, it changes itself to
on_failure_jump_smart,
/* Followed by two-byte relative address and two-byte number n.
- After matching N times, jump to the address upon failure. */
+ After matching N times, jump to the address upon failure.
+ Does not work if N starts at 0: use on_failure_jump_loop
+ instead. */
succeed_n,
/* Followed by two-byte relative address, and two-byte number n.
bytes of number. */
set_number_at,
- wordchar, /* Matches any word-constituent character. */
- notwordchar, /* Matches any char that is not a word-constituent. */
-
wordbeg, /* Succeeds if at word beginning. */
wordend, /* Succeeds if at word end. */
wordbound, /* Succeeds if at a word boundary. */
- notwordbound /* Succeeds if not at a word boundary. */
-
-#ifdef emacs
- ,before_dot, /* Succeeds if before point. */
- at_dot, /* Succeeds if at point. */
- after_dot, /* Succeeds if after point. */
+ notwordbound, /* Succeeds if not at a word boundary. */
/* Matches any character whose syntax is specified. Followed by
a byte which contains a syntax code, e.g., Sword. */
syntaxspec,
/* Matches any character whose syntax is not that specified. */
- notsyntaxspec,
+ notsyntaxspec
+
+#ifdef emacs
+ ,before_dot, /* Succeeds if before point. */
+ at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
/* Matches any character whose category-set contains the specified
category. The operator is followed by a byte which contains a
printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
break;
+ case on_failure_jump_nastyloop:
+ extract_number_and_incr (&mcnt, &p);
+ printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
+ break;
+
case on_failure_jump_loop:
extract_number_and_incr (&mcnt, &p);
printf ("/on_failure_jump_loop to %d", p + mcnt - start);
case wordend:
printf ("/wordend");
+ case syntaxspec:
+ printf ("/syntaxspec");
+ mcnt = *p++;
+ printf ("/%d", mcnt);
+ break;
+
+ case notsyntaxspec:
+ printf ("/notsyntaxspec");
+ mcnt = *p++;
+ printf ("/%d", mcnt);
+ break;
+
#ifdef emacs
case before_dot:
printf ("/before_dot");
printf ("/after_dot");
break;
- case syntaxspec:
- printf ("/syntaxspec");
+ case categoryspec:
+ printf ("/categoryspec");
mcnt = *p++;
printf ("/%d", mcnt);
break;
- case notsyntaxspec:
- printf ("/notsyntaxspec");
+ case notcategoryspec:
+ printf ("/notcategoryspec");
mcnt = *p++;
printf ("/%d", mcnt);
break;
#endif /* emacs */
- case wordchar:
- printf ("/wordchar");
- break;
-
- case notwordchar:
- printf ("/notwordchar");
- break;
-
case begbuf:
printf ("/begbuf");
break;
fail_stack.frame = 0; \
} while (0)
-#define RESET_FAIL_STACK()
+#define RESET_FAIL_STACK() ((void)0)
#endif
const unsigned char *pend,
reg_syntax_t syntax));
static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
+static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend,
+ char *fastmap, const int multibyte));
/* Fetch the next character in the uncompiled pattern---translating it
if necessary. Also cast from a signed character in the constant
string passed to us by the user to an unsigned char that we can use
as an array index (in, e.g., `translate'). */
-#ifndef PATFETCH
#define PATFETCH(c) \
do { \
PATFETCH_RAW (c); \
- if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
+ c = TRANSLATE (c); \
} while (0)
-#endif
/* Fetch the next character in the uncompiled pattern, with no
translation. */
#define PATFETCH_RAW(c) \
- do {if (p == pend) return REG_EEND; \
- c = *p++; \
+ do { \
+ int len; \
+ if (p == pend) return REG_EEND; \
+ c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
+ p += len; \
} while (0)
-/* Go backwards one character in the pattern. */
-#define PATUNFETCH p--
-
/* If `translate' is non-null, return translate[D], else just D. We
cast the subscript to translate because some data is declared as
/* Work area for range table of charset. */
struct range_table_work_area range_table_work;
+ /* If the object matched can contain multibyte characters. */
+ const boolean multibyte = RE_MULTIBYTE_P (bufp);
+
#ifdef DEBUG
debug++;
DEBUG_PRINT1 ("\nCompiling pattern: ");
/* Always count groups, whether or not bufp->no_sub is set. */
bufp->re_nsub = 0;
-#ifdef emacs
- /* bufp->multibyte is set before regex_compile is called, so don't alter
- it. */
-#else /* not emacs */
- /* Nothing is recognized as a multibyte character. */
- bufp->multibyte = 0;
-#endif
-
#if !defined (emacs) && !defined (SYNTAX_TABLE)
/* Initialize the syntax table. */
init_syntax_once ();
if (p == pend)
break;
-
- PATFETCH (c);
-
- if (c == '*'
- || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
+ else if (*p == '*'
+ || (!(syntax & RE_BK_PLUS_QM)
+ && (*p == '+' || *p == '?')))
;
-
- else if (syntax & RE_BK_PLUS_QM && c == '\\')
+ else if (syntax & RE_BK_PLUS_QM && *p == '\\')
{
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
- {
- PATUNFETCH;
- PATUNFETCH;
- break;
- }
-
- c = c1;
+ if (p+1 == pend)
+ FREE_STACK_RETURN (REG_EESCAPE);
+ if (p[1] == '+' || p[1] == '?')
+ PATFETCH (c); /* Gobble up the backslash. */
+ else
+ break;
}
else
- {
- PATUNFETCH;
- break;
- }
-
+ break;
/* If we get here, we found another repeat character. */
- }
+ PATFETCH (c);
+ }
/* Star, etc. applied to an empty pattern is equivalent
to an empty pattern. */
{
boolean simple = skip_one_char (laststart) == b;
unsigned int startoffset = 0;
+ re_opcode_t ofj =
+ (simple || !analyse_first (laststart, b, NULL, 0)) ?
+ on_failure_jump : on_failure_jump_loop;
assert (skip_one_char (laststart) <= b);
if (!zero_times_ok && simple)
GET_BUFFER_SPACE (6);
if (!zero_times_ok)
/* A + loop. */
- STORE_JUMP (on_failure_jump_loop, b, b + 6);
+ STORE_JUMP (ofj, b, b + 6);
else
/* Simple * loops can use on_failure_keep_string_jump
depending on what follows. But since we don't know
that yet, we leave the decision up to
on_failure_jump_smart. */
- INSERT_JUMP (simple ? on_failure_jump_smart
- : on_failure_jump_loop,
+ INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
laststart + startoffset, b + 6);
b += 3;
STORE_JUMP (jump, b, laststart + startoffset);
else /* not greedy */
{ /* I wish the greedy and non-greedy cases could be merged. */
+ GET_BUFFER_SPACE (7); /* We might use less. */
if (many_times_ok)
{
+ boolean emptyp = analyse_first (laststart, b, NULL, 0);
+
/* The non-greedy multiple match looks like a repeat..until:
we only need a conditional jump at the end of the loop */
- GET_BUFFER_SPACE (3);
- STORE_JUMP (on_failure_jump, b, laststart);
+ if (emptyp) BUF_PUSH (no_op);
+ STORE_JUMP (emptyp ? on_failure_jump_nastyloop
+ : on_failure_jump, b, laststart);
b += 3;
if (zero_times_ok)
{
/* The repeat...until naturally matches one or more.
To also match zero times, we need to first jump to
the end of the loop (its conditional jump). */
- GET_BUFFER_SPACE (3);
INSERT_JUMP (jump, laststart, b);
b += 3;
}
else
{
/* non-greedy a?? */
- GET_BUFFER_SPACE (6);
INSERT_JUMP (jump, laststart, b + 3);
b += 3;
INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
/* Read in characters and ranges, setting map bits. */
for (;;)
{
- int len;
boolean escaped_char = false;
+ const unsigned char *p2 = p;
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
/* Could be the end of the bracket expression. If it's
not (i.e., when the bracket expression is `[]' so
far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
+ if (c == ']' && p2 != p1)
break;
}
- /* If C indicates start of multibyte char, get the
- actual character code in C, and set the pattern
- pointer P to the next character boundary. */
- if (bufp->multibyte && BASE_LEADING_CODE_P (c))
- {
- PATUNFETCH;
- c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
- p += len;
- }
/* What should we do for the character which is
greater than 0x7F, but not BASE_LEADING_CODE_P?
XXX */
/* See if we're at the beginning of a possible character
class. */
- else if (!escaped_char &&
- syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+ if (!escaped_char &&
+ syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
{
/* Leave room for the null. */
char str[CHAR_CLASS_MAX_LENGTH + 1];
+ const unsigned char *class_beg;
PATFETCH (c);
c1 = 0;
+ class_beg = p;
/* If pattern is `[[:'. */
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
they can only match ASCII characters. We
don't need to handle them for multibyte. */
- if (bufp->multibyte)
+ if (multibyte)
{
int bit = 0;
}
else
{
- c1++;
- while (c1--)
- PATUNFETCH;
+ /* Go back to right after the "[:". */
+ p = class_beg;
SET_LIST_BIT ('[');
/* Because the `:' may starts the range, we
/* Fetch the character which ends the range. */
PATFETCH (c1);
- if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
- {
- PATUNFETCH;
- c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
- p += len;
- }
if (SINGLE_BYTE_CHAR_P (c)
&& ! SINGLE_BYTE_CHAR_P (c1))
if (p+1 < pend)
{
/* Look for a special (?...) construct */
- PATFETCH (c);
- if ((syntax & RE_SHY_GROUPS) && c == '?')
+ if ((syntax & RE_SHY_GROUPS) && *p == '?')
{
+ PATFETCH (c); /* Gobble up the '?'. */
PATFETCH (c);
switch (c)
{
FREE_STACK_RETURN (REG_BADPAT);
}
}
- else PATUNFETCH;
}
if (!shy)
if (!(syntax & RE_INTERVALS)
/* If we're at `\{' and it's not the open-interval
operator. */
- || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- || (p - 2 == pattern && p == pend))
+ || (syntax & RE_NO_BK_BRACES)
+ /* What is that? -sm */
+ /* || (p - 2 == pattern && p == pend) */)
goto normal_backslash;
handle_interval:
/* At least (most) this many matches must be made. */
int lower_bound = 0, upper_bound = -1;
- beg_interval = p - 1;
+ beg_interval = p;
if (p == pend)
{
GET_UNSIGNED_NUMBER (lower_bound);
if (c == ',')
- {
- GET_UNSIGNED_NUMBER (upper_bound);
- if (upper_bound < 0) upper_bound = RE_DUP_MAX;
- }
+ GET_UNSIGNED_NUMBER (upper_bound);
else
/* Interval such as `{1}' => match exactly once. */
upper_bound = lower_bound;
if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound)
+ || (upper_bound >= 0 && lower_bound > upper_bound))
{
if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
goto unfetch_interval;
}
- /* If the upper bound is zero, don't want to succeed at
- all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
if (upper_bound == 0)
- {
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (jump, laststart, b + 3);
- b += 3;
- }
+ /* If the upper bound is zero, just drop the sub pattern
+ altogether. */
+ b = laststart;
+ else if (lower_bound == 1 && upper_bound == 1)
+ /* Just match it once: nothing to do here. */
+ ;
/* Otherwise, we have a nontrivial interval. When
we're all done, the pattern will look like:
else
{ /* If the upper bound is > 1, we need to insert
more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
-
- GET_BUFFER_SPACE (nbytes);
-
- /* Initialize lower bound of the `succeed_n', even
- though it will be set during matching by its
- attendant `set_number_at' (inserted next),
- because `re_compile_fastmap' needs to know.
- Jump to the `jump_n' we might insert below. */
- INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
- lower_bound);
- b += 5;
-
- /* Code to initialize the lower bound. Insert
- before the `succeed_n'. The `5' is the last two
- bytes of this `set_number_at', plus 3 bytes of
- the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
-
- if (upper_bound > 1)
+ unsigned int nbytes = (upper_bound < 0 ? 3
+ : upper_bound > 1 ? 5 : 0);
+ unsigned int startoffset = 0;
+
+ GET_BUFFER_SPACE (20); /* We might use less. */
+
+ if (lower_bound == 0)
+ {
+ /* A succeed_n that starts with 0 is really a
+ a simple on_failure_jump_loop. */
+ INSERT_JUMP (on_failure_jump_loop, laststart,
+ b + 3 + nbytes);
+ b += 3;
+ }
+ else
+ {
+ /* Initialize lower bound of the `succeed_n', even
+ though it will be set during matching by its
+ attendant `set_number_at' (inserted next),
+ because `re_compile_fastmap' needs to know.
+ Jump to the `jump_n' we might insert below. */
+ INSERT_JUMP2 (succeed_n, laststart,
+ b + 5 + nbytes,
+ lower_bound);
+ b += 5;
+
+ /* Code to initialize the lower bound. Insert
+ before the `succeed_n'. The `5' is the last two
+ bytes of this `set_number_at', plus 3 bytes of
+ the following `succeed_n'. */
+ insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+ b += 5;
+ startoffset += 5;
+ }
+
+ if (upper_bound < 0)
+ {
+ /* A negative upper bound stands for infinity,
+ in which case it degenerates to a plain jump. */
+ STORE_JUMP (jump, b, laststart + startoffset);
+ b += 3;
+ }
+ else if (upper_bound > 1)
{ /* More than one repetition is allowed, so
append a backward jump to the `succeed_n'
that starts this interval.
When we've reached this during matching,
we'll have matched the interval once, so
jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + 5,
+ STORE_JUMP2 (jump_n, b, laststart + startoffset,
upper_bound - 1);
b += 5;
beg_interval = NULL;
/* normal_char and normal_backslash need `c'. */
- PATFETCH (c);
+ c = '{';
if (!(syntax & RE_NO_BK_BRACES))
{
- if (p > pattern && p[-1] == '\\')
- goto normal_backslash;
+ assert (p > pattern && p[-1] == '\\');
+ goto normal_backslash;
}
- goto normal_char;
+ else
+ goto normal_char;
#ifdef emacs
/* There is no way to specify the before_dot and after_dot
case 'w':
laststart = b;
- BUF_PUSH (wordchar);
+ BUF_PUSH_2 (syntaxspec, Sword);
break;
case 'W':
laststart = b;
- BUF_PUSH (notwordchar);
+ BUF_PUSH_2 (notsyntaxspec, Sword);
break;
default:
/* Expects the character in `c'. */
normal_char:
- p1 = p - 1; /* P1 points the head of C. */
-#ifdef emacs
- if (bufp->multibyte)
- {
- c = STRING_CHAR (p1, pend - p1);
- c = TRANSLATE (c);
- /* Set P to the next character boundary. */
- p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
- }
-#endif
/* If no exactn currently being built. */
if (!pending_exact
|| pending_exact + *pending_exact + 1 != b
/* We have only one byte following the exactn for the count. */
- || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
+ || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
/* If followed by a repetition operator. */
|| (p != pend && (*p == '*' || *p == '^'))
pending_exact = b - 1;
}
-#ifdef emacs
- if (! SINGLE_BYTE_CHAR_P (c))
- {
- unsigned char str[MAX_MULTIBYTE_LENGTH];
- int i = CHAR_STRING (c, str);
- int j;
- for (j = 0; j < i; j++)
- {
- BUF_PUSH (str[j]);
- (*pending_exact)++;
- }
- }
- else
-#endif
- {
- BUF_PUSH (c);
- (*pending_exact)++;
- }
+ GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
+ {
+ int len = CHAR_STRING (c, b);
+ b += len;
+ (*pending_exact) += len;
+ }
+
break;
} /* switch (c) */
} /* while p != pend */
return false;
}
\f
-/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
- BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
- characters can start a string that matches the pattern. This fastmap
- is used by re_search to skip quickly over impossible starting points.
+/* analyse_first.
+ If fastmap is non-NULL, go through the pattern and fill fastmap
+ with all the possible leading chars. If fastmap is NULL, don't
+ bother filling it up (obviously) and only return whether the
+ pattern could potentially match the empty string.
+
+ Return 1 if p..pend might match the empty string.
+ Return 0 if p..pend matches at least one char.
+ Return -1 if p..pend matches at least one char, but fastmap was not
+ updated accurately.
+ Return -2 if an error occurred. */
- Character codes above (1 << BYTEWIDTH) are not represented in the
- fastmap, but the leading codes are represented. Thus, the fastmap
- indicates which character sets could start a match.
-
- The caller must supply the address of a (1 << BYTEWIDTH)-byte data
- area as BUFP->fastmap.
-
- We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
- the pattern buffer.
-
- Returns 0 if we succeed, -2 if an internal error. */
-
-int
-re_compile_fastmap (bufp)
- struct re_pattern_buffer *bufp;
+static int
+analyse_first (p, pend, fastmap, multibyte)
+ unsigned char *p, *pend;
+ char *fastmap;
+ const int multibyte;
{
int j, k;
+ boolean not;
#ifdef MATCH_MAY_ALLOCATE
fail_stack_type fail_stack;
#endif
char *destination;
#endif
- register char *fastmap = bufp->fastmap;
- unsigned char *pattern = bufp->buffer;
- unsigned long size = bufp->used;
- unsigned char *p = pattern;
- register unsigned char *pend = pattern + size;
-
#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
/* This holds the pointer to the failure stack, when
it is allocated relocatably. */
match the empty string. */
boolean path_can_be_null = true;
- /* We aren't doing a `succeed_n' to begin with. */
- boolean succeed_n_p = false;
-
/* If all elements for base leading-codes in fastmap is set, this
flag is set true. */
boolean match_any_multibyte_characters = false;
- /* Maximum code of simple (single byte) character. */
- int simple_char_max;
-
- assert (fastmap != NULL && p != NULL);
+ assert (p);
INIT_FAIL_STACK ();
- bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
- bufp->fastmap_accurate = 1; /* It will be when we're done. */
- bufp->can_be_null = 0;
/* The loop below works as follows:
- It has a working-list kept in the PATTERN_STACK and which basically
never set `p' (or push) anything `<= p1'. */
/* If can_be_null is set, then the fastmap will not be used anyway. */
- while (!bufp->can_be_null)
+ while (1)
{
/* `p1' is used as a marker of how far back a `on_failure_jump'
can go without being ignored. It is normally equal to `p'
as used for the *? operator. */
unsigned char *p1 = p;
- if (p == pend || *p == succeed)
+ if (p >= pend)
{
- /* We have reached the (effective) end of pattern. */
- if (!PATTERN_STACK_EMPTY ())
- {
- bufp->can_be_null |= path_can_be_null;
-
- /* Reset for next path. */
- path_can_be_null = true;
+ if (path_can_be_null)
+ return (RESET_FAIL_STACK (), 1);
- p = (unsigned char*) POP_PATTERN_OP ();
+ /* We have reached the (effective) end of pattern. */
+ if (PATTERN_STACK_EMPTY ())
+ return (RESET_FAIL_STACK (), 0);
- continue;
- }
- else
- break;
+ p = (unsigned char*) POP_PATTERN_OP ();
+ path_can_be_null = true;
+ continue;
}
/* We should never be about to go beyond the end of the pattern. */
switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
{
+ case succeed:
+ p = pend;
+ continue;
case duplicate:
/* If the first character has to match a backreference, that means
with `break'. */
case exactn:
- fastmap[p[1]] = 1;
- break;
-
-
-#ifndef emacs
- case charset:
- {
- int length = (*p & 0x7f);;
- p++;
-
- for (j = length * BYTEWIDTH - 1; j >= 0; j--)
- if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- fastmap[j] = 1;
- }
- break;
-
- case charset_not:
- /* Chars beyond end of map must be allowed. */
- {
- int length = (*p & 0x7f);;
- p++;
-
- for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
-
- for (j = length * BYTEWIDTH - 1; j >= 0; j--)
- if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
- fastmap[j] = 1;
- }
- break;
-
- case wordchar:
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) == Sword)
- fastmap[j] = 1;
+ if (fastmap) fastmap[p[1]] = 1;
break;
- case notwordchar:
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) != Sword)
- fastmap[j] = 1;
- break;
-#else /* emacs */
- case charset:
- for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
- j >= 0; j--)
- if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- fastmap[j] = 1;
-
- /* If we can match a character class, we can match
- any character set. */
- if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
- && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
- goto set_fastmap_for_multibyte_characters;
-
- if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
- && match_any_multibyte_characters == false)
- {
- /* Set fastmap[I] 1 where I is a base leading code of each
- multibyte character in the range table. */
- int c, count;
-
- /* Make P points the range table. */
- p += CHARSET_BITMAP_SIZE (&p[-2]);
-
- /* Extract the number of ranges in range table into COUNT. */
- EXTRACT_NUMBER_AND_INCR (count, p);
- for (; count > 0; count--, p += 2 * 3) /* XXX */
- {
- /* Extract the start of each range. */
- EXTRACT_CHARACTER (c, p);
- j = CHAR_CHARSET (c);
- fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
- }
- }
- break;
+ case anychar:
+ /* We could put all the chars except for \n (and maybe \0)
+ but we don't bother since it is generally not worth it. */
+ if (!fastmap) break;
+ return (RESET_FAIL_STACK (), -1);
case charset_not:
All the single-byte codes can occur in multibyte buffers.
So any that are not listed in the charset
are possible matches, even in multibyte buffers. */
- simple_char_max = (1 << BYTEWIDTH);
+ if (!fastmap) break;
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
- j < simple_char_max; j++)
+ j < (1 << BYTEWIDTH); j++)
fastmap[j] = 1;
-
+ /* Fallthrough */
+ case charset:
+ if (!fastmap) break;
+ not = (re_opcode_t) *(p - 1) == charset_not;
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
j >= 0; j--)
- if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
+ if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
fastmap[j] = 1;
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- which doesn't match the specified set of characters. */
+ if ((not && multibyte)
+ /* Any character set can possibly contain a character
+ which doesn't match the specified set of characters. */
+ || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
+ /* If we can match a character class, we can match
+ any character set. */
{
set_fastmap_for_multibyte_characters:
if (match_any_multibyte_characters == false)
match_any_multibyte_characters = true;
}
}
- break;
-
-
- case wordchar:
- /* All the single-byte codes can occur in multibyte buffers,
- and they may have word syntax. So do consider them. */
- simple_char_max = (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (SYNTAX (j) == Sword)
- fastmap[j] = 1;
-
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- whose syntax is `Sword'. */
- goto set_fastmap_for_multibyte_characters;
- break;
+ else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && match_any_multibyte_characters == false)
+ {
+ /* Set fastmap[I] 1 where I is a base leading code of each
+ multibyte character in the range table. */
+ int c, count;
- case notwordchar:
- /* All the single-byte codes can occur in multibyte buffers,
- and they may not have word syntax. So do consider them. */
- simple_char_max = (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (SYNTAX (j) != Sword)
- fastmap[j] = 1;
+ /* Make P points the range table. `+ 2' is to skip flag
+ bits for a character class. */
+ p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- whose syntax is not `Sword'. */
- goto set_fastmap_for_multibyte_characters;
+ /* Extract the number of ranges in range table into COUNT. */
+ EXTRACT_NUMBER_AND_INCR (count, p);
+ for (; count > 0; count--, p += 2 * 3) /* XXX */
+ {
+ /* Extract the start of each range. */
+ EXTRACT_CHARACTER (c, p);
+ j = CHAR_CHARSET (c);
+ fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
+ }
+ }
break;
-#endif
-
- case anychar:
- {
- int fastmap_newline = fastmap['\n'];
-
- /* `.' matches anything, except perhaps newline.
- Even in a multibyte buffer, it should match any
- conceivable byte value for the fastmap. */
- if (bufp->multibyte)
- match_any_multibyte_characters = true;
-
- simple_char_max = (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- fastmap[j] = 1;
-
- /* ... except perhaps newline. */
- if (!(bufp->syntax & RE_DOT_NEWLINE))
- fastmap['\n'] = fastmap_newline;
-
- /* Otherwise, have to check alternative paths. */
- break;
- }
-#ifdef emacs
- case wordbound:
- case notwordbound:
- case wordbeg:
- case wordend:
- case notsyntaxspec:
case syntaxspec:
- /* This match depends on text properties. These end with
- aborting optimizations. */
- bufp->can_be_null = 1;
- continue;
-#if 0
- k = *p++;
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (SYNTAX (j) == (enum syntaxcode) k)
- fastmap[j] = 1;
-
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- whose syntax is K. */
- goto set_fastmap_for_multibyte_characters;
- break;
-
case notsyntaxspec:
+ if (!fastmap) break;
+#ifndef emacs
+ not = (re_opcode_t)p[-1] == notsyntaxspec;
k = *p++;
- simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (SYNTAX (j) != (enum syntaxcode) k)
+ for (j = 0; j < (1 << BYTEWIDTH); j++)
+ if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
fastmap[j] = 1;
-
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- whose syntax is not K. */
- goto set_fastmap_for_multibyte_characters;
break;
-#endif
-
+#else /* emacs */
+ /* This match depends on text properties. These end with
+ aborting optimizations. */
+ return (RESET_FAIL_STACK (), -1);
case categoryspec:
- k = *p++;
- simple_char_max = (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (CHAR_HAS_CATEGORY (j, k))
- fastmap[j] = 1;
-
- if (bufp->multibyte)
- /* Any character set can possibly contain a character
- whose category is K. */
- goto set_fastmap_for_multibyte_characters;
- break;
-
-
case notcategoryspec:
+ if (!fastmap) break;
+ not = (re_opcode_t)p[-1] == notcategoryspec;
k = *p++;
- simple_char_max = (1 << BYTEWIDTH);
- for (j = 0; j < simple_char_max; j++)
- if (!CHAR_HAS_CATEGORY (j, k))
+ for (j = 0; j < (1 << BYTEWIDTH); j++)
+ if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
fastmap[j] = 1;
- if (bufp->multibyte)
+ if (multibyte)
/* Any character set can possibly contain a character
- whose category is not K. */
+ whose category is K (or not). */
goto set_fastmap_for_multibyte_characters;
break;
/* All cases after this match the empty string. These end with
`continue'. */
-
case before_dot:
case at_dot:
case after_dot:
- continue;
-#endif /* emacs */
-
-
+#endif /* !emacs */
case no_op:
case begline:
case endline:
case begbuf:
case endbuf:
-#ifndef emacs
case wordbound:
case notwordbound:
case wordbeg:
case wordend:
-#endif
continue;
- case jump_n:
case jump:
EXTRACT_NUMBER_AND_INCR (j, p);
if (j < 0)
case on_failure_jump:
case on_failure_keep_string_jump:
case on_failure_jump_loop:
+ case on_failure_jump_nastyloop:
case on_failure_jump_smart:
p++;
break;
case on_failure_jump:
case on_failure_keep_string_jump:
+ case on_failure_jump_nastyloop:
case on_failure_jump_loop:
case on_failure_jump_smart:
- handle_on_failure_jump:
EXTRACT_NUMBER_AND_INCR (j, p);
-
- /* For some patterns, e.g., `(a?)?', `p+j' here points to the
- end of the pattern. We don't want to push such a point,
- since when we restore it above, entering the switch will
- increment `p' past the end of the pattern. We don't need
- to push such a point since we obviously won't find any more
- fastmap entries beyond `pend'. Such a pattern can match
- the null string, though. */
if (p + j <= p1)
- /* Backward jump to be ignored. */
- ;
- else if (p + j < pend)
- {
- if (!PUSH_PATTERN_OP (p + j, fail_stack))
- {
- RESET_FAIL_STACK ();
- return -2;
- }
- }
- else
- bufp->can_be_null = 1;
-
- if (succeed_n_p)
- {
- EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
- succeed_n_p = false;
- }
-
+ ; /* Backward jump to be ignored. */
+ else if (!PUSH_PATTERN_OP (p + j, fail_stack))
+ return (RESET_FAIL_STACK (), -2);
continue;
+ case jump_n:
+ /* This code simply does not properly handle forward jump_n. */
+ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
+ p += 4;
+ /* jump_n can either jump or fall through. The (backward) jump
+ case has already been handled, so we only need to look at the
+ fallthrough case. */
+ continue;
+
case succeed_n:
- /* Get to the number of times to succeed. */
- p += 2;
-
- /* Increment p past the n for when k != 0. */
- EXTRACT_NUMBER_AND_INCR (k, p);
- if (k == 0)
- {
- p -= 4;
- succeed_n_p = true; /* Spaghetti code alert. */
- goto handle_on_failure_jump;
- }
+ /* If N == 0, it should be an on_failure_jump_loop instead. */
+ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
+ p += 4;
+ /* We only care about one iteration of the loop, so we don't
+ need to consider the case where this behaves like an
+ on_failure_jump. */
continue;
p = pend;
} /* while p */
- /* Set `can_be_null' for the last path (also the first path, if the
- pattern is empty). */
- bufp->can_be_null |= path_can_be_null;
- RESET_FAIL_STACK ();
+ return (RESET_FAIL_STACK (), 0);
+} /* analyse_first */
+\f
+/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
+ BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
+ characters can start a string that matches the pattern. This fastmap
+ is used by re_search to skip quickly over impossible starting points.
+
+ Character codes above (1 << BYTEWIDTH) are not represented in the
+ fastmap, but the leading codes are represented. Thus, the fastmap
+ indicates which character sets could start a match.
+
+ The caller must supply the address of a (1 << BYTEWIDTH)-byte data
+ area as BUFP->fastmap.
+
+ We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
+ the pattern buffer.
+
+ Returns 0 if we succeed, -2 if an internal error. */
+
+int
+re_compile_fastmap (bufp)
+ struct re_pattern_buffer *bufp;
+{
+ char *fastmap = bufp->fastmap;
+ int analysis;
+
+ assert (fastmap && bufp->buffer);
+
+ bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
+ bufp->fastmap_accurate = 1; /* It will be when we're done. */
+
+ analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
+ fastmap, RE_MULTIBYTE_P (bufp));
+ if (analysis < -1)
+ return analysis;
+ bufp->can_be_null = (analysis != 0);
return 0;
} /* re_compile_fastmap */
\f
int anchored_start = 0;
/* Nonzero if we have to concern multibyte character. */
- int multibyte = bufp->multibyte;
+ const boolean multibyte = RE_MULTIBYTE_P (bufp);
/* Check for out-of-range STARTPOS. */
if (startpos < 0 || startpos > total_size)
}
else /* Searching backwards. */
{
- buf_ch = STRING_CHAR (d, (startpos >= size1
- ? size2 + size1 - startpos
- : size1 - startpos));
- if (RE_TRANSLATE_P (translate))
- buf_ch = RE_TRANSLATE (translate, buf_ch);
+ int room = (startpos >= size1
+ ? size2 + size1 - startpos
+ : size1 - startpos);
+ buf_ch = RE_STRING_CHAR (d, room);
+ buf_ch = TRANSLATE (buf_ch);
if (! (buf_ch >= 0400
|| fastmap[buf_ch]))
\f
/* Declarations and macros for re_match_2. */
-static int bcmp_translate ();
+static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
+ register int len,
+ RE_TRANSLATE_TYPE translate,
+ const int multibyte));
/* This converts PTR, a pointer into one of the search strings `string1'
and `string2' into an offset from the beginning of that string. */
: ((regoff_t) ((ptr) - string2 + size1)))
/* Call before fetching a character with *d. This switches over to
- string2 if necessary. */
+ string2 if necessary.
+ Check re_match_2_internal for a discussion of why end_match_2 might
+ not be within string2 (but be equal to end_match_1 instead). */
#define PREFETCH() \
while (d == dend) \
{ \
p += 1 + CHARSET_BITMAP_SIZE (p - 1);
break;
-#ifdef emacs
case syntaxspec:
case notsyntaxspec:
+#ifdef emacs
case categoryspec:
case notcategoryspec:
#endif /* emacs */
unsigned char *p1, *p2;
{
re_opcode_t op2;
- const boolean multibyte = bufp->multibyte;
+ const boolean multibyte = RE_MULTIBYTE_P (bufp);
unsigned char *pend = bufp->buffer + bufp->used;
assert (p1 >= bufp->buffer && p1 < pend
}
}
-#ifdef emacs
case wordend:
case notsyntaxspec:
return ((re_opcode_t) *p1 == syntaxspec
|| (re_opcode_t) *p1 == syntaxspec)
&& p1[1] == Sword);
+#ifdef emacs
case categoryspec:
return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
case notcategoryspec:
RE_TRANSLATE_TYPE translate = bufp->translate;
/* Nonzero if we have to concern multibyte character. */
- int multibyte = bufp->multibyte;
+ const boolean multibyte = RE_MULTIBYTE_P (bufp);
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
for (mcnt = 1; mcnt < num_regs; mcnt++)
regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE;
- /* Shorten strings to `stop'. */
- if (stop <= size1)
- {
- size1 = stop;
- size2 = 0;
- }
- else if (stop <= size1 + size2)
- size2 = stop - size1;
-
/* We move `string1' into `string2' if the latter's empty -- but not if
`string1' is null. */
if (size2 == 0 && string1 != NULL)
end1 = string1 + size1;
end2 = string2 + size2;
- /* Compute where to stop matching, within the two strings. */
- end_match_1 = end1;
- end_match_2 = end2;
-
/* `p' scans through the pattern as `d' scans through the data.
`dend' is the end of the input string that `d' points within. `d'
is advanced into the following input string whenever necessary, but
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
equal `string2'. */
- if (size1 > 0 && pos <= size1)
+ if (pos >= size1)
{
- d = string1 + pos;
- dend = end_match_1;
+ /* Only match within string2. */
+ d = string2 + pos - size1;
+ dend = end_match_2 = string2 + stop - size1;
+ end_match_1 = end1; /* Just to give it a value. */
}
else
{
- d = string2 + pos - size1;
- dend = end_match_2;
+ if (stop <= size1)
+ {
+ /* Only match within string1. */
+ end_match_1 = string1 + stop;
+ /* BEWARE!
+ When we reach end_match_1, PREFETCH normally switches to string2.
+ But in the present case, this means that just doing a PREFETCH
+ makes us jump from `stop' to `gap' within the string.
+ What we really want here is for the search to stop as
+ soon as we hit end_match_1. That's why we set end_match_2
+ to end_match_1 (since PREFETCH fails as soon as we hit
+ end_match_2). */
+ end_match_2 = end_match_1;
+ }
+ else
+ {
+ end_match_1 = end1;
+ end_match_2 = string2 + stop - size1;
+ }
+ d = string1 + pos;
+ dend = end_match_1;
}
DEBUG_PRINT1 ("The compiled pattern is: ");
testing `translate' inside the loop. */
if (RE_TRANSLATE_P (translate))
{
-#ifdef emacs
if (multibyte)
do
{
}
while (mcnt > 0);
else
-#endif /* not emacs */
do
{
PREFETCH ();
DEBUG_PRINT1 ("EXECUTING anychar.\n");
PREFETCH ();
-
-#ifdef emacs
- if (multibyte)
- buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
- else
-#endif /* not emacs */
- {
- buf_ch = *d;
- buf_charlen = 1;
- }
-
+ buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
buf_ch = TRANSLATE (buf_ch);
if ((!(bufp->syntax & RE_DOT_NEWLINE)
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
- PREFETCH ();
- c = *d;
-
range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
-#ifdef emacs
if (range_table_exists)
{
range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
EXTRACT_NUMBER_AND_INCR (count, range_table);
}
- if (multibyte && BASE_LEADING_CODE_P (c))
- c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
-#endif /* emacs */
+ PREFETCH ();
+ c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
+ c = TRANSLATE (c); /* The character to match. */
if (SINGLE_BYTE_CHAR_P (c))
{ /* Lookup bitmap. */
- c = TRANSLATE (c); /* The character to match. */
- len = 1;
-
/* Cast to `unsigned' instead of `unsigned char' in
case the bit list is a full 32 bytes long. */
if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
/* Compare that many; failure if mismatch, else move
past them. */
if (RE_TRANSLATE_P (translate)
- ? bcmp_translate (d, d2, mcnt, translate)
+ ? bcmp_translate (d, d2, mcnt, translate, multibyte)
: bcmp (d, d2, mcnt))
{
d = dfail;
{
if (!bufp->not_bol) break;
}
- else if (d[-1] == '\n' && bufp->newline_anchor)
+ else
{
- break;
+ unsigned char c;
+ GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
+ if (c == '\n' && bufp->newline_anchor)
+ break;
}
/* In all other cases, we fail. */
goto fail;
PUSH_FAILURE_POINT (p - 3, NULL);
break;
+ /* A nasty loop is introduced by the non-greedy *? and +?.
+ With such loops, the stack only ever contains one failure point
+ at a time, so that a plain on_failure_jump_loop kind of
+ cycle detection cannot work. Worse yet, such a detection
+ can not only fail to detect a cycle, but it can also wrongly
+ detect a cycle (between different instantiations of the same
+ loop.
+ So the method used for those nasty loops is a little different:
+ We use a special cycle-detection-stack-frame which is pushed
+ when the on_failure_jump_nastyloop failure-point is *popped*.
+ This special frame thus marks the beginning of one iteration
+ through the loop and we can hence easily check right here
+ whether something matched between the beginning and the end of
+ the loop. */
+ case on_failure_jump_nastyloop:
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
+ mcnt, p + mcnt);
+
+ assert ((re_opcode_t)p[-4] == no_op);
+ CHECK_INFINITE_LOOP (p - 4, d);
+ PUSH_FAILURE_POINT (p - 3, d);
+ break;
+
/* Simple loop detecting on_failure_jump: just check on the
failure stack if the same spot was already hit earlier. */
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
#ifdef emacs
- int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1));
+ int offset = PTR_TO_OFFSET (d - 1);
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE (charpos);
#endif
- /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
s1 = SYNTAX (c1);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
PREFETCH ();
- /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
- c2 = STRING_CHAR (d, dend - d);
+ c2 = RE_STRING_CHAR (d, dend - d);
s2 = SYNTAX (c2);
if (/* Case 2: Only one of S1 and S2 is Sword. */
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
#ifdef emacs
- int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
+ int offset = PTR_TO_OFFSET (d);
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE (charpos);
#endif
PREFETCH ();
- /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
- c2 = STRING_CHAR (d, dend - d);
+ c2 = RE_STRING_CHAR (d, dend - d);
s2 = SYNTAX (c2);
/* Case 2: S2 is not Sword. */
is the character at D, and S2 is the syntax of C2. */
int c1, c2, s1, s2;
#ifdef emacs
- int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1);
+ int offset = PTR_TO_OFFSET (d) - 1;
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE (charpos);
#endif
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
if (!AT_STRINGS_END (d))
{
PREFETCH ();
- /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
- c2 = STRING_CHAR (d, dend - d);
+ c2 = RE_STRING_CHAR (d, dend - d);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD (charpos);
#endif
}
break;
-#ifdef emacs
- case before_dot:
- DEBUG_PRINT1 ("EXECUTING before_dot.\n");
- if (PTR_BYTE_POS (d) >= PT_BYTE)
- goto fail;
- break;
-
- case at_dot:
- DEBUG_PRINT1 ("EXECUTING at_dot.\n");
- if (PTR_BYTE_POS (d) != PT_BYTE)
- goto fail;
- break;
-
- case after_dot:
- DEBUG_PRINT1 ("EXECUTING after_dot.\n");
- if (PTR_BYTE_POS (d) <= PT_BYTE)
- goto fail;
- break;
-
case syntaxspec:
- DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
+ case notsyntaxspec:
+ not = (re_opcode_t) *(p - 1) == notsyntaxspec;
mcnt = *p++;
- goto matchsyntax;
-
- case wordchar:
- DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
- mcnt = (int) Sword;
- matchsyntax:
+ DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
PREFETCH ();
#ifdef emacs
{
- int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
+ int offset = PTR_TO_OFFSET (d);
+ int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE (pos1);
}
#endif
{
int c, len;
- if (multibyte)
- /* we must concern about multibyte form, ... */
- c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
- else
- /* everything should be handled as ASCII, even though it
- looks like multibyte form. */
- c = *d, len = 1;
+ c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
- if (SYNTAX (c) != (enum syntaxcode) mcnt)
- goto fail;
+ if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
+ goto fail;
d += len;
}
break;
- case notsyntaxspec:
- DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
- mcnt = *p++;
- goto matchnotsyntax;
-
- case notwordchar:
- DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
- mcnt = (int) Sword;
- matchnotsyntax:
- PREFETCH ();
#ifdef emacs
- {
- int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
- UPDATE_SYNTAX_TABLE (pos1);
- }
-#endif
- {
- int c, len;
-
- if (multibyte)
- c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
- else
- c = *d, len = 1;
-
- if (SYNTAX (c) == (enum syntaxcode) mcnt)
+ case before_dot:
+ DEBUG_PRINT1 ("EXECUTING before_dot.\n");
+ if (PTR_BYTE_POS (d) >= PT_BYTE)
goto fail;
- d += len;
- }
break;
- case categoryspec:
- DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
- mcnt = *p++;
- PREFETCH ();
- {
- int c, len;
-
- if (multibyte)
- c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
- else
- c = *d, len = 1;
+ case at_dot:
+ DEBUG_PRINT1 ("EXECUTING at_dot.\n");
+ if (PTR_BYTE_POS (d) != PT_BYTE)
+ goto fail;
+ break;
- if (!CHAR_HAS_CATEGORY (c, mcnt))
- goto fail;
- d += len;
- }
+ case after_dot:
+ DEBUG_PRINT1 ("EXECUTING after_dot.\n");
+ if (PTR_BYTE_POS (d) <= PT_BYTE)
+ goto fail;
break;
+ case categoryspec:
case notcategoryspec:
- DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
+ not = (re_opcode_t) *(p - 1) == notcategoryspec;
mcnt = *p++;
+ DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
PREFETCH ();
{
int c, len;
+ c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
- if (multibyte)
- c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
- else
- c = *d, len = 1;
-
- if (CHAR_HAS_CATEGORY (c, mcnt))
+ if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
goto fail;
d += len;
}
- break;
-
-#else /* not emacs */
- case wordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
- PREFETCH ();
- if (!WORDCHAR_P (d))
- goto fail;
- d++;
break;
- case notwordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
- PREFETCH ();
- if (WORDCHAR_P (d))
- goto fail;
- d++;
- break;
-#endif /* not emacs */
+#endif /* emacs */
default:
abort ();
assert (str == NULL);
goto continue_failure_jump;
+ case on_failure_jump_nastyloop:
+ assert ((re_opcode_t)pat[-2] == no_op);
+ PUSH_FAILURE_POINT (pat - 2, str);
+ /* Fallthrough */
+
case on_failure_jump_loop:
case on_failure_jump:
case succeed_n:
p = pat + mcnt;
break;
+ case no_op:
+ /* A special frame used for nastyloops. */
+ goto fail;
+
default:
abort();
}
bytes; nonzero otherwise. */
static int
-bcmp_translate (s1, s2, len, translate)
- unsigned char *s1, *s2;
+bcmp_translate (s1, s2, len, translate, multibyte)
+ re_char *s1, *s2;
register int len;
RE_TRANSLATE_TYPE translate;
+ const int multibyte;
{
- register unsigned char *p1 = s1, *p2 = s2;
- unsigned char *p1_end = s1 + len;
- unsigned char *p2_end = s2 + len;
+ register re_char *p1 = s1, *p2 = s2;
+ re_char *p1_end = s1 + len;
+ re_char *p2_end = s2 + len;
while (p1 != p1_end && p2 != p2_end)
{
int p1_charlen, p2_charlen;
int p1_ch, p2_ch;
- /* FIXME: This assumes `multibyte = true'. */
- p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
- p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+ p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+ p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
if (RE_TRANSLATE (translate, p1_ch)
!= RE_TRANSLATE (translate, p2_ch))