0.12. (Implements POSIX draft P10003.2/D11.2, except for
internationalization features.)
- Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+ Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#undef _GNU_SOURCE
#define _GNU_SOURCE
+#ifdef emacs
+/* Converts the pointer to the char to BEG-based offset from the start. */
+#define PTR_TO_OFFSET(d) \
+ POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \
+ ? (d) - string1 : (d) - (string2 - size1))
+#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
+#else
+#define PTR_TO_OFFSET(d) 0
+#endif
+
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "lisp.h"
#include "buffer.h"
+
+/* Make syntax table lookup grant data in gl_state. */
+#define SYNTAX_ENTRY_VIA_PROPERTY
+
#include "syntax.h"
+#include "charset.h"
+#include "category.h"
#define malloc xmalloc
+#define realloc xrealloc
#define free xfree
#else /* not emacs */
#define SYNTAX(c) re_syntax_table[c]
+/* Dummy macros for non-Emacs environments. */
+#define BASE_LEADING_CODE_P(c) (0)
+#define WORD_BOUNDARY_P(c1, c2) (0)
+#define CHAR_HEAD_P(p) (1)
+#define SINGLE_BYTE_CHAR_P(c) (1)
+#define SAME_CHARSET_P(c1, c2) (1)
+#define MULTIBYTE_FORM_LENGTH(p, s) (1)
+#define STRING_CHAR(p, s) (*(p))
+#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
+#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
+ (c = ((p) == (end1) ? *(str2) : *(p)))
+#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
+ (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
#endif /* not emacs */
\f
/* Get the interface, including the syntax bits. */
syntaxspec,
/* Matches any character whose syntax is not that specified. */
- notsyntaxspec
+ notsyntaxspec,
+
+ /* Matches any character whose category-set contains the specified
+ category. The operator is followed by a byte which contains a
+ category code (mnemonic ASCII character). */
+ categoryspec,
+
+ /* Matches any character whose category-set does not contain the
+ specified category. The operator is followed by a byte which
+ contains the category code (mnemonic ASCII character). */
+ notcategoryspec
#endif /* emacs */
} re_opcode_t;
\f
#endif /* DEBUG */
\f
+/* Store a multibyte character in three contiguous bytes starting
+ at DESTINATION (least significant byte first), and increment
+ DESTINATION to the byte after where the character is stored.
+ Therefore, DESTINATION must be an lvalue. Both arguments are
+ evaluated more than once, so they must be free of side effects. */
+
+#define STORE_CHARACTER_AND_INCR(destination, character) \
+ do { \
+ (destination)[0] = (character) & 0377; \
+ (destination)[1] = ((character) >> 8) & 0377; \
+ (destination)[2] = (character) >> 16; \
+ (destination) += 3; \
+ } while (0)
+
+/* Put into DESTINATION a character stored in three contiguous bytes
+ starting at SOURCE. */
+
+#define EXTRACT_CHARACTER(destination, source) \
+ do { \
+ (destination) = ((source)[0] \
+ | ((source)[1] << 8) \
+ | ((source)[2] << 16)); \
+ } while (0)
+
+
+/* Macros for charset. */
+
+/* Size of bitmap of charset P in bytes. P is a start of charset,
+ i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
+#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
+
+/* Nonzero if charset P has range table. */
+#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
+
+/* Return the address of the range table of charset P. This is not
+ the start of the table entries themselves, but the position just
+ before them, where the number of ranges is stored. `2 +' skips
+ the re_opcode_t byte and the bitmap-size byte; the bitmap itself is
+ skipped by adding CHARSET_BITMAP_SIZE (P). */
+#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)])
+
+/* Test if C is listed in the bitmap of charset P. */
+#define CHARSET_LOOKUP_BITMAP(p, c) \
+ ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
+ && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
+
+/* Return the address of the end of RANGE_TABLE. COUNT is the number
+ of ranges (each of which is a pair (start, end)) in RANGE_TABLE.
+ `* 2' accounts for the start and the end of each range; `* 3' is
+ the size in bytes of each start and each end. */
+#define CHARSET_RANGE_TABLE_END(range_table, count) \
+ ((range_table) + (count) * 2 * 3)
+
+/* Test if C is in RANGE_TABLE. The flag NOT is negated if C lies
+ within one of the ranges; otherwise NOT is left unchanged.
+ COUNT is the number of ranges in RANGE_TABLE. Each range is stored
+ as two three-byte characters (start, then end, both inclusive) that
+ are read with EXTRACT_CHARACTER. The scan stops at the first range
+ that contains C, so NOT is negated at most once. */
+#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
+ do \
+ { \
+ int range_start, range_end; \
+ unsigned char *p; \
+ unsigned char *range_table_end \
+ = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
+ \
+ for (p = (range_table); p < range_table_end; p += 2 * 3) \
+ { \
+ EXTRACT_CHARACTER (range_start, p); \
+ EXTRACT_CHARACTER (range_end, p + 3); \
+ \
+ if (range_start <= (c) && (c) <= range_end) \
+ { \
+ (not) = !(not); \
+ break; \
+ } \
+ } \
+ } \
+ while (0)
+
+/* Test if C is in range table of CHARSET. The flag NOT is negated if
+ C is listed in it. */
+#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
+ do \
+ { \
+ /* Number of ranges in range table. */ \
+ int count; \
+ unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
+ \
+ EXTRACT_NUMBER_AND_INCR (count, range_table); \
+ CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
+ } \
+ while (0)
+\f
/* If DEBUG is defined, Regex prints many voluminous messages about what
it is doing (if the variable `debug' is nonzero). If linked with the
main program in `iregex.c', you can enter patterns and strings
REGEX_ALLOCATE_STACK. */
-/* Number of failure points for which to initially allocate space
+/* Approximate number of failure points for which to initially allocate space
when matching. If this number is exceeded, we allocate more
space, so it is not a hard limit. */
#ifndef INIT_FAILURE_ALLOC
-#define INIT_FAILURE_ALLOC 5
+#define INIT_FAILURE_ALLOC 20
#endif
/* Roughly the maximum number of failure points on the stack. Would be
- exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
+ exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
This is a variable only so users of regex can assign to it; we never
change it ourselves. */
#if defined (MATCH_MAY_ALLOCATE)
-/* 4400 was enough to cause a crash on Alpha OSF/1,
- whose default stack limit is 2mb. */
-int re_max_failures = 20000;
+/* Note that 4400 is enough to cause a crash on Alpha OSF/1,
+ whose default stack limit is 2mb. In order for a larger
+ value to work reliably, you have to try to make it accord
+ with the process stack limit. */
+int re_max_failures = 40000;
#else
-int re_max_failures = 2000;
+int re_max_failures = 4000;
#endif
union fail_stack_elt
#define INIT_FAIL_STACK() \
do { \
fail_stack.stack = (fail_stack_elt_t *) \
- REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
+ REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
+ * sizeof (fail_stack_elt_t)); \
\
if (fail_stack.stack == NULL) \
return -2; \
#endif
-/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
+/* Double the size of FAIL_STACK, up to a limit
+ which allows approximately `re_max_failures' items.
Return 1 if succeeds, and 0 if either ran out of memory
allocating space for it or it was already too large.
REGEX_REALLOCATE_STACK requires `destination' be declared. */
-#define DOUBLE_FAIL_STACK(fail_stack) \
- ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
+/* Factor to increase the failure stack size by
+ when we increase it.
+ This used to be 2, but 2 was too wasteful
+ because the old discarded stacks added up to as much space
+ as the ultimate, maximum-size stack. */
+#define FAIL_STACK_GROWTH_FACTOR 4
+
+#define GROW_FAIL_STACK(fail_stack) \
+ (((fail_stack).size * sizeof (fail_stack_elt_t) \
+ >= re_max_failures * TYPICAL_FAILURE_SIZE) \
? 0 \
- : ((fail_stack).stack = (fail_stack_elt_t *) \
+ : ((fail_stack).stack \
+ = (fail_stack_elt_t *) \
REGEX_REALLOCATE_STACK ((fail_stack).stack, \
(fail_stack).size * sizeof (fail_stack_elt_t), \
- ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
+ MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
+ ((fail_stack).size * sizeof (fail_stack_elt_t) \
+ * FAIL_STACK_GROWTH_FACTOR))), \
\
(fail_stack).stack == NULL \
? 0 \
- : ((fail_stack).size <<= 1, \
+ : ((fail_stack).size \
+ = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
+ ((fail_stack).size * sizeof (fail_stack_elt_t) \
+ * FAIL_STACK_GROWTH_FACTOR)) \
+ / sizeof (fail_stack_elt_t)), \
1)))
space to do so. */
#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
((FAIL_STACK_FULL () \
- && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
+ && !GROW_FAIL_STACK (FAIL_STACK)) \
? 0 \
: ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1))
if we ever fail back to it.
Requires variables fail_stack, regstart, regend, reg_info, and
- num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
+ num_regs be declared. GROW_FAIL_STACK requires `destination' be
declared.
Does `return FAILURE_CODE' if runs out of memory. */
/* Ensure we have enough space allocated for what we will push. */ \
while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
{ \
- if (!DOUBLE_FAIL_STACK (fail_stack)) \
+ if (!GROW_FAIL_STACK (fail_stack)) \
return failure_code; \
\
DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
#define NUM_NONREG_ITEMS 4
#endif
-/* We push at most this many items on the stack. */
-/* We used to use (num_regs - 1), which is the number of registers
- this regexp will save; but that was changed to 5
- to avoid stack overflow for a regexp with lots of parens. */
-#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
+/* Estimate the size of data pushed by a typical failure stack entry.
+ An estimate is all we need, because all we use this for
+ is to choose a limit for how big to make the failure stack. */
+
+#define TYPICAL_FAILURE_SIZE 20
-/* We actually push this many items. */
+/* This is how many items we actually use for a failure point.
+ It depends on the regexp. */
#define NUM_FAILURE_ITEMS \
(((0 \
? 0 : highest_active_reg - lowest_active_reg + 1) \
#define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
- if (translate) c = (unsigned char) translate[c]; \
+ if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
} while (0)
#endif
when we use a character as a subscript we must make it unsigned. */
#ifndef TRANSLATE
#define TRANSLATE(d) \
- (translate ? (char) translate[(unsigned char) (d)] : (d))
+ (RE_TRANSLATE_P (translate) \
+ ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
#endif
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
+/* Structure to manage work area for range table. */
+struct range_table_work_area
+{
+ int *table; /* actual work area. */
+ int allocated; /* allocated size for work area in bytes. */
+ int used; /* actually used size in words. */
+};
+
+/* Make sure that WORK_AREA can hold N more multibyte characters.
+ When the area is too small it is enlarged by room for 16 more ints
+ (allocated on first use, reallocated afterwards). If the allocation
+ yields a null pointer, leave the enclosing function through
+ FREE_STACK_RETURN (REG_ESPACE). */
+#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
+ do { \
+ if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
+ { \
+ (work_area).allocated += 16 * sizeof (int); \
+ if ((work_area).table) \
+ (work_area).table \
+ = (int *) realloc ((work_area).table, (work_area).allocated); \
+ else \
+ (work_area).table \
+ = (int *) malloc ((work_area).allocated); \
+ if ((work_area).table == 0) \
+ FREE_STACK_RETURN (REG_ESPACE); \
+ } \
+ } while (0)
+
+/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
+ do { \
+ EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
+ (work_area).table[(work_area).used++] = (range_start); \
+ (work_area).table[(work_area).used++] = (range_end); \
+ } while (0)
+
+/* Free allocated memory for WORK_AREA. */
+#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
+ do { \
+ if ((work_area).table) \
+ free ((work_area).table); \
+ } while (0)
+
+#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0)
+#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
+#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
+
+
/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) \
(b[((unsigned char) (c)) / BYTEWIDTH] \
/* Return, freeing storage we allocated. */
#define FREE_STACK_RETURN(value) \
- return (free (compile_stack.stack), value)
+ do { \
+ FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
+ free (compile_stack.stack); \
+ return value; \
+ } while (0)
static reg_errcode_t
regex_compile (pattern, size, syntax, bufp)
/* We fetch characters from PATTERN here. Even though PATTERN is
`char *' (i.e., signed), we declare these variables as unsigned, so
they can be reliably used as array indices. */
- register unsigned char c, c1;
+ register unsigned int c, c1;
/* A random temporary spot in PATTERN. */
const char *p1;
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
+#ifdef AIX
+ /* `const' makes AIX compiler fail. */
+ char *p = pattern;
+#else
const char *p = pattern;
+#endif
const char *pend = pattern + size;
/* How to translate the characters in the pattern. */
number is put in the stop_memory as the start_memory. */
regnum_t regnum = 0;
+ /* Work area for range table of charset. */
+ struct range_table_work_area range_table_work;
+
#ifdef DEBUG
DEBUG_PRINT1 ("\nCompiling pattern: ");
if (debug)
compile_stack.size = INIT_COMPILE_STACK_SIZE;
compile_stack.avail = 0;
+ range_table_work.table = 0;
+ range_table_work.allocated = 0;
+
/* Initialize the pattern buffer. */
bufp->syntax = syntax;
bufp->fastmap_accurate = 0;
/* Always count groups, whether or not bufp->no_sub is set. */
bufp->re_nsub = 0;
+#ifdef emacs
+ /* bufp->multibyte is set before regex_compile is called, so don't alter
+ it. */
+#else /* not emacs */
+ /* Nothing is recognized as a multibyte character. */
+ bufp->multibyte = 0;
+#endif
+
#if !defined (emacs) && !defined (SYNTAX_TABLE)
/* Initialize the syntax table. */
init_syntax_once ();
incremented `p', by the way, to be the character after
the `*'. Do we have to do something analogous here
for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
+ if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
&& zero_times_ok
- && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
+ && p < pend
+ && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
&& !(syntax & RE_DOT_NEWLINE))
{ /* We have .*\n. */
STORE_JUMP (jump, b, laststart);
case '[':
{
- boolean had_char_class = false;
+ CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
/* Read in characters and ranges, setting map bits. */
for (;;)
{
+ int len;
+ boolean escaped_char = false;
+
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
PATFETCH (c);
{
if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
- PATFETCH (c1);
- SET_LIST_BIT (c1);
- continue;
+ PATFETCH (c);
+ escaped_char = true;
}
-
- /* Could be the end of the bracket expression. If it's
- not (i.e., when the bracket expression is `[]' so
- far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
- break;
-
- /* Look ahead to see if it's a range when the last thing
- was a character class. */
- if (had_char_class && c == '-' && *p != ']')
- FREE_STACK_RETURN (REG_ERANGE);
-
- /* Look ahead to see if it's a range when the last thing
- was a character: if this is a hyphen not at the
- beginning or the end of a list, then it's the range
- operator. */
- if (c == '-'
- && !(p - 2 >= pattern && p[-2] == '[')
- && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
- && *p != ']')
+ else
{
- reg_errcode_t ret
- = compile_range (&p, pend, translate, syntax, b);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+ if (c == ']' && p != p1 + 1)
+ break;
}
- else if (p[0] == '-' && p[1] != ']')
- { /* This handles ranges made up of characters only. */
- reg_errcode_t ret;
-
- /* Move past the `-'. */
- PATFETCH (c1);
-
- ret = compile_range (&p, pend, translate, syntax, b);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ /* If C indicates start of multibyte char, get the
+ actual character code in C, and set the pattern
+ pointer P to the next character boundary. */
+ if (bufp->multibyte && BASE_LEADING_CODE_P (c))
+ {
+ PATUNFETCH;
+ c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
+ p += len;
}
+ /* What should we do for the character which is
+ greater than 0x7F, but not BASE_LEADING_CODE_P?
+ XXX */
/* See if we're at the beginning of a possible character
class. */
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
- { /* Leave room for the null. */
+ else if (!escaped_char &&
+ syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+ {
+ /* Leave room for the null. */
char str[CHAR_CLASS_MAX_LENGTH + 1];
PATFETCH (c);
}
str[c1] = '\0';
- /* If isn't a word bracketed by `[:' and:`]':
- undo the ending character, the letters, and leave
- the leading `:' and `[' (but set bits for them). */
+ /* If isn't a word bracketed by `[:' and `:]':
+ undo the ending character, the letters, and
+ leave the leading `:' and `[' (but set bits for
+ them). */
if (c == ':' && *p == ']')
{
int ch;
|| (is_xdigit && ISXDIGIT (ch)))
SET_LIST_BIT (translated);
}
- had_char_class = true;
+
+ /* Repeat the loop. */
+ continue;
}
else
{
while (c1--)
PATUNFETCH;
SET_LIST_BIT ('[');
- SET_LIST_BIT (':');
- had_char_class = false;
+
+ /* Because the `:' may start a range, we
+ can't simply set its bit and repeat the loop.
+ Instead, just store it in C and handle it below. */
+ c = ':';
+ }
+ }
+
+ if (p < pend && p[0] == '-' && p[1] != ']')
+ {
+
+ /* Discard the `-'. */
+ PATFETCH (c1);
+
+ /* Fetch the character which ends the range. */
+ PATFETCH (c1);
+ if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
+ {
+ PATUNFETCH;
+ c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
+ p += len;
+ }
+
+ if (SINGLE_BYTE_CHAR_P (c)
+ && ! SINGLE_BYTE_CHAR_P (c1))
+ {
+ /* Handle a range such as \177-\377 in multibyte mode.
+ Split that into two ranges,
+ the low one ending at 0237, and the high one
+ starting at ...040. The high part (C1_BASE .. C1)
+ is recorded in the range table here; the low part
+ (C .. 0237) is left in C and C1 so that the code
+ below sets it in the bitmap. */
+ int c1_base = (c1 & ~0177) | 040;
+ SET_RANGE_TABLE_WORK_AREA (range_table_work, c1_base, c1);
+ c1 = 0237;
+ }
+ else if (!SAME_CHARSET_P (c, c1))
+ FREE_STACK_RETURN (REG_ERANGE);
}
else
+ /* Range from C to C. */
+ c1 = c;
+
+ /* Set the range ... */
+ if (SINGLE_BYTE_CHAR_P (c))
+ /* ... into bitmap. */
{
- had_char_class = false;
- SET_LIST_BIT (c);
+ unsigned this_char;
+ int range_start = c, range_end = c1;
+
+ /* If the start is after the end, the range is empty. */
+ if (range_start > range_end)
+ {
+ if (syntax & RE_NO_EMPTY_RANGES)
+ FREE_STACK_RETURN (REG_ERANGE);
+ /* Else, repeat the loop. */
+ }
+ else
+ {
+ for (this_char = range_start; this_char <= range_end;
+ this_char++)
+ SET_LIST_BIT (TRANSLATE (this_char));
+ }
}
+ else
+ /* ... into range table. */
+ SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
}
/* Discard any (non)matching list bytes that are all 0 at the
while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
b[-1]--;
b += b[-1];
+
+ /* Build real range table from work area. */
+ if (RANGE_TABLE_WORK_USED (range_table_work))
+ {
+ int i;
+ int used = RANGE_TABLE_WORK_USED (range_table_work);
+
+ /* Allocate space for COUNT + RANGE_TABLE. Needs two
+ bytes for COUNT and three bytes for each character. */
+ GET_BUFFER_SPACE (2 + used * 3);
+
+ /* Indicate the existence of range table. */
+ laststart[1] |= 0x80;
+
+ STORE_NUMBER_AND_INCR (b, used / 2);
+ for (i = 0; i < used; i++)
+ STORE_CHARACTER_AND_INCR
+ (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
+ }
}
break;
PATFETCH (c);
BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
break;
+
+ case 'c':
+ laststart = b;
+ PATFETCH_RAW (c);
+ BUF_PUSH_2 (categoryspec, c);
+ break;
+
+ case 'C':
+ laststart = b;
+ PATFETCH_RAW (c);
+ BUF_PUSH_2 (notcategoryspec, c);
+ break;
#endif /* emacs */
default:
/* Expects the character in `c'. */
normal_char:
+ p1 = p - 1; /* P1 points the head of C. */
+#ifdef emacs
+ if (bufp->multibyte)
+ /* Set P to the next character boundary. */
+ p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
+#endif
/* If no exactn currently being built. */
if (!pending_exact
|| pending_exact + *pending_exact + 1 != b
/* We have only one byte following the exactn for the count. */
- || *pending_exact == (1 << BYTEWIDTH) - 1
+ || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
/* If followed by a repetition operator. */
- || *p == '*' || *p == '^'
+ || (p != pend && (*p == '*' || *p == '^'))
|| ((syntax & RE_BK_PLUS_QM)
- ? *p == '\\' && (p[1] == '+' || p[1] == '?')
- : (*p == '+' || *p == '?'))
+ ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
+ : p != pend && (*p == '+' || *p == '?'))
|| ((syntax & RE_INTERVALS)
&& ((syntax & RE_NO_BK_BRACES)
- ? *p == '{'
- : (p[0] == '\\' && p[1] == '{'))))
+ ? p != pend && *p == '{'
+ : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
{
/* Start building a new exactn. */
pending_exact = b - 1;
}
- BUF_PUSH (c);
- (*pending_exact)++;
+ /* Here, C may have been translated, so C may not be equal to *P1. */
+ while (1)
+ {
+ BUF_PUSH (c);
+ (*pending_exact)++;
+ if (++p1 == p)
+ break;
+
+ /* Rest of multibyte form should be copied literally. */
+ c = *(unsigned char *)p1;
+ }
break;
} /* switch (c) */
} /* while p != pend */
{
int num_regs = bufp->re_nsub + 1;
- /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
- is strictly greater than re_max_failures, the largest possible stack
- is 2 * re_max_failures failure points. */
- if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
+ if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
{
- fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
+ fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
#ifdef emacs
if (! fail_stack.stack)
return false;
}
-
-
-/* Read the ending character of a range (in a bracket expression) from the
- uncompiled pattern *P_PTR (which ends at PEND). We assume the
- starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
- Then we set the translation of all bits between the starting and
- ending characters (inclusive) in the compiled pattern B.
-
- Return an error code.
-
- We use these short variable names so we can use the same macros as
- `regex_compile' itself. */
-
-static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
-{
- unsigned this_char;
-
- const char *p = *p_ptr;
- int range_start, range_end;
-
- if (p == pend)
- return REG_ERANGE;
-
- /* Even though the pattern is a signed `char *', we need to fetch
- with unsigned char *'s; if the high bit of the pattern character
- is set, the range endpoints will be negative if we fetch using a
- signed char *.
-
- We also want to fetch the endpoints without translating them; the
- appropriate translation is done in the bit-setting loop below. */
- /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
- range_start = ((const unsigned char *) p)[-2];
- range_end = ((const unsigned char *) p)[0];
-
- /* Have to increment the pointer into the pattern string, so the
- caller isn't still at the ending character. */
- (*p_ptr)++;
-
- /* If the start is after the end, the range is empty. */
- if (range_start > range_end)
- return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-
- /* Here we see why `this_char' has to be larger than an `unsigned
- char' -- the range is inclusive, so if `range_end' == 0xff
- (assuming 8-bit characters), we would otherwise go into an infinite
- loop, since all characters <= 0xff. */
- for (this_char = range_start; this_char <= range_end; this_char++)
- {
- SET_LIST_BIT (TRANSLATE (this_char));
- }
-
- return REG_NOERROR;
-}
\f
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
{
- int j, k;
+ int i, j, k;
#ifdef MATCH_MAY_ALLOCATE
fail_stack_type fail_stack;
#endif
/* We aren't doing a `succeed_n' to begin with. */
boolean succeed_n_p = false;
+ /* If all elements for base leading-codes in fastmap are set, this
+ flag is set true. */
+ boolean match_any_multibyte_characters = false;
+
+ /* Maximum code of simple (single byte) character. */
+ int simple_char_max;
+
assert (fastmap != NULL && p != NULL);
INIT_FAIL_STACK ();
break;
+#ifndef emacs
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
if (SYNTAX (j) != Sword)
fastmap[j] = 1;
break;
+#else /* emacs */
+ case charset:
+ for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
+ j >= 0; j--)
+ if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
+ fastmap[j] = 1;
+
+ if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && match_any_multibyte_characters == false)
+ {
+ /* Set fastmap[I] 1 where I is a base leading code of each
+ multibyte character in the range table. */
+ int c, count;
+
+ /* Make P point to the range table. */
+ p += CHARSET_BITMAP_SIZE (&p[-2]);
+
+ /* Extract the number of ranges in range table into
+ COUNT. */
+ EXTRACT_NUMBER_AND_INCR (count, p);
+ for (; count > 0; count--, p += 2 * 3) /* XXX */
+ {
+ /* Extract the start of each range. */
+ EXTRACT_CHARACTER (c, p);
+ j = CHAR_CHARSET (c);
+ fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
+ }
+ }
+ break;
+
+
+ case charset_not:
+ /* Chars beyond end of map must be allowed. End of map is
+ `127' if bufp->multibyte is nonzero. */
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+ j < simple_char_max; j++)
+ fastmap[j] = 1;
+
+ for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
+ j >= 0; j--)
+ if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
+ fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ which doesn't match the specified set of characters. */
+ {
+ set_fastmap_for_multibyte_characters:
+ if (match_any_multibyte_characters == false)
+ {
+ for (j = 0x80; j < 0xA0; j++) /* XXX */
+ if (BASE_LEADING_CODE_P (j))
+ fastmap[j] = 1;
+ match_any_multibyte_characters = true;
+ }
+ }
+ break;
+
+
+ case wordchar:
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
+ if (SYNTAX (j) == Sword)
+ fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose syntax is `Sword'. */
+ goto set_fastmap_for_multibyte_characters;
+ break;
+ case notwordchar:
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
+ if (SYNTAX (j) != Sword)
+ fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose syntax is not `Sword'. */
+ goto set_fastmap_for_multibyte_characters;
+ break;
+#endif
+
case anychar:
{
int fastmap_newline = fastmap['\n'];
- /* `.' matches anything ... */
- for (j = 0; j < (1 << BYTEWIDTH); j++)
+ /* `.' matches anything (but if bufp->multibyte is
+ nonzero, matches `\000' .. `\177' and possible multibyte
+ characters) ... */
+ if (bufp->multibyte)
+ {
+ simple_char_max = 0x80;
+
+ for (j = 0x80; j < 0xA0; j++)
+ if (BASE_LEADING_CODE_P (j))
+ fastmap[j] = 1;
+ match_any_multibyte_characters = true;
+ }
+ else
+ simple_char_max = (1 << BYTEWIDTH);
+
+ for (j = 0; j < simple_char_max; j++)
fastmap[j] = 1;
/* ... except perhaps newline. */
}
#ifdef emacs
+ case wordbound:
+ case notwordbound:
+ case wordbeg:
+ case wordend:
+ case notsyntaxspec:
case syntaxspec:
+ /* This match depends on text properties. These cases end by
+ aborting the optimization. */
+ bufp->can_be_null = 1;
+ goto done;
+#if 0
k = *p++;
- for (j = 0; j < (1 << BYTEWIDTH); j++)
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
if (SYNTAX (j) == (enum syntaxcode) k)
fastmap[j] = 1;
- break;
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose syntax is K. */
+ goto set_fastmap_for_multibyte_characters;
+ break;
case notsyntaxspec:
k = *p++;
- for (j = 0; j < (1 << BYTEWIDTH); j++)
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
if (SYNTAX (j) != (enum syntaxcode) k)
fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose syntax is not K. */
+ goto set_fastmap_for_multibyte_characters;
+ break;
+#endif
+
+
+ case categoryspec:
+ k = *p++;
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
+ if (CHAR_HAS_CATEGORY (j, k))
+ fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose category is K. */
+ goto set_fastmap_for_multibyte_characters;
break;
+ case notcategoryspec:
+ k = *p++;
+ simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
+ for (j = 0; j < simple_char_max; j++)
+ if (!CHAR_HAS_CATEGORY (j, k))
+ fastmap[j] = 1;
+
+ if (bufp->multibyte)
+ /* Any character set can possibly contain a character
+ whose category is not K. */
+ goto set_fastmap_for_multibyte_characters;
+ break;
+
/* All cases after this match the empty string. These end with
`continue'. */
case endline:
case begbuf:
case endbuf:
+#ifndef emacs
case wordbound:
case notwordbound:
case wordbeg:
case wordend:
+#endif
case push_dummy_failure:
continue;
regs, size);
}
+/* End address of whichever string of the virtual concatenation contains position P. */
+#define STOP_ADDR_VSTRING(P) \
+ (((P) >= size1 ? string2 + size2 : string1 + size1))
+
+/* Address of POS in the virtual concatenation of the strings. */
+#define POS_ADDR_VSTRING(POS) \
+ (((POS) >= size1 ? string2 - size1 : string1) + (POS))
/* Using the compiled pattern in BUFP->buffer, first tries to match the
virtual concatenation of STRING1 and STRING2, starting first at index
int endpos = startpos + range;
int anchored_start = 0;
+ /* Nonzero if we have to be concerned with multibyte characters. */
+ int multibyte = bufp->multibyte;
+
/* Check for out-of-range STARTPOS. */
if (startpos < 0 || startpos > total_size)
return -1;
range = total_size - startpos;
/* If the search isn't to be a backwards one, don't waste time in a
- search for a pattern that must be anchored. */
+ search for a pattern anchored at beginning of buffer. */
if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
{
if (startpos > 0)
return -1;
else
- range = 1;
+ range = 0;
}
#ifdef emacs
don't keep searching past point. */
if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
{
- range = PT - startpos;
- if (range <= 0)
+ range = PT_BYTE - BEGV_BYTE - startpos;
+ if (range < 0)
return -1;
}
#endif /* emacs */
if (bufp->buffer[0] == begline)
anchored_start = 1;
+#ifdef emacs
+ gl_state.object = re_match_object;
+ {
+ int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos);
+
+ SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
+ }
+#endif
+
/* Loop through the string, looking for a place to start matching. */
for (;;)
{
the first null string. */
if (fastmap && startpos < total_size && !bufp->can_be_null)
{
+ register const char *d;
+ register unsigned int buf_ch;
+
+ d = POS_ADDR_VSTRING (startpos);
+
if (range > 0) /* Searching forwards. */
{
- register const char *d;
register int lim = 0;
int irange = range;
if (startpos < size1 && startpos + range >= size1)
lim = range - (size1 - startpos);
- d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
-
/* Written out as an if-else to avoid testing `translate'
inside the loop. */
- if (translate)
- while (range > lim
- && !fastmap[(unsigned char)
- translate[(unsigned char) *d++]])
- range--;
+ if (RE_TRANSLATE_P (translate))
+ {
+ if (multibyte)
+ while (range > lim)
+ {
+ int buf_charlen;
+
+ buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+ buf_charlen);
+
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
+ if (buf_ch >= 0400
+ || fastmap[buf_ch])
+ break;
+
+ range -= buf_charlen;
+ d += buf_charlen;
+ }
+ else
+ while (range > lim
+ && !fastmap[(unsigned char)
+ RE_TRANSLATE (translate, (unsigned char) *d)])
+ {
+ d++;
+ range--;
+ }
+ }
else
- while (range > lim && !fastmap[(unsigned char) *d++])
- range--;
+ while (range > lim && !fastmap[(unsigned char) *d])
+ {
+ d++;
+ range--;
+ }
startpos += irange - range;
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ int room = (size1 == 0 || startpos >= size1
+ ? size2 + size1 - startpos
+ : size1 - startpos);
+
+ buf_ch = STRING_CHAR (d, room);
+ if (RE_TRANSLATE_P (translate))
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
- if (!fastmap[(unsigned char) TRANSLATE (c)])
+ if (! (buf_ch >= 0400
+ || fastmap[buf_ch]))
goto advance;
}
}
break;
else if (range > 0)
{
- range--;
- startpos++;
+ /* Update STARTPOS to the next character boundary. */
+ if (multibyte)
+ {
+ const unsigned char *p
+ = (const unsigned char *) POS_ADDR_VSTRING (startpos);
+ const unsigned char *pend
+ = (const unsigned char *) STOP_ADDR_VSTRING (startpos);
+ int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
+
+ range -= len;
+ if (range < 0)
+ break;
+ startpos += len;
+ }
+ else
+ {
+ range--;
+ startpos++;
+ }
}
else
{
range++;
startpos--;
+
+ /* Update STARTPOS to the previous character boundary. */
+ if (multibyte)
+ {
+ const unsigned char *p
+ = (const unsigned char *) POS_ADDR_VSTRING (startpos);
+ int len = 0;
+
+ /* Find the head of multibyte form. */
+ while (!CHAR_HEAD_P (*p))
+ p--, len++;
+
+ /* Adjust it. */
+#if 0 /* XXX */
+ if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
+ ;
+ else
+#endif
+ {
+ range += len;
+ if (range > 0)
+ break;
+
+ startpos -= len;
+ }
+ }
}
}
return -1;
== Sword)
/* Disabled due to a compiler bug -- see comment at case wordbound */
+
+/* The comment at case wordbound is the following one, but we no longer
+ use AT_WORD_BOUNDARY, in order to support multibyte forms.
+
+ The DEC Alpha C compiler 3.x generates incorrect code for the
+ test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
+ AT_WORD_BOUNDARY, so this code is disabled. Expanding the
+ macro and introducing temporary variables works around the bug. */
+
#if 0
/* Test if the character before D and the one at D differ with respect
to being word-constituent. */
}
#endif /* not emacs */
+#ifdef emacs
+/* In Emacs, this is the string or buffer in which we are matching.
+ It is copied into gl_state.object and used for looking up syntax properties. */
+Lisp_Object re_match_object;
+#endif
/* re_match_2 matches the compiled pattern in BUFP against the
the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
struct re_registers *regs;
int stop;
{
- int result = re_match_2_internal (bufp, string1, size1, string2, size2,
- pos, regs, stop);
+ int result;
+
+#ifdef emacs
+ int charpos;
+ int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
+ gl_state.object = re_match_object;
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos);
+ SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
+#endif
+
+ result = re_match_2_internal (bufp, string1, size1, string2, size2,
+ pos, regs, stop);
alloca (0);
return result;
}
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
+ /* Nonzero if we have to handle multibyte characters. */
+ int multibyte = bufp->multibyte;
+
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
restart, regend, and reg_info for all registers corresponding to
/* This is written out as an if-else so we don't waste time
testing `translate' inside the loop. */
- if (translate)
+ if (RE_TRANSLATE_P (translate))
{
- do
- {
- PREFETCH ();
- if ((unsigned char) translate[(unsigned char) *d++]
- != (unsigned char) *p++)
- goto fail;
- }
- while (--mcnt);
+#ifdef emacs
+ if (multibyte)
+ do
+ {
+ int pat_charlen, buf_charlen;
+ unsigned int pat_ch, buf_ch;
+
+ PREFETCH ();
+ pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+
+ if (RE_TRANSLATE (translate, buf_ch)
+ != pat_ch)
+ goto fail;
+
+ p += pat_charlen;
+ d += buf_charlen;
+ mcnt -= pat_charlen;
+ }
+ while (mcnt > 0);
+ else
+#endif /* not emacs */
+ do
+ {
+ PREFETCH ();
+ if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d)
+ != (unsigned char) *p++)
+ goto fail;
+ d++;
+ }
+ while (--mcnt);
}
else
{
/* Match any character except possibly a newline or a null. */
case anychar:
- DEBUG_PRINT1 ("EXECUTING anychar.\n");
+ {
+ int buf_charlen;
+ unsigned int buf_ch;
- PREFETCH ();
+ DEBUG_PRINT1 ("EXECUTING anychar.\n");
- if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
- || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
- goto fail;
+ PREFETCH ();
- SET_REGS_MATCHED ();
- DEBUG_PRINT2 (" Matched `%d'.\n", *d);
- d++;
+#ifdef emacs
+ if (multibyte)
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+ else
+#endif /* not emacs */
+ {
+ buf_ch = (unsigned char) *d;
+ buf_charlen = 1;
+ }
+
+ buf_ch = TRANSLATE (buf_ch);
+
+ if ((!(bufp->syntax & RE_DOT_NEWLINE)
+ && buf_ch == '\n')
+ || ((bufp->syntax & RE_DOT_NOT_NULL)
+ && buf_ch == '\000'))
+ goto fail;
+
+ SET_REGS_MATCHED ();
+ DEBUG_PRINT2 (" Matched `%d'.\n", *d);
+ d += buf_charlen;
+ }
break;
case charset:
case charset_not:
{
- register unsigned char c;
+ register unsigned int c;
boolean not = (re_opcode_t) *(p - 1) == charset_not;
+ int len;
+
+ /* Start of actual range_table, or end of bitmap if there is no
+ range table. */
+ unsigned char *range_table;
+
+ /* Nonzero if there is range table. */
+ int range_table_exists;
+
+ /* Number of ranges of range table. Not in bytes. */
+ int count;
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
PREFETCH ();
- c = TRANSLATE (*d); /* The character to match. */
+ c = (unsigned char) *d;
+
+ range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
+ range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
+ if (range_table_exists)
+ EXTRACT_NUMBER_AND_INCR (count, range_table);
+ else
+ count = 0;
+
+ if (multibyte && BASE_LEADING_CODE_P (c))
+ c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
- /* Cast to `unsigned' instead of `unsigned char' in case the
- bit list is a full 32 bytes long. */
- if (c < (unsigned) (*p * BYTEWIDTH)
+ if (SINGLE_BYTE_CHAR_P (c))
+ { /* Lookup bitmap. */
+ c = TRANSLATE (c); /* The character to match. */
+ len = 1;
+
+ /* Cast to `unsigned' instead of `unsigned char' in
+ case the bit list is a full 32 bytes long. */
+ if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
+ }
+ else if (range_table_exists)
+ CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
- p += 1 + *p;
+ p = CHARSET_RANGE_TABLE_END (range_table, count);
if (!not) goto fail;
SET_REGS_MATCHED ();
- d++;
+ d += len;
break;
}
/* Compare that many; failure if mismatch, else move
past them. */
- if (translate
+ if (RE_TRANSLATE_P (translate)
? bcmp_translate (d, d2, mcnt, translate)
: bcmp (d, d2, mcnt))
goto fail;
else if ((re_opcode_t) *p2 == exactn
|| (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
{
- register unsigned char c
+ register unsigned int c
= *p2 == (unsigned char) endline ? '\n' : p2[2];
- if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
+ if ((re_opcode_t) p1[3] == exactn)
+ {
+ if (!(multibyte /* && (c != '\n') */
+ && BASE_LEADING_CODE_P (c))
+ ? c != p1[5]
+ : (STRING_CHAR (&p2[2], pend - &p2[2])
+ != STRING_CHAR (&p1[5], pend - &p1[5])))
{
p[-3] = (unsigned char) pop_failure_jump;
DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
c, p1[5]);
}
+ }
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
int not = (re_opcode_t) p1[3] == charset_not;
- if (c < (unsigned char) (p1[4] * BYTEWIDTH)
+ if (multibyte /* && (c != '\n') */
+ && BASE_LEADING_CODE_P (c))
+ c = STRING_CHAR (&p2[2], pend - &p2[2]);
+
+ /* Test if C is listed in charset (or charset_not)
+ at `&p1[3]'. */
+ if (SINGLE_BYTE_CHAR_P (c))
+ {
+ if (c < CHARSET_BITMAP_SIZE (&p1[3]) * BYTEWIDTH
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
+ }
+ else if (CHARSET_RANGE_TABLE_EXISTS_P (&p1[3]))
+ CHARSET_LOOKUP_RANGE_TABLE (not, c, &p1[3]);
/* `not' is equal to 1 if c would match, which means
that we can't change to pop_failure_jump. */
}
else if ((re_opcode_t) *p2 == charset)
{
-#ifdef DEBUG
- register unsigned char c
- = *p2 == (unsigned char) endline ? '\n' : p2[2];
-#endif
+ if ((re_opcode_t) p1[3] == exactn)
+ {
+ register unsigned int c = p1[5];
+ int not = 0;
+
+ if (multibyte && BASE_LEADING_CODE_P (c))
+ c = STRING_CHAR (&p1[5], pend - &p1[5]);
- if ((re_opcode_t) p1[3] == exactn
- && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
- && (p2[2 + p1[5] / BYTEWIDTH]
- & (1 << (p1[5] % BYTEWIDTH)))))
+ /* Test if C is listed in charset at `p2'. */
+ if (SINGLE_BYTE_CHAR_P (c))
+ {
+ if (c < CHARSET_BITMAP_SIZE (p2) * BYTEWIDTH
+ && (p2[2 + c / BYTEWIDTH]
+ & (1 << (c % BYTEWIDTH))))
+ not = !not;
+ }
+ else if (CHARSET_RANGE_TABLE_EXISTS_P (p2))
+ CHARSET_LOOKUP_RANGE_TABLE (not, c, p2);
+
+ if (!not)
{
p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
- c, p1[5]);
+ DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
+ }
}
- else if ((re_opcode_t) p1[3] == charset_not)
+ /* It is hard to enumerate all the characters in charset
+ P2 if it includes multibyte characters. Give up in
+ that case. */
+ else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
+ {
+ /* Now we are sure that P2 has no range table.
+ So `p2[1]' is enough to get the size of the bitmap
+ in P2. But P1 may have a range table, so the
+ size of the bitmap table of P1 is extracted with
+ the macro `CHARSET_BITMAP_SIZE'.
+
+ Since we know that all the characters listed in
+ P2 are ASCII, it is enough to test only the bitmap
+ table of P1. */
+
+ if ((re_opcode_t) p1[3] == charset_not)
{
int idx;
- /* We win if the charset_not inside the loop
- lists every character listed in the charset after. */
+ /* We win if the charset_not inside the loop lists
+ every character listed in the charset after. */
for (idx = 0; idx < (int) p2[1]; idx++)
if (! (p2[2 + idx] == 0
- || (idx < (int) p1[4]
+ || (idx < CHARSET_BITMAP_SIZE (&p1[3])
&& ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
break;
/* We win if the charset inside the loop
has no overlap with the one after the loop. */
for (idx = 0;
- idx < (int) p2[1] && idx < (int) p1[4];
+ (idx < (int) p2[1]
+ && idx < CHARSET_BITMAP_SIZE (&p1[3]));
idx++)
if ((p2[2 + idx] & p1[5 + idx]) != 0)
break;
- if (idx == p2[1] || idx == p1[4])
+ if (idx == p2[1]
+ || idx == CHARSET_BITMAP_SIZE (&p1[3]))
{
p[-3] = (unsigned char) pop_failure_jump;
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
}
+ }
p -= 2; /* Point at relative address again. */
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
break;
}
-#if 0
- /* The DEC Alpha C compiler 3.x generates incorrect code for the
- test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
- AT_WORD_BOUNDARY, so this code is disabled. Expanding the
- macro and introducing temporary variables works around the bug. */
-
case wordbound:
DEBUG_PRINT1 ("EXECUTING wordbound.\n");
- if (AT_WORD_BOUNDARY (d))
- break;
- goto fail;
- case notwordbound:
- DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
- if (AT_WORD_BOUNDARY (d))
- goto fail;
- break;
-#else
- case wordbound:
- {
- boolean prevchar, thischar;
+ /* We SUCCEED in one of the following cases: */
- DEBUG_PRINT1 ("EXECUTING wordbound.\n");
+ /* Case 1: D is at the beginning or the end of string. */
if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
break;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ int c1, c2, s1, s2;
+ int pos1 = PTR_TO_OFFSET (d - 1);
+ int charpos;
+
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
+#ifdef emacs
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
+#endif
+ s1 = SYNTAX (c1);
+#ifdef emacs
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+#endif
+ s2 = SYNTAX (c2);
- prevchar = WORDCHAR_P (d - 1);
- thischar = WORDCHAR_P (d);
- if (prevchar != thischar)
+ if (/* Case 2: Only one of S1 and S2 is Sword. */
+ ((s1 == Sword) != (s2 == Sword))
+ /* Case 3: Both of S1 and S2 are Sword, and macro
+ WORD_BOUNDARY_P (C1, C2) returns nonzero. */
+ || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
break;
- goto fail;
}
+ goto fail;
case notwordbound:
- {
- boolean prevchar, thischar;
-
DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
+
+ /* We FAIL in one of the following cases: */
+
+ /* Case 1: D is at the beginning or the end of string. */
if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
goto fail;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ int c1, c2, s1, s2;
+ int pos1 = PTR_TO_OFFSET (d - 1);
+ int charpos;
+
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
+#ifdef emacs
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
+#endif
+ s1 = SYNTAX (c1);
+#ifdef emacs
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+#endif
+ s2 = SYNTAX (c2);
- prevchar = WORDCHAR_P (d - 1);
- thischar = WORDCHAR_P (d);
- if (prevchar != thischar)
+ if (/* Case 2: Only one of S1 and S2 is Sword. */
+ ((s1 == Sword) != (s2 == Sword))
+ /* Case 3: Both of S1 and S2 are Sword, and macro
+ WORD_BOUNDARY_P (C1, C2) returns nonzero. */
+ || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
goto fail;
- break;
}
-#endif
+ break;
case wordbeg:
DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
- if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
- break;
+
+ /* We FAIL in one of the following cases: */
+
+ /* Case 1: D is at the end of string. */
+ if (AT_STRINGS_END (d))
goto fail;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ int c1, c2, s1, s2;
+ int pos1 = PTR_TO_OFFSET (d);
+ int charpos;
+
+ GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
+#ifdef emacs
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
+ UPDATE_SYNTAX_TABLE (charpos);
+#endif
+ s2 = SYNTAX (c2);
+
+ /* Case 2: S2 is not Sword. */
+ if (s2 != Sword)
+ goto fail;
+
+ /* Case 3: D is not at the beginning of string ... */
+ if (!AT_STRINGS_BEG (d))
+ {
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+#ifdef emacs
+ UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+#endif
+ s1 = SYNTAX (c1);
+
+ /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
+ returns 0. */
+ if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
+ goto fail;
+ }
+ }
+ break;
case wordend:
DEBUG_PRINT1 ("EXECUTING wordend.\n");
- if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
- && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
- break;
+
+ /* We FAIL in one of the following cases: */
+
+ /* Case 1: D is at the beginning of string. */
+ if (AT_STRINGS_BEG (d))
+ goto fail;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ int c1, c2, s1, s2;
+ int pos1 = PTR_TO_OFFSET (d);
+ int charpos;
+
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+#ifdef emacs
+ charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1);
+ UPDATE_SYNTAX_TABLE (charpos);
+#endif
+ s1 = SYNTAX (c1);
+
+ /* Case 2: S1 is not Sword. */
+ if (s1 != Sword)
+ goto fail;
+
+ /* Case 3: D is not at the end of string ... */
+ if (!AT_STRINGS_END (d))
+ {
+ GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
+#ifdef emacs
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+#endif
+ s2 = SYNTAX (c2);
+
+ /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
+ returns 0. */
+ if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
goto fail;
+ }
+ }
+ break;
#ifdef emacs
case before_dot:
DEBUG_PRINT1 ("EXECUTING before_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) >= PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE)
goto fail;
break;
case at_dot:
DEBUG_PRINT1 ("EXECUTING at_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) != PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE)
goto fail;
break;
case after_dot:
DEBUG_PRINT1 ("EXECUTING after_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) <= PT)
+ if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE)
goto fail;
break;
mcnt = (int) Sword;
matchsyntax:
PREFETCH ();
- /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
- d++;
- if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
+#ifdef emacs
+ {
+ int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
+ UPDATE_SYNTAX_TABLE (pos1);
+ }
+#endif
+ {
+ int c, len;
+
+ if (multibyte)
+ /* we must take the multibyte form into account, ... */
+ c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
+ else
+ /* everything should be handled as ASCII, even though it
+ looks like multibyte form. */
+ c = *d, len = 1;
+
+ if (SYNTAX (c) != (enum syntaxcode) mcnt)
goto fail;
+ d += len;
+ }
SET_REGS_MATCHED ();
break;
mcnt = (int) Sword;
matchnotsyntax:
PREFETCH ();
- /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
- d++;
- if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
+#ifdef emacs
+ {
+ int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
+ UPDATE_SYNTAX_TABLE (pos1);
+ }
+#endif
+ {
+ int c, len;
+
+ if (multibyte)
+ c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
+ else
+ c = *d, len = 1;
+
+ if (SYNTAX (c) == (enum syntaxcode) mcnt)
goto fail;
+ d += len;
+ }
SET_REGS_MATCHED ();
break;
+ case categoryspec:
+ DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
+ mcnt = *p++;
+ PREFETCH ();
+ {
+ int c, len;
+
+ if (multibyte)
+ c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
+ else
+ c = *d, len = 1;
+
+ if (!CHAR_HAS_CATEGORY (c, mcnt))
+ goto fail;
+ d += len;
+ }
+ SET_REGS_MATCHED ();
+ break;
+
+ case notcategoryspec:
+ DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
+ mcnt = *p++;
+ PREFETCH ();
+ {
+ int c, len;
+
+ if (multibyte)
+ c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
+ else
+ c = *d, len = 1;
+
+ if (CHAR_HAS_CATEGORY (c, mcnt))
+ goto fail;
+ d += len;
+ }
+ SET_REGS_MATCHED ();
+ break;
+
#else /* not emacs */
case wordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
PREFETCH ();
- if (!WORDCHAR_P (d))
- goto fail;
+ if (!WORDCHAR_P (d))
+ goto fail;
SET_REGS_MATCHED ();
- d++;
+ d++;
break;
case notwordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
PREFETCH ();
if (WORDCHAR_P (d))
- goto fail;
- SET_REGS_MATCHED ();
- d++;
+ goto fail;
+ SET_REGS_MATCHED ();
+ d++;
break;
#endif /* not emacs */
- default:
- abort ();
+ default:
+ abort ();
}
- continue; /* Successfully executed one pattern command; keep going. */
+ continue; /* Successfully executed one pattern command; keep going. */
/* We goto here if a matching operation fails. */
fail:
if (!FAIL_STACK_EMPTY ())
- { /* A restart point is known. Restore to that state. */
- DEBUG_PRINT1 ("\nFAIL:\n");
- POP_FAILURE_POINT (d, p,
- lowest_active_reg, highest_active_reg,
- regstart, regend, reg_info);
-
- /* If this failure point is a dummy, try the next one. */
- if (!p)
+ { /* A restart point is known. Restore to that state. */
+ DEBUG_PRINT1 ("\nFAIL:\n");
+ POP_FAILURE_POINT (d, p,
+ lowest_active_reg, highest_active_reg,
+ regstart, regend, reg_info);
+
+ /* If this failure point is a dummy, try the next one. */
+ if (!p)
goto fail;
- /* If we failed to the end of the pattern, don't examine *p. */
+ /* If we failed to the end of the pattern, don't examine *p. */
assert (p <= pend);
- if (p < pend)
- {
- boolean is_a_jump_n = false;
-
- /* If failed to a backwards jump that's part of a repetition
- loop, need to pop this failure point and use the next one. */
- switch ((re_opcode_t) *p)
- {
- case jump_n:
- is_a_jump_n = true;
- case maybe_pop_jump:
- case pop_failure_jump:
- case jump:
- p1 = p + 1;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
-
- if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
- || (!is_a_jump_n
- && (re_opcode_t) *p1 == on_failure_jump))
- goto fail;
- break;
- default:
- /* do nothing */ ;
- }
- }
-
- if (d >= string1 && d <= end1)
+ if (p < pend)
+ {
+ boolean is_a_jump_n = false;
+
+ /* If failed to a backwards jump that's part of a repetition
+ loop, need to pop this failure point and use the next one. */
+ switch ((re_opcode_t) *p)
+ {
+ case jump_n:
+ is_a_jump_n = true;
+ case maybe_pop_jump:
+ case pop_failure_jump:
+ case jump:
+ p1 = p + 1;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+ p1 += mcnt;
+
+ if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
+ || (!is_a_jump_n
+ && (re_opcode_t) *p1 == on_failure_jump))
+ goto fail;
+ break;
+ default:
+ /* do nothing */ ;
+ }
+ }
+
+ if (d >= string1 && d <= end1)
dend = end_match_1;
- }
+ }
else
- break; /* Matching at this starting point really fails. */
+ break; /* Matching at this starting point really fails. */
} /* for (;;) */
if (best_regs_set)
FREE_VARIABLES ();
- return -1; /* Failure to match. */
+ return -1; /* Failure to match. */
} /* re_match_2 */
\f
/* Subroutine definitions for re_match_2. */
{
/* Skip over opcodes that can match nothing, and return true or
false, as appropriate, when we get to one that can't, or to the
- matching stop_memory. */
+ matching stop_memory. */
switch ((re_opcode_t) *p1)
- {
- /* Could be either a loop or a series of alternatives. */
- case on_failure_jump:
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+ {
+ /* Could be either a loop or a series of alternatives. */
+ case on_failure_jump:
+ p1++;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- /* If the next operation is not a jump backwards in the
+ /* If the next operation is not a jump backwards in the
pattern. */
if (mcnt >= 0)
{
- /* Go through the on_failure_jumps of the alternatives,
- seeing if any of the alternatives cannot match nothing.
- The last alternative starts with only a jump,
- whereas the rest start with on_failure_jump and end
- with a jump, e.g., here is the pattern for `a|b|c':
+ /* Go through the on_failure_jumps of the alternatives,
+ seeing if any of the alternatives cannot match nothing.
+ The last alternative starts with only a jump,
+ whereas the rest start with on_failure_jump and end
+ with a jump, e.g., here is the pattern for `a|b|c':
- /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
- /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
- /exactn/1/c
+ /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
+ /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
+ /exactn/1/c
- So, we have to first go through the first (n-1)
- alternatives and then deal with the last one separately. */
+ So, we have to first go through the first (n-1)
+ alternatives and then deal with the last one separately. */
- /* Deal with the first (n-1) alternatives, which start
- with an on_failure_jump (see above) that jumps to right
- past a jump_past_alt. */
+ /* Deal with the first (n-1) alternatives, which start
+ with an on_failure_jump (see above) that jumps to right
+ past a jump_past_alt. */
- while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
- {
- /* `mcnt' holds how many bytes long the alternative
- is, including the ending `jump_past_alt' and
- its number. */
+ while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
+ {
+ /* `mcnt' holds how many bytes long the alternative
+ is, including the ending `jump_past_alt' and
+ its number. */
- if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
- reg_info))
- return false;
+ if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
+ reg_info))
+ return false;
- /* Move to right after this alternative, including the
+ /* Move to right after this alternative, including the
jump_past_alt. */
- p1 += mcnt;
+ p1 += mcnt;
- /* Break if it's the beginning of an n-th alternative
- that doesn't begin with an on_failure_jump. */
- if ((re_opcode_t) *p1 != on_failure_jump)
- break;
+ /* Break if it's the beginning of an n-th alternative
+ that doesn't begin with an on_failure_jump. */
+ if ((re_opcode_t) *p1 != on_failure_jump)
+ break;
/* Still have to check that it's not an n-th
alternative that starts with an on_failure_jump. */
p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
- {
- /* Get to the beginning of the n-th alternative. */
- p1 -= 3;
- break;
- }
- }
+ EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+ if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
+ {
+ /* Get to the beginning of the n-th alternative. */
+ p1 -= 3;
+ break;
+ }
+ }
- /* Deal with the last alternative: go back and get number
- of the `jump_past_alt' just before it. `mcnt' contains
- the length of the alternative. */
- EXTRACT_NUMBER (mcnt, p1 - 2);
+ /* Deal with the last alternative: go back and get number
+ of the `jump_past_alt' just before it. `mcnt' contains
+ the length of the alternative. */
+ EXTRACT_NUMBER (mcnt, p1 - 2);
- if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
- return false;
+ if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
+ return false;
- p1 += mcnt; /* Get past the n-th alternative. */
- } /* if mcnt > 0 */
- break;
+ p1 += mcnt; /* Get past the n-th alternative. */
+ } /* if mcnt > 0 */
+ break;
- case stop_memory:
+ case stop_memory:
assert (p1[1] == **p);
- *p = p1 + 2;
- return true;
+ *p = p1 + 2;
+ return true;
- default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
- return false;
- }
+ default:
+ if (!common_op_match_null_string_p (&p1, end, reg_info))
+ return false;
+ }
} /* while p1 < end */
return false;
while (p1 < end)
{
/* Skip over opcodes that can match nothing, and break when we get
- to one that can't. */
+ to one that can't. */
switch ((re_opcode_t) *p1)
- {
- /* It's a loop. */
- case on_failure_jump:
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
- break;
+ {
+ /* It's a loop. */
+ case on_failure_jump:
+ p1++;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+ p1 += mcnt;
+ break;
default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
- return false;
- }
+ if (!common_op_match_null_string_p (&p1, end, reg_info))
+ return false;
+ }
} /* while p1 < end */
return true;
ret = group_match_null_string_p (&p1, end, reg_info);
/* Have to set this here in case we're checking a group which
- contains a group and a back reference to it. */
+ contains a group and a back reference to it. */
if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
- REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
+ REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
if (!ret)
- return false;
+ return false;
break;
- /* If this is an optimized succeed_n for zero times, make the jump. */
+ /* If this is an optimized succeed_n for zero times, make the jump. */
case jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (mcnt >= 0)
- p1 += mcnt;
+ p1 += mcnt;
else
- return false;
+ return false;
break;
case succeed_n:
- /* Get to the number of times to succeed. */
+ /* Get to the number of times to succeed. */
p1 += 2;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (mcnt == 0)
- {
- p1 -= 4;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
- }
+ {
+ p1 -= 4;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+ p1 += mcnt;
+ }
else
- return false;
+ return false;
break;
case duplicate:
if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
- return false;
+ return false;
break;
case set_number_at:
RE_TRANSLATE_TYPE translate;
{
register unsigned char *p1 = s1, *p2 = s2;
- while (len)
+ unsigned char *p1_end = s1 + len;
+ unsigned char *p2_end = s2 + len;
+
+ while (p1 != p1_end && p2 != p2_end)
{
- if (translate[*p1++] != translate[*p2++]) return 1;
- len--;
+ int p1_charlen, p2_charlen;
+ int p1_ch, p2_ch;
+
+ p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+ p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+
+ if (RE_TRANSLATE (translate, p1_ch)
+ != RE_TRANSLATE (translate, p2_ch))
+ return 1;
+
+ p1 += p1_charlen, p2 += p2_charlen;
}
+
+ if (p1 != p1_end || p2 != p2_end)
+ return 1;
+
return 0;
}
\f
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
are set in BUFP on entry.
- We call regex_compile to do the actual compilation. */
+ We call regex_compile to do the actual compilation. */
const char *
re_compile_pattern (pattern, length, bufp)
setting no_sub. */
bufp->no_sub = 0;
- /* Match anchors at newline. */
+ /* Match anchors at newline. */
bufp->newline_anchor = 1;
ret = regex_compile (pattern, length, re_syntax_options, bufp);
return gettext (re_error_msgid[(int) ret]);
}
\f
-/* Entry points compatible with 4.2 BSD regex library. We don't define
- them unless specifically requested. */
+/* Entry points compatible with 4.2 BSD regex library. We don't define
+ them unless specifically requested. */
#if defined (_REGEX_RE_COMP) || defined (_LIBC)
{
re_comp_buf.buffer = (unsigned char *) malloc (200);
if (re_comp_buf.buffer == NULL)
- return gettext (re_error_msgid[(int) REG_ESPACE]);
+ return gettext (re_error_msgid[(int) REG_ESPACE]);
re_comp_buf.allocated = 200;
re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
/* Since `re_exec' always passes NULL for the `regs' argument, we
don't need to initialize the pattern buffer fields which affect it. */
- /* Match anchors at newlines. */
+ /* Match anchors at newlines. */
re_comp_buf.newline_anchor = 1;
ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
/* regcomp takes a regular expression as a string and compiles it.
- PREG is a regex_t *. We do not expect any fields to be initialized,
+ PREG is a regex_t *. We do not expect any fields to be initialized,
since POSIX says we shouldn't. Thus, we set
`buffer' to the compiled pattern;
routine will report only success or failure, and nothing about the
registers.
- It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
+ It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
the return codes and their meanings.) */
int
= (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
* sizeof (*(RE_TRANSLATE_TYPE)0));
if (preg->translate == NULL)
- return (int) REG_ESPACE;
+ return (int) REG_ESPACE;
/* Map uppercase characters to corresponding lowercase ones. */
for (i = 0; i < CHAR_SET_SIZE; i++)
- preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
+ preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
}
else
preg->translate = NULL;
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
syntax &= ~RE_DOT_NEWLINE;
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
- /* It also changes the matching behavior. */
+ /* It also changes the matching behavior. */
preg->newline_anchor = 1;
}
else
string STRING.
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
- `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
+ `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
least NMATCH elements, and we set them to the offsets of the
corresponding matched substrings.
/* The user has told us exactly how many registers to return
information about, via `nmatch'. We have to pass that on to the
- matching routines. */
+ matching routines. */
private_preg.regs_allocated = REGS_FIXED;
if (want_reg_info)
regs.start = TALLOC (nmatch, regoff_t);
regs.end = TALLOC (nmatch, regoff_t);
if (regs.start == NULL || regs.end == NULL)
- return (int) REG_NOMATCH;
+ return (int) REG_NOMATCH;
}
/* Perform the searching operation. */
ret = re_search (&private_preg, string, len,
- /* start: */ 0, /* range: */ len,
- want_reg_info ? &regs : (struct re_registers *) 0);
+ /* start: */ 0, /* range: */ len,
+ want_reg_info ? &regs : (struct re_registers *) 0);
/* Copy the register information to the POSIX structure. */
if (want_reg_info)
{
if (ret >= 0)
- {
- unsigned r;
+ {
+ unsigned r;
- for (r = 0; r < nmatch; r++)
- {
- pmatch[r].rm_so = regs.start[r];
- pmatch[r].rm_eo = regs.end[r];
- }
- }
+ for (r = 0; r < nmatch; r++)
+ {
+ pmatch[r].rm_so = regs.start[r];
+ pmatch[r].rm_eo = regs.end[r];
+ }
+ }
- /* If we needed the temporary register info, free the space now. */
+ /* If we needed the temporary register info, free the space now. */
free (regs.start);
free (regs.end);
}
if (errcode < 0
|| errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
/* Only error codes returned by the rest of the code should be passed
- to this routine. If we are given anything else, or if other regex
+ to this routine. If we are given anything else, or if other regex
code generates an invalid error code, then the program has a bug.
Dump core so we can fix it. */
abort ();
if (errbuf_size != 0)
{
if (msg_size > errbuf_size)
- {
- strncpy (errbuf, msg, errbuf_size - 1);
- errbuf[errbuf_size - 1] = 0;
- }
+ {
+ strncpy (errbuf, msg, errbuf_size - 1);
+ errbuf[errbuf_size - 1] = 0;
+ }
else
- strcpy (errbuf, msg);
+ strcpy (errbuf, msg);
}
return msg_size;