/* TODO:
- structure the opcode space into opcode+flag.
- merge with glibc's regex.[ch].
- - replace succeed_n + jump_n with a combined operation so that the counter
- can simply be decremented when popping the failure_point without having
- to stack up failure_count entries.
- - get rid of `newline_anchor'.
- */
+ - replace (succeed_n + jump_n + set_number_at) with something that doesn't
+ need to modify the compiled regexp so that re_match can be reentrant.
+ - get rid of on_failure_jump_smart by doing the optimization in re_comp
+ rather than at run-time, so that re_match can be reentrant.
+*/
/* AIX requires this to be the first thing in the file. */
#if defined _AIX && !defined REGEX_MALLOC
# include <sys/types.h>
#endif
+/* Whether to use ISO C Amendment 1 wide char functions.
+ Those should not be used for Emacs since it uses its own. */
+#if defined _LIBC
+#define WIDE_CHAR_SUPPORT 1
+#else
+#define WIDE_CHAR_SUPPORT \
+ (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
+#endif
+
+/* For platforms which support the ISO C Amendment 1 functionality we
+ support user-defined character classes. */
+#if WIDE_CHAR_SUPPORT
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#ifdef _LIBC
+/* We have to keep the namespace clean. */
+# define regfree(preg) __regfree (preg)
+# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
+# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
+# define regerror(errcode, preg, errbuf, errbuf_size) \
+ __regerror(errcode, preg, errbuf, errbuf_size)
+# define re_set_registers(bu, re, nu, st, en) \
+ __re_set_registers (bu, re, nu, st, en)
+# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
+ __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
+# define re_match(bufp, string, size, pos, regs) \
+ __re_match (bufp, string, size, pos, regs)
+# define re_search(bufp, string, size, startpos, range, regs) \
+ __re_search (bufp, string, size, startpos, range, regs)
+# define re_compile_pattern(pattern, length, bufp) \
+ __re_compile_pattern (pattern, length, bufp)
+# define re_set_syntax(syntax) __re_set_syntax (syntax)
+# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
+ __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
+# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
+
+/* Make sure we call libc's function even if the user overrides them. */
+# define btowc __btowc
+# define iswctype __iswctype
+# define wctype __wctype
+
+# define WEAK_ALIAS(a,b) weak_alias (a, b)
+
+/* We are also using some library internals. */
+# include <locale/localeinfo.h>
+# include <locale/elem-hash.h>
+# include <langinfo.h>
+#else
+# define WEAK_ALIAS(a,b)
+#endif
+
/* This is for other GNU distributions with internationalized messages. */
#if HAVE_LIBINTL_H || defined _LIBC
# include <libintl.h>
# include "charset.h"
# include "category.h"
+# ifdef malloc
+# undef malloc
+# endif
# define malloc xmalloc
+# ifdef realloc
+# undef realloc
+# endif
# define realloc xrealloc
+# ifdef free
+# undef free
+# endif
# define free xfree
/* Converts the pointer to the char to BEG-based offset from the start. */
? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
: 1)
-# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
+# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
: 1)
is followed by a range table:
2 bytes of flags for character sets (low 8 bits, high 8 bits)
See RANGE_TABLE_WORK_BITS below.
- 2 bytes, the number of pairs that follow
+ 2 bytes, the number of pairs that follow (up to 32767)
pairs, each 2 multibyte characters,
each multibyte character represented as 3 bytes. */
charset,
static void
extract_number (dest, source)
int *dest;
- unsigned char *source;
+ re_char *source;
{
int temp = SIGN_EXTEND_CHAR (*(source + 1));
*dest = *source & 0377;
static void
extract_number_and_incr (destination, source)
int *destination;
- unsigned char **source;
+ re_char **source;
{
extract_number (destination, *source);
*source += 2;
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
do \
{ \
- int range_start, range_end; \
- unsigned char *p; \
- unsigned char *range_table_end \
+ re_wchar_t range_start, range_end; \
+ re_char *p; \
+ re_char *range_table_end \
= CHARSET_RANGE_TABLE_END ((range_table), (count)); \
\
for (p = (range_table); p < range_table_end; p += 2 * 3) \
{ \
/* Number of ranges in range table. */ \
int count; \
- unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
- \
+ re_char *range_table = CHARSET_RANGE_TABLE (charset); \
+ \
EXTRACT_NUMBER_AND_INCR (count, range_table); \
CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
} \
void
print_partial_compiled_pattern (start, end)
- unsigned char *start;
- unsigned char *end;
+ re_char *start;
+ re_char *end;
{
int mcnt, mcnt2;
- unsigned char *p = start;
- unsigned char *pend = end;
+ re_char *p = start;
+ re_char *pend = end;
if (start == NULL)
{
print_compiled_pattern (bufp)
struct re_pattern_buffer *bufp;
{
- unsigned char *buffer = bufp->buffer;
+ re_char *buffer = bufp->buffer;
print_partial_compiled_pattern (buffer, buffer + bufp->used);
printf ("%ld bytes used/%ld bytes allocated.\n",
printf ("re_nsub: %d\t", bufp->re_nsub);
printf ("regs_alloc: %d\t", bufp->regs_allocated);
printf ("can_be_null: %d\t", bufp->can_be_null);
- printf ("newline_anchor: %d\n", bufp->newline_anchor);
printf ("no_sub: %d\t", bufp->no_sub);
printf ("not_bol: %d\t", bufp->not_bol);
printf ("not_eol: %d\t", bufp->not_eol);
re_syntax_options = syntax;
return ret;
}
+WEAK_ALIAS (__re_set_syntax, re_set_syntax)
\f
/* This table gives an error message for each of the error codes listed
in regex.h. Obviously the order here has to be same as there.
/* Roughly the maximum number of failure points on the stack. Would be
exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
This is a variable only so users of regex can assign to it; we never
- change it ourselves. */
-#if defined MATCH_MAY_ALLOCATE
-/* Note that 4400 is enough to cause a crash on Alpha OSF/1,
+ change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
+ before using it, so it should probably be a byte-count instead. */
+# if defined MATCH_MAY_ALLOCATE
+/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. In order for a larger
value to work reliably, you have to try to make it accord
with the process stack limit. */
-int re_max_failures = 40000;
-#else
-int re_max_failures = 4000;
-#endif
+size_t re_max_failures = 40000;
+# else
+size_t re_max_failures = 4000;
+# endif
union fail_stack_elt
{
- const unsigned char *pointer;
- unsigned int integer;
+ re_char *pointer;
+ /* This should be the biggest `int' that's no bigger than a pointer. */
+ long integer;
};
typedef union fail_stack_elt fail_stack_elt_t;
typedef struct
{
fail_stack_elt_t *stack;
- unsigned size;
- unsigned avail; /* Offset of next open position. */
- unsigned frame; /* Offset of the cur constructed frame. */
+ size_t size;
+ size_t avail; /* Offset of next open position. */
+ size_t frame; /* Offset of the cur constructed frame. */
} fail_stack_type;
-#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1)))
-/* Push pointer POINTER on FAIL_STACK.
- Return 1 if was able to do so and 0 if ran out of memory allocating
- space to do so. */
-#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
- ((FAIL_STACK_FULL () \
- && !GROW_FAIL_STACK (FAIL_STACK)) \
- ? 0 \
- : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
- 1))
-#define POP_PATTERN_OP() POP_FAILURE_POINTER ()
-
/* Push a pointer value onto the failure stack.
Assumes the variable `fail_stack'. Probably should only
be called from within `PUSH_FAILURE_POINT'. */
#define PUSH_FAILURE_POINTER(item) \
- fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
+ fail_stack.stack[fail_stack.avail++].pointer = (item)
/* This pushes an integer-valued item onto the failure stack.
Assumes the variable `fail_stack'. Probably should only
PUSH_FAILURE_INT (num); \
} while (0)
-#define PUSH_FAILURE_COUNT(ptr) \
+/* Change the counter's value to VAL, but make sure that it will
+ be reset when backtracking. */
+#define PUSH_NUMBER(ptr,val) \
do { \
char *destination; \
int c; \
ENSURE_FAIL_STACK(3); \
EXTRACT_NUMBER (c, ptr); \
- DEBUG_PRINT3 (" Push counter %p = %d\n", ptr, c); \
+ DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
PUSH_FAILURE_INT (c); \
PUSH_FAILURE_POINTER (ptr); \
PUSH_FAILURE_INT (-1); \
+ STORE_NUMBER (ptr, val); \
} while (0)
/* Pop a saved register off the stack. */
if (reg == -1) \
{ \
/* It's a counter. */ \
+ /* Here, we discard `const', making re_match non-reentrant. */ \
unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
reg = POP_FAILURE_INT (); \
STORE_NUMBER (ptr, reg); \
/* Estimate the size of data pushed by a typical failure stack entry.
An estimate is all we need, because all we use this for
is to choose a limit for how big to make the failure stack. */
-
+/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
#define TYPICAL_FAILURE_SIZE 20
/* How many items can still be added to the stack without overflowing it. */
while (fail_stack.frame < fail_stack.avail) \
POP_FAILURE_REG_OR_COUNT (); \
\
- pat = (unsigned char *) POP_FAILURE_POINTER (); \
+ pat = POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
\
/* If the saved string location is NULL, it came from an \
on_failure_keep_string_jump opcode, and we want to throw away the \
saved NULL, thus retaining our current position in the string. */ \
- str = (re_char *) POP_FAILURE_POINTER (); \
+ str = POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping string %p: `", str); \
DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
DEBUG_PRINT1 ("'\n"); \
int arg, unsigned char *end));
static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
int arg1, int arg2, unsigned char *end));
-static boolean at_begline_loc_p _RE_ARGS ((const unsigned char *pattern,
- const unsigned char *p,
+static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
+ re_char *p,
reg_syntax_t syntax));
-static boolean at_endline_loc_p _RE_ARGS ((const unsigned char *p,
- const unsigned char *pend,
+static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
+ re_char *pend,
reg_syntax_t syntax));
-static unsigned char *skip_one_char _RE_ARGS ((unsigned char *p));
-static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend,
+static re_char *skip_one_char _RE_ARGS ((re_char *p));
+static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
char *fastmap, const int multibyte));
/* Fetch the next character in the uncompiled pattern---translating it
- if necessary. Also cast from a signed character in the constant
- string passed to us by the user to an unsigned char that we can use
- as an array index (in, e.g., `translate'). */
+ if necessary. */
#define PATFETCH(c) \
do { \
PATFETCH_RAW (c); \
/* Make sure we have at least N more bytes of space in buffer. */
#define GET_BUFFER_SPACE(n) \
- while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
+ while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
EXTEND_BUFFER ()
/* Make sure we have one more byte of buffer space and then add C to it. */
#endif
#define EXTEND_BUFFER() \
do { \
- unsigned char *old_buffer = bufp->buffer; \
+ re_char *old_buffer = bufp->buffer; \
if (bufp->allocated == MAX_BUF_SIZE) \
return REG_ESIZE; \
bufp->allocated <<= 1; \
if (bufp->allocated > MAX_BUF_SIZE) \
bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
+ RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
if (bufp->buffer == NULL) \
return REG_ESPACE; \
/* If the buffer moved, move all the pointers into it. */ \
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
(work_area).bits |= (bit)
-/* These bits represent the various character classes such as [:alnum:]
- in a charset's range table. */
-#define BIT_ALNUM 0x1
-#define BIT_ALPHA 0x2
-#define BIT_WORD 0x4
-#define BIT_ASCII 0x8
-#define BIT_NONASCII 0x10
-#define BIT_GRAPH 0x20
-#define BIT_LOWER 0x40
-#define BIT_PRINT 0x80
-#define BIT_PUNCT 0x100
-#define BIT_SPACE 0x200
-#define BIT_UPPER 0x400
-#define BIT_UNIBYTE 0x800
-#define BIT_MULTIBYTE 0x1000
+/* Bits used to implement the multibyte-part of the various character classes
+ such as [:alnum:] in a charset's range table. */
+#define BIT_WORD 0x1
+#define BIT_LOWER 0x2
+#define BIT_PUNCT 0x4
+#define BIT_SPACE 0x8
+#define BIT_UPPER 0x10
+#define BIT_MULTIBYTE 0x20
/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
/* Set the bit for character C in a list. */
-#define SET_LIST_BIT(c) \
- (b[((unsigned char) (c)) / BYTEWIDTH] \
- |= 1 << (((unsigned char) c) % BYTEWIDTH))
+#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
/* Get the next unsigned number in the uncompiled pattern. */
} \
} while (0)
-#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
-
-#define IS_CHAR_CLASS(string) \
- (STREQ (string, "alpha") || STREQ (string, "upper") \
- || STREQ (string, "lower") || STREQ (string, "digit") \
- || STREQ (string, "alnum") || STREQ (string, "xdigit") \
- || STREQ (string, "space") || STREQ (string, "print") \
- || STREQ (string, "punct") || STREQ (string, "graph") \
- || STREQ (string, "cntrl") || STREQ (string, "blank") \
- || STREQ (string, "word") \
- || STREQ (string, "ascii") || STREQ (string, "nonascii") \
- || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
-
-/* QUIT is only used on NTemacs. */
-#if !defined WINDOWSNT || !defined emacs || !defined QUIT
-# undef QUIT
-# define QUIT
+#if WIDE_CHAR_SUPPORT
+/* The GNU C library provides support for user-defined character classes
+ and the functions from ISO C Amendment 1. */
+# ifdef CHARCLASS_NAME_MAX
+# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
+# else
+/* This shouldn't happen but some implementations might still have this
+ problem. Use a reasonable default value. */
+# define CHAR_CLASS_MAX_LENGTH 256
+# endif
+typedef wctype_t re_wctype_t;
+typedef wchar_t re_wchar_t;
+# define re_wctype wctype
+# define re_iswctype iswctype
+# define re_wctype_to_bit(cc) 0
+#else
+# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
+# define btowc(c) c
+
+/* Character classes, used when WIDE_CHAR_SUPPORT is off: one enumerator
+   per class name recognized by re_wctype.  RECC_ERROR is 0 and means
+   "no such class"; re_wctype returns 0 for a name it does not know, and
+   the pattern compiler rejects the class when it sees that value. */
+typedef enum { RECC_ERROR = 0,
+ RECC_ALNUM, RECC_ALPHA, RECC_WORD,
+ RECC_GRAPH, RECC_PRINT,
+ RECC_LOWER, RECC_UPPER,
+ RECC_PUNCT, RECC_CNTRL,
+ RECC_DIGIT, RECC_XDIGIT,
+ RECC_BLANK, RECC_SPACE,
+ RECC_MULTIBYTE, RECC_NONASCII,
+ RECC_ASCII, RECC_UNIBYTE
+} re_wctype_t;
+
+typedef int re_wchar_t;
+
+/* Map the NUL-terminated string STR to the character class it names.
+   Return RECC_ERROR (which is 0) when STR is none of the class names
+   tested below, so callers can keep comparing the result against 0.  */
+static re_wctype_t
+re_wctype (str)
+     re_char *str;
+{
+  /* STR arrives as a pointer to pattern bytes, while the comparisons
+     below work on a plain `const char *'.  Convert explicitly rather
+     than relying on an implicit conversion between incompatible
+     character pointer types.  */
+  const char *string = (const char *) str;
+
+  if (STREQ (string, "alnum"))
+    return RECC_ALNUM;
+  else if (STREQ (string, "alpha"))
+    return RECC_ALPHA;
+  else if (STREQ (string, "word"))
+    return RECC_WORD;
+  else if (STREQ (string, "ascii"))
+    return RECC_ASCII;
+  else if (STREQ (string, "nonascii"))
+    return RECC_NONASCII;
+  else if (STREQ (string, "graph"))
+    return RECC_GRAPH;
+  else if (STREQ (string, "lower"))
+    return RECC_LOWER;
+  else if (STREQ (string, "print"))
+    return RECC_PRINT;
+  else if (STREQ (string, "punct"))
+    return RECC_PUNCT;
+  else if (STREQ (string, "space"))
+    return RECC_SPACE;
+  else if (STREQ (string, "upper"))
+    return RECC_UPPER;
+  else if (STREQ (string, "unibyte"))
+    return RECC_UNIBYTE;
+  else if (STREQ (string, "multibyte"))
+    return RECC_MULTIBYTE;
+  else if (STREQ (string, "digit"))
+    return RECC_DIGIT;
+  else if (STREQ (string, "xdigit"))
+    return RECC_XDIGIT;
+  else if (STREQ (string, "cntrl"))
+    return RECC_CNTRL;
+  else if (STREQ (string, "blank"))
+    return RECC_BLANK;
+  else
+    /* Unknown class name; use the named enumerator rather than a
+       bare 0.  */
+    return RECC_ERROR;
+}
+
+/* Report whether character CH belongs to character class CC.
+   RECC_ERROR contains no character at all; a CC that is not one of
+   the enumerators handled below aborts.  */
+static boolean
+re_iswctype (ch, cc)
+     int ch;
+     re_wctype_t cc;
+{
+  boolean in_class = false;
+
+  switch (cc)
+    {
+    case RECC_ERROR:
+      /* No character is a member of the error class.  */
+      break;
+    case RECC_ALNUM:
+      in_class = ISALNUM (ch);
+      break;
+    case RECC_ALPHA:
+      in_class = ISALPHA (ch);
+      break;
+    case RECC_WORD:
+      in_class = ISWORD (ch);
+      break;
+    case RECC_GRAPH:
+      in_class = ISGRAPH (ch);
+      break;
+    case RECC_PRINT:
+      in_class = ISPRINT (ch);
+      break;
+    case RECC_LOWER:
+      in_class = ISLOWER (ch);
+      break;
+    case RECC_UPPER:
+      in_class = ISUPPER (ch);
+      break;
+    case RECC_PUNCT:
+      in_class = ISPUNCT (ch);
+      break;
+    case RECC_CNTRL:
+      in_class = ISCNTRL (ch);
+      break;
+    case RECC_DIGIT:
+      in_class = ISDIGIT (ch);
+      break;
+    case RECC_XDIGIT:
+      in_class = ISXDIGIT (ch);
+      break;
+    case RECC_BLANK:
+      in_class = ISBLANK (ch);
+      break;
+    case RECC_SPACE:
+      in_class = ISSPACE (ch);
+      break;
+    case RECC_MULTIBYTE:
+      in_class = !ISUNIBYTE (ch);
+      break;
+    case RECC_NONASCII:
+      in_class = !IS_REAL_ASCII (ch);
+      break;
+    case RECC_ASCII:
+      in_class = IS_REAL_ASCII (ch);
+      break;
+    case RECC_UNIBYTE:
+      in_class = ISUNIBYTE (ch);
+      break;
+    default:
+      abort();
+    }
+
+  return in_class;
+}
+
+/* Give the range-table flag bits that make a charset match the
+   multibyte characters of class CC.  The result is 0 for the classes
+   that set no flag; a CC outside the enumeration aborts.  */
+static int
+re_wctype_to_bit (cc)
+     re_wctype_t cc;
+{
+  int bits = 0;
+
+  switch (cc)
+    {
+    case RECC_WORD:
+    case RECC_ALNUM:
+    case RECC_ALPHA:
+      bits = BIT_WORD;
+      break;
+    case RECC_MULTIBYTE:
+    case RECC_NONASCII:
+    case RECC_GRAPH:
+    case RECC_PRINT:
+      bits = BIT_MULTIBYTE;
+      break;
+    case RECC_LOWER:
+      bits = BIT_LOWER;
+      break;
+    case RECC_UPPER:
+      bits = BIT_UPPER;
+      break;
+    case RECC_PUNCT:
+      bits = BIT_PUNCT;
+      break;
+    case RECC_SPACE:
+      bits = BIT_SPACE;
+      break;
+    case RECC_ERROR:
+    case RECC_ASCII:
+    case RECC_UNIBYTE:
+    case RECC_BLANK:
+    case RECC_CNTRL:
+    case RECC_DIGIT:
+    case RECC_XDIGIT:
+      /* These classes set no range-table flag.  */
+      break;
+    default:
+      abort();
+    }
+
+  return bits;
+}
+#endif
+
+/* Explicit quit checking is only used on NTemacs. */
+#if defined WINDOWSNT && defined emacs && defined QUIT
+extern int immediate_quit;
+# define IMMEDIATE_QUIT_CHECK \
+ do { \
+ if (immediate_quit) QUIT; \
+ } while (0)
+#else
+# define IMMEDIATE_QUIT_CHECK ((void)0)
#endif
\f
#ifndef MATCH_MAY_ALLOCATE
`re_nsub' is the number of subexpressions in PATTERN;
`not_bol' and `not_eol' are zero;
- The `fastmap' and `newline_anchor' fields are neither
- examined nor set. */
+ The `fastmap' field is neither examined nor set. */
/* Insert the `jump' from the end of last alternative to "here".
The space for the jump has already been allocated. */
reg_syntax_t syntax;
struct re_pattern_buffer *bufp;
{
- /* We fetch characters from PATTERN here. Even though PATTERN is
- `char *' (i.e., signed), we declare these variables as unsigned, so
- they can be reliably used as array indices. */
- register unsigned int c, c1;
+ /* We fetch characters from PATTERN here. */
+ register re_wchar_t c, c1;
/* A random temporary spot in PATTERN. */
re_char *p1;
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's come before. */
|| at_begline_loc_p (pattern, p, syntax))
- BUF_PUSH (begline);
+ BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
else
goto normal_char;
}
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's next. */
|| at_endline_loc_p (p, pend, syntax))
- BUF_PUSH (endline);
+ BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
else
goto normal_char;
}
boolean simple = skip_one_char (laststart) == b;
unsigned int startoffset = 0;
re_opcode_t ofj =
+ /* Check if the loop can match the empty string. */
(simple || !analyse_first (laststart, b, NULL, 0)) ?
on_failure_jump : on_failure_jump_loop;
assert (skip_one_char (laststart) <= b);
syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
{
/* Leave room for the null. */
- char str[CHAR_CLASS_MAX_LENGTH + 1];
+ unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
const unsigned char *class_beg;
PATFETCH (c);
for (;;)
{
- PATFETCH (c);
- if (c == ':' || c == ']' || p == pend
- || c1 == CHAR_CLASS_MAX_LENGTH)
- break;
- str[c1++] = c;
+ PATFETCH (c);
+ if ((c == ':' && *p == ']') || p == pend)
+ break;
+ if (c1 < CHAR_CLASS_MAX_LENGTH)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
}
str[c1] = '\0';
if (c == ':' && *p == ']')
{
int ch;
- boolean is_alnum = STREQ (str, "alnum");
- boolean is_alpha = STREQ (str, "alpha");
- boolean is_ascii = STREQ (str, "ascii");
- boolean is_blank = STREQ (str, "blank");
- boolean is_cntrl = STREQ (str, "cntrl");
- boolean is_digit = STREQ (str, "digit");
- boolean is_graph = STREQ (str, "graph");
- boolean is_lower = STREQ (str, "lower");
- boolean is_multibyte = STREQ (str, "multibyte");
- boolean is_nonascii = STREQ (str, "nonascii");
- boolean is_print = STREQ (str, "print");
- boolean is_punct = STREQ (str, "punct");
- boolean is_space = STREQ (str, "space");
- boolean is_unibyte = STREQ (str, "unibyte");
- boolean is_upper = STREQ (str, "upper");
- boolean is_word = STREQ (str, "word");
- boolean is_xdigit = STREQ (str, "xdigit");
-
- if (!IS_CHAR_CLASS (str))
+ re_wctype_t cc;
+
+ cc = re_wctype (str);
+
+ if (cc == 0)
FREE_STACK_RETURN (REG_ECTYPE);
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH (c);
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
/* Most character classes in a multibyte match
just set a flag. Exceptions are is_blank,
is_digit, is_cntrl, and is_xdigit, since
they can only match ASCII characters. We
- don't need to handle them for multibyte. */
+ don't need to handle them for multibyte.
+ For those classes re_wctype_to_bit returns 0, so no bit is set. */
if (multibyte)
- {
- int bit = 0;
-
- if (is_alnum) bit = BIT_ALNUM;
- if (is_alpha) bit = BIT_ALPHA;
- if (is_ascii) bit = BIT_ASCII;
- if (is_graph) bit = BIT_GRAPH;
- if (is_lower) bit = BIT_LOWER;
- if (is_multibyte) bit = BIT_MULTIBYTE;
- if (is_nonascii) bit = BIT_NONASCII;
- if (is_print) bit = BIT_PRINT;
- if (is_punct) bit = BIT_PUNCT;
- if (is_space) bit = BIT_SPACE;
- if (is_unibyte) bit = BIT_UNIBYTE;
- if (is_upper) bit = BIT_UPPER;
- if (is_word) bit = BIT_WORD;
- if (bit)
- SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
- bit);
- }
+ SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
+ re_wctype_to_bit (cc));
- /* Handle character classes for ASCII characters. */
- for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
+ for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
{
int translated = TRANSLATE (ch);
- /* This was split into 3 if's to
- avoid an arbitrary limit in some compiler. */
- if ( (is_alnum && ISALNUM (ch))
- || (is_alpha && ISALPHA (ch))
- || (is_blank && ISBLANK (ch))
- || (is_cntrl && ISCNTRL (ch)))
- SET_LIST_BIT (translated);
- if ( (is_digit && ISDIGIT (ch))
- || (is_graph && ISGRAPH (ch))
- || (is_lower && ISLOWER (ch))
- || (is_print && ISPRINT (ch)))
- SET_LIST_BIT (translated);
- if ( (is_punct && ISPUNCT (ch))
- || (is_space && ISSPACE (ch))
- || (is_upper && ISUPPER (ch))
- || (is_xdigit && ISXDIGIT (ch)))
- SET_LIST_BIT (translated);
- if ( (is_ascii && IS_REAL_ASCII (ch))
- || (is_nonascii && !IS_REAL_ASCII (ch))
- || (is_unibyte && ISUNIBYTE (ch))
- || (is_multibyte && !ISUNIBYTE (ch)))
- SET_LIST_BIT (translated);
-
- if ( (is_word && ISWORD (ch)))
+ if (re_iswctype (btowc (ch), cc))
SET_LIST_BIT (translated);
}
{
if (! SINGLE_BYTE_CHAR_P (c1))
{
- /* Handle a range such as \177-\377 in
- multibyte mode. Split that into two
- ranges, the low one ending at 0237, and
- the high one starting at the smallest
- character in the charset of C1 and
- ending at C1. */
+ /* Handle a range starting with a
+ character of less than 256, and ending
+ with a character of not less than 256.
+ Split that into two ranges, the low one
+ ending at 0377, and the high one
+ starting at the smallest character in
+ the charset of C1 and ending at C1. */
int charset = CHAR_CHARSET (c1);
int c2 = MAKE_CHAR (charset, 0, 0);
SET_RANGE_TABLE_WORK_AREA (range_table_work,
c2, c1);
- c1 = 0237;
+ c1 = 0377;
}
}
else if (!SAME_CHARSET_P (c, c1))
if (SINGLE_BYTE_CHAR_P (c))
/* ... into bitmap. */
{
- unsigned this_char;
+ re_wchar_t this_char;
int range_start = c, range_end = c1;
/* If the start is after the end, the range is empty. */
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
- if (syntax & RE_NO_BK_REFS)
- goto normal_char;
+ {
+ regnum_t reg;
- c1 = c - '0';
+ if (syntax & RE_NO_BK_REFS)
+ goto normal_backslash;
- if (c1 > regnum)
- FREE_STACK_RETURN (REG_ESUBREG);
+ reg = c - '0';
- /* Can't back reference to a subexpression if inside of it. */
- if (group_in_compile_stack (compile_stack, (regnum_t) c1))
- goto normal_char;
+ /* Can't back reference to a subexpression before its end. */
+ if (reg > regnum || group_in_compile_stack (compile_stack, reg))
+ FREE_STACK_RETURN (REG_ESUBREG);
- laststart = b;
- BUF_PUSH_2 (duplicate, c1);
+ laststart = b;
+ BUF_PUSH_2 (duplicate, reg);
+ }
break;
static boolean
at_begline_loc_p (pattern, p, syntax)
- const unsigned char *pattern, *p;
+ re_char *pattern, *p;
reg_syntax_t syntax;
{
- const unsigned char *prev = p - 2;
+ re_char *prev = p - 2;
boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
return
static boolean
at_endline_loc_p (p, pend, syntax)
- const unsigned char *p, *pend;
+ re_char *p, *pend;
reg_syntax_t syntax;
{
- const unsigned char *next = p;
+ re_char *next = p;
boolean next_backslash = *next == '\\';
- const unsigned char *next_next = p + 1 < pend ? p + 1 : 0;
+ re_char *next_next = p + 1 < pend ? p + 1 : 0;
return
/* Before a subexpression? */
Return 1 if p..pend might match the empty string.
Return 0 if p..pend matches at least one char.
- Return -1 if p..pend matches at least one char, but fastmap was not
- updated accurately.
- Return -2 if an error occurred. */
+ Return -1 if fastmap was not updated accurately. */
static int
analyse_first (p, pend, fastmap, multibyte)
- unsigned char *p, *pend;
+ re_char *p, *pend;
char *fastmap;
const int multibyte;
{
int j, k;
boolean not;
-#ifdef MATCH_MAY_ALLOCATE
- fail_stack_type fail_stack;
-#endif
-#ifndef REGEX_MALLOC
- char *destination;
-#endif
-
-#if defined REL_ALLOC && defined REGEX_MALLOC
- /* This holds the pointer to the failure stack, when
- it is allocated relocatably. */
- fail_stack_elt_t *failure_stack_ptr;
-#endif
-
- /* Assume that each path through the pattern can be null until
- proven otherwise. We set this false at the bottom of switch
- statement, to which we get only if a particular path doesn't
- match the empty string. */
- boolean path_can_be_null = true;
/* If all elements for base leading-codes in fastmap is set, this
flag is set true. */
assert (p);
- INIT_FAIL_STACK ();
-
/* The loop below works as follows:
- It has a working-list kept in the PATTERN_STACK and which basically
starts by only containing a pointer to the first operation.
so that `p' is monotonically increasing. More to the point, we
never set `p' (or push) anything `<= p1'. */
- /* If can_be_null is set, then the fastmap will not be used anyway. */
- while (1)
+ while (p < pend)
{
/* `p1' is used as a marker of how far back a `on_failure_jump'
can go without being ignored. It is normally equal to `p'
3..9: <body>
10: on_failure_jump 3
as used for the *? operator. */
- unsigned char *p1 = p;
-
- if (p >= pend)
- {
- if (path_can_be_null)
- return (RESET_FAIL_STACK (), 1);
-
- /* We have reached the (effective) end of pattern. */
- if (PATTERN_STACK_EMPTY ())
- return (RESET_FAIL_STACK (), 0);
-
- p = (unsigned char*) POP_PATTERN_OP ();
- path_can_be_null = true;
- continue;
- }
-
- /* We should never be about to go beyond the end of the pattern. */
- assert (p < pend);
+ re_char *p1 = p;
switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
{
case succeed:
- p = pend;
+ return 1;
continue;
case duplicate:
/* We could put all the chars except for \n (and maybe \0)
but we don't bother since it is generally not worth it. */
if (!fastmap) break;
- return (RESET_FAIL_STACK (), -1);
+ return -1;
case charset_not:
#else /* emacs */
/* This match depends on text properties. These end with
aborting optimizations. */
- return (RESET_FAIL_STACK (), -1);
+ return -1;
case categoryspec:
case notcategoryspec:
EXTRACT_NUMBER_AND_INCR (j, p);
if (p + j <= p1)
; /* Backward jump to be ignored. */
- else if (!PUSH_PATTERN_OP (p + j, fail_stack))
- return (RESET_FAIL_STACK (), -2);
+ else
+ { /* We have to look down both arms.
+ We first go down the "straight" path so as to minimize
+ stack usage when going through alternatives. */
+ int r = analyse_first (p, pend, fastmap, multibyte);
+ if (r) return r;
+ p += j;
+ }
continue;
/* Getting here means we have found the possible starting
characters for one path of the pattern -- and that the empty
- string does not match. We need not follow this path further.
- Instead, look at the next alternative (remembered on the
- stack), or quit if no more. The test at the top of the loop
- does these things. */
- path_can_be_null = false;
- p = pend;
+ string does not match. We need not follow this path further. */
+ return 0;
} /* while p */
- return (RESET_FAIL_STACK (), 0);
+ /* We reached the end without matching anything. */
+ return 1;
+
} /* analyse_first */
\f
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
fastmap, RE_MULTIBYTE_P (bufp));
- if (analysis < -1)
- return analysis;
bufp->can_be_null = (analysis != 0);
return 0;
} /* re_compile_fastmap */
regs->start = regs->end = (regoff_t *) 0;
}
}
+WEAK_ALIAS (__re_set_registers, re_set_registers)
\f
/* Searching routines. */
return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
regs, size);
}
+WEAK_ALIAS (__re_search, re_search)
/* End address of virtual concatenation of string. */
#define STOP_ADDR_VSTRING(P) \
register RE_TRANSLATE_TYPE translate = bufp->translate;
int total_size = size1 + size2;
int endpos = startpos + range;
- int anchored_start = 0;
+ boolean anchored_start;
/* Nonzero if we have to concern multibyte character. */
const boolean multibyte = RE_MULTIBYTE_P (bufp);
/* Update the fastmap now if not correct already. */
if (fastmap && !bufp->fastmap_accurate)
- if (re_compile_fastmap (bufp) == -2)
- return -2;
+ re_compile_fastmap (bufp);
/* See whether the pattern is anchored. */
- if (bufp->buffer[0] == begline)
- anchored_start = 1;
+ anchored_start = (bufp->buffer[0] == begline);
#ifdef emacs
gl_state.object = re_match_object;
because that case doesn't repeat. */
if (anchored_start && startpos > 0)
{
- if (! (bufp->newline_anchor
- && ((startpos <= size1 ? string1[startpos - 1]
- : string2[startpos - size1 - 1])
- == '\n')))
+ if (! ((startpos <= size1 ? string1[startpos - 1]
+ : string2[startpos - size1 - 1])
+ == '\n'))
goto advance;
}
if (fastmap && startpos < total_size && !bufp->can_be_null)
{
register re_char *d;
- register unsigned int buf_ch;
+ register re_wchar_t buf_ch;
d = POS_ADDR_VSTRING (startpos);
}
return -1;
} /* re_search_2 */
+WEAK_ALIAS (__re_search_2, re_search_2)
\f
/* Declarations and macros for re_match_2. */
/* If the operation is a match against one or more chars,
return a pointer to the next operation, else return NULL. */
-static unsigned char *
+static re_char *
skip_one_char (p)
- unsigned char *p;
+ re_char *p;
{
switch (SWITCH_ENUM_CAST (*p++))
{
break;
case endline:
- if (!bufp->newline_anchor)
- break;
- /* Fallthrough */
case exactn:
{
- register unsigned int c
+ register re_wchar_t c
= (re_opcode_t) *p2 == endline ? '\n'
- : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
+ : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
if ((re_opcode_t) *p1 == exactn)
{
break;
case charset:
- case charset_not:
{
if ((re_opcode_t) *p1 == exactn)
/* Reuse the code above. */
return mutually_exclusive_p (bufp, p2, p1);
-
/* It is hard to list up all the character in charset
P2 if it includes multibyte character. Give up in
such case. */
P2 is ASCII, it is enough to test only bitmap
table of P1. */
- if (*p1 == *p2)
+ if ((re_opcode_t) *p1 == charset)
{
int idx;
/* We win if the charset inside the loop
return 1;
}
}
- else if ((re_opcode_t) *p1 == charset
- || (re_opcode_t) *p1 == charset_not)
+ else if ((re_opcode_t) *p1 == charset_not)
{
int idx;
/* We win if the charset_not inside the loop lists
}
}
}
+ break;
+ case charset_not:
+ switch (SWITCH_ENUM_CAST (*p1))
+ {
+ case exactn:
+ case charset:
+ /* Reuse the code above. */
+ return mutually_exclusive_p (bufp, p2, p1);
+ case charset_not:
+ /* When we have two charset_not, it's very unlikely that
+ they don't overlap. The union of the two sets of excluded
+ chars should cover all possible chars, which, as a matter of
+ fact, is virtually impossible in multibyte buffers. */
+ ;
+ }
+ break;
+
case wordend:
case notsyntaxspec:
return ((re_opcode_t) *p1 == syntaxspec
# endif
return result;
}
+WEAK_ALIAS (__re_match, re_match)
#endif /* not emacs */
#ifdef emacs
#endif
return result;
}
+WEAK_ALIAS (__re_match_2, re_match_2)
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
{
/* General temporaries. */
int mcnt;
+ size_t reg;
boolean not;
- unsigned char *p1;
/* Just past the end of the corresponding string. */
re_char *end1, *end2;
re_char *dfail;
/* Where we are in the pattern, and the end of the pattern. */
- unsigned char *p = bufp->buffer;
- register unsigned char *pend = p + bufp->used;
+ re_char *p = bufp->buffer;
+ re_char *pend = p + bufp->used;
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
/* Initialize subexpression text positions to -1 to mark ones that no
start_memory/stop_memory has been seen for. Also initialize the
register information struct. */
- for (mcnt = 1; mcnt < num_regs; mcnt++)
- regstart[mcnt] = regend[mcnt] = NULL;
+ for (reg = 1; reg < num_regs; reg++)
+ regstart[reg] = regend[reg] = NULL;
/* We move `string1' into `string2' if the latter's empty -- but not if
`string1' is null. */
DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
- for (mcnt = 1; mcnt < num_regs; mcnt++)
+ for (reg = 1; reg < num_regs; reg++)
{
- best_regstart[mcnt] = regstart[mcnt];
- best_regend[mcnt] = regend[mcnt];
+ best_regstart[reg] = regstart[reg];
+ best_regend[reg] = regend[reg];
}
}
goto fail;
dend = ((d >= string1 && d <= end1)
? end_match_1 : end_match_2);
- for (mcnt = 1; mcnt < num_regs; mcnt++)
+ for (reg = 1; reg < num_regs; reg++)
{
- regstart[mcnt] = best_regstart[mcnt];
- regend[mcnt] = best_regend[mcnt];
+ regstart[reg] = best_regstart[reg];
+ regend[reg] = best_regend[reg];
}
}
} /* d != end_match_2 */
/* Go through the first `min (num_regs, regs->num_regs)'
registers, since that is all we initialized. */
- for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
+ for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
{
- if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
- regs->start[mcnt] = regs->end[mcnt] = -1;
+ if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
+ regs->start[reg] = regs->end[reg] = -1;
else
{
- regs->start[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
- regs->end[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
+ regs->start[reg]
+ = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
+ regs->end[reg]
+ = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
}
}
we (re)allocated the registers, this is the case,
because we always allocate enough to have at least one
-1 at the end. */
- for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
- regs->start[mcnt] = regs->end[mcnt] = -1;
+ for (reg = num_regs; reg < regs->num_regs; reg++)
+ regs->start[reg] = regs->end[reg] = -1;
} /* regs && !bufp->no_sub */
DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
case anychar:
{
int buf_charlen;
- unsigned int buf_ch;
+ re_wchar_t buf_ch;
DEBUG_PRINT1 ("EXECUTING anychar.\n");
/* Start of actual range_table, or end of bitmap if there is no
range table. */
- unsigned char *range_table;
+ re_char *range_table;
/* Nonzero if there is a range table. */
int range_table_exists;
{
int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
- if ( (class_bits & BIT_ALNUM && ISALNUM (c))
- | (class_bits & BIT_ALPHA && ISALPHA (c))
- | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
- | (class_bits & BIT_GRAPH && ISGRAPH (c))
- | (class_bits & BIT_LOWER && ISLOWER (c))
- | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
- | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
- | (class_bits & BIT_PRINT && ISPRINT (c))
+ if ( (class_bits & BIT_LOWER && ISLOWER (c))
+ | (class_bits & BIT_MULTIBYTE)
| (class_bits & BIT_PUNCT && ISPUNCT (c))
| (class_bits & BIT_SPACE && ISSPACE (c))
- | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
| (class_bits & BIT_UPPER && ISUPPER (c))
| (class_bits & BIT_WORD && ISWORD (c)))
not = !not;
/* begline matches the empty string at the beginning of the string
- (unless `not_bol' is set in `bufp'), and, if
- `newline_anchor' is set, after newlines. */
+ (unless `not_bol' is set in `bufp'), and after newlines. */
case begline:
DEBUG_PRINT1 ("EXECUTING begline.\n");
{
unsigned char c;
GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
- if (c == '\n' && bufp->newline_anchor)
+ if (c == '\n')
break;
}
/* In all other cases, we fail. */
else
{
PREFETCH_NOLIMIT ();
- if (*d == '\n' && bufp->newline_anchor)
+ if (*d == '\n')
break;
}
goto fail;
the repetition text and either the following jump or
pop_failure_jump back to this on_failure_jump. */
case on_failure_jump:
- QUIT;
+ IMMEDIATE_QUIT_CHECK;
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
mcnt, p + mcnt);
then we can use a non-backtracking loop based on
on_failure_keep_string_jump instead of on_failure_jump. */
case on_failure_jump_smart:
- QUIT;
+ IMMEDIATE_QUIT_CHECK;
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
mcnt, p + mcnt);
{
- unsigned char *p1 = p; /* Next operation. */
- unsigned char *p2 = p + mcnt; /* Destination of the jump. */
+ re_char *p1 = p; /* Next operation. */
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
+ unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
p -= 3; /* Reset so that we will re-execute the
instruction once it's been changed. */
{
/* Use a fast `on_failure_keep_string_jump' loop. */
DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
- *p = (unsigned char) on_failure_keep_string_jump;
+ *p3 = (unsigned char) on_failure_keep_string_jump;
STORE_NUMBER (p2 - 2, mcnt + 3);
}
else
{
/* Default to a safe `on_failure_jump' loop. */
DEBUG_PRINT1 (" smart default => slow loop.\n");
- *p = (unsigned char) on_failure_jump;
+ *p3 = (unsigned char) on_failure_jump;
}
DEBUG_STATEMENT (debug -= 2);
}
/* Unconditionally jump (without popping any failure points). */
case jump:
unconditional_jump:
- QUIT;
+ IMMEDIATE_QUIT_CHECK;
EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
p += mcnt; /* Do the jump. */
/* Have to succeed matching what follows at least n times.
After that, handle like `on_failure_jump'. */
case succeed_n:
+ /* Signedness doesn't matter since we only compare MCNT to 0. */
EXTRACT_NUMBER (mcnt, p + 2);
DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
/* Originally, mcnt is how many times we HAVE to succeed. */
if (mcnt != 0)
{
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
mcnt--;
- p += 2;
- PUSH_FAILURE_COUNT (p);
- DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
- STORE_NUMBER_AND_INCR (p, mcnt);
+ p += 4;
+ PUSH_NUMBER (p2, mcnt);
}
else
/* The two bytes encoding mcnt == 0 are two no_op opcodes. */
break;
case jump_n:
+ /* Signedness doesn't matter since we only compare MCNT to 0. */
EXTRACT_NUMBER (mcnt, p + 2);
DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
/* Originally, this is how many times we CAN jump. */
if (mcnt != 0)
{
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
mcnt--;
- PUSH_FAILURE_COUNT (p + 2);
- STORE_NUMBER (p + 2, mcnt);
+ PUSH_NUMBER (p2, mcnt);
goto unconditional_jump;
}
/* If we don't have to jump any more, skip over the rest of the command. */
case set_number_at:
{
+ unsigned char *p2; /* Location of the counter. */
DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
EXTRACT_NUMBER_AND_INCR (mcnt, p);
- p1 = p + mcnt;
+ /* Here, we discard `const', making re_match non-reentrant. */
+ p2 = (unsigned char*) p + mcnt;
+ /* Signedness doesn't matter since we only copy MCNT's bits. */
EXTRACT_NUMBER_AND_INCR (mcnt, p);
- DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
- PUSH_FAILURE_COUNT (p1);
- STORE_NUMBER (p1, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
+ PUSH_NUMBER (p2, mcnt);
break;
}
{
/* C1 is the character before D, S1 is the syntax of C1, C2
is the character at D, and S2 is the syntax of C2. */
- int c1, c2, s1, s2;
+ re_wchar_t c1, c2;
+ int s1, s2;
#ifdef emacs
int offset = PTR_TO_OFFSET (d - 1);
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
{
/* C1 is the character before D, S1 is the syntax of C1, C2
is the character at D, and S2 is the syntax of C2. */
- int c1, c2, s1, s2;
+ re_wchar_t c1, c2;
+ int s1, s2;
#ifdef emacs
int offset = PTR_TO_OFFSET (d);
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
{
/* C1 is the character before D, S1 is the syntax of C1, C2
is the character at D, and S2 is the syntax of C2. */
- int c1, c2, s1, s2;
+ re_wchar_t c1, c2;
+ int s1, s2;
#ifdef emacs
int offset = PTR_TO_OFFSET (d) - 1;
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
}
#endif
{
- int c, len;
+ int len;
+ re_wchar_t c;
c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
PREFETCH ();
{
- int c, len;
+ int len;
+ re_wchar_t c;
+
c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
/* We goto here if a matching operation fails. */
fail:
- QUIT;
+ IMMEDIATE_QUIT_CHECK;
if (!FAIL_STACK_EMPTY ())
{
- re_char *str;
- unsigned char *pat;
+ re_char *str, *pat;
/* A restart point is known. Restore to that state. */
DEBUG_PRINT1 ("\nFAIL:\n");
POP_FAILURE_POINT (str, pat);
while (p1 < p1_end && p2 < p2_end)
{
int p1_charlen, p2_charlen;
- int p1_ch, p2_ch;
+ re_wchar_t p1_ch, p2_ch;
p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
setting no_sub. */
bufp->no_sub = 0;
- /* Match anchors at newline. */
- bufp->newline_anchor = 1;
-
ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
if (!ret)
return NULL;
return gettext (re_error_msgid[(int) ret]);
}
+WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
\f
/* Entry points compatible with 4.2 BSD regex library. We don't define
them unless specifically requested. */
/* Since `re_exec' always passes NULL for the `regs' argument, we
don't need to initialize the pattern buffer fields which affect it. */
- /* Match anchors at newlines. */
- re_comp_buf.newline_anchor = 1;
-
ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
if (!ret)
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
REG_EXTENDED bit in CFLAGS is set; otherwise, to
RE_SYNTAX_POSIX_BASIC;
- `newline_anchor' to REG_NEWLINE being set in CFLAGS;
- `fastmap' and `fastmap_accurate' to zero;
+ `fastmap' to an allocated space for the fastmap;
+ `fastmap_accurate' to zero;
`re_nsub' to the number of subexpressions in PATTERN.
PATTERN is the address of the pattern string.
int
regcomp (preg, pattern, cflags)
- regex_t *preg;
- const char *pattern;
+ regex_t *__restrict preg;
+ const char *__restrict pattern;
int cflags;
{
reg_errcode_t ret;
preg->allocated = 0;
preg->used = 0;
- /* Don't bother to use a fastmap when searching. This simplifies the
- REG_NEWLINE case: if we used a fastmap, we'd have to put all the
- characters after newlines into the fastmap. This way, we just try
- every character. */
- preg->fastmap = 0;
+ /* Try to allocate space for the fastmap. */
+ preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
if (cflags & REG_ICASE)
{
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
syntax &= ~RE_DOT_NEWLINE;
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
- /* It also changes the matching behavior. */
- preg->newline_anchor = 1;
}
else
- preg->newline_anchor = 0;
+ syntax |= RE_NO_NEWLINE_ANCHOR;
preg->no_sub = !!(cflags & REG_NOSUB);
/* POSIX doesn't distinguish between an unmatched open-group and an
unmatched close-group: both are REG_EPAREN. */
- if (ret == REG_ERPAREN) ret = REG_EPAREN;
-
+ if (ret == REG_ERPAREN)
+ ret = REG_EPAREN;
+
+ if (ret == REG_NOERROR && preg->fastmap)
+ { /* Compute the fastmap now, since regexec cannot modify the pattern
+ buffer. */
+ re_compile_fastmap (preg);
+ if (preg->can_be_null)
+ { /* The fastmap can't be used anyway. */
+ free (preg->fastmap);
+ preg->fastmap = NULL;
+ }
+ }
return (int) ret;
}
+WEAK_ALIAS (__regcomp, regcomp)
/* regexec searches for a given pattern, specified by PREG, in the
int
regexec (preg, string, nmatch, pmatch, eflags)
- const regex_t *preg;
- const char *string;
+ const regex_t *__restrict preg;
+ const char *__restrict string;
size_t nmatch;
regmatch_t pmatch[];
int eflags;
struct re_registers regs;
regex_t private_preg;
int len = strlen (string);
- boolean want_reg_info = !preg->no_sub && nmatch > 0;
+ boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
private_preg = *preg;
regs.end = regs.start + nmatch;
}
+ /* Instead of using not_eol to implement REG_NOTEOL, we could simply
+ pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
+ was a little bit longer but still only matching the real part.
+ This works because the `endline' will check for a '\n' and will find a
+ '\0', correctly deciding that this is not the end of a line.
+ But it doesn't work out so nicely for REG_NOTBOL, since we don't have
+ a convenient '\0' there. For all we know, the string could be preceded
+ by '\n' which would throw things off. */
+
/* Perform the searching operation. */
ret = re_search (&private_preg, string, len,
/* start: */ 0, /* range: */ len,
/* We want zero return to mean success, unlike `re_search'. */
return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
}
+WEAK_ALIAS (__regexec, regexec)
/* Returns a message corresponding to an error code, ERRCODE, returned
return msg_size;
}
+WEAK_ALIAS (__regerror, regerror)
/* Free dynamically allocated space used by PREG. */
free (preg->translate);
preg->translate = NULL;
}
+WEAK_ALIAS (__regfree, regfree)
#endif /* not emacs */