- structure the opcode space into opcode+flag.
- merge with glibc's regex.[ch].
- replace (succeed_n + jump_n + set_number_at) with something that doesn't
- need to modify the compiled regexp.
+ need to modify the compiled regexp so that re_match can be reentrant.
+ - get rid of on_failure_jump_smart by doing the optimization in re_comp
+ rather than at run-time, so that re_match can be reentrant.
*/
/* AIX requires this to be the first thing in the file. */
#pragma alloca
#endif
-#undef _GNU_SOURCE
-#define _GNU_SOURCE
-
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
/* Whether to use ISO C Amendment 1 wide char functions.
Those should not be used for Emacs since it uses its own. */
+#if defined _LIBC
+#define WIDE_CHAR_SUPPORT 1
+#else
#define WIDE_CHAR_SUPPORT \
- (defined _LIBC || HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
+ (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
+#endif
/* For platform which support the ISO C amendement 1 functionality we
support user defined character classes. */
# include "charset.h"
# include "category.h"
+# ifdef malloc
+# undef malloc
+# endif
# define malloc xmalloc
+# ifdef realloc
+# undef realloc
+# endif
# define realloc xrealloc
+# ifdef free
+# undef free
+# endif
# define free xfree
/* Converts the pointer to the char to BEG-based offset from the start. */
{ \
re_char *dtemp = (p) == (str2) ? (end1) : (p); \
re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
- while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
- c = STRING_CHAR (dtemp, (p) - dtemp); \
+ re_char *d0 = dtemp; \
+ PREV_CHAR_BOUNDARY (d0, dlimit); \
+ c = STRING_CHAR (d0, dtemp - d0); \
} \
else \
(c = ((p) == (str2) ? (end1) : (p))[-1]); \
# define SINGLE_BYTE_CHAR_P(c) (1)
# define SAME_CHARSET_P(c1, c2) (1)
# define MULTIBYTE_FORM_LENGTH(p, s) (1)
+# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
# define STRING_CHAR(p, s) (*(p))
# define RE_STRING_CHAR STRING_CHAR
# define CHAR_STRING(c, s) (*(s) = (c), 1)
if (start == NULL)
{
- printf ("(null)\n");
+ fprintf (stderr, "(null)\n");
return;
}
/* Loop over pattern commands. */
while (p < pend)
{
- printf ("%d:\t", p - start);
+ fprintf (stderr, "%d:\t", p - start);
switch ((re_opcode_t) *p++)
{
case no_op:
- printf ("/no_op");
+ fprintf (stderr, "/no_op");
break;
case succeed:
- printf ("/succeed");
+ fprintf (stderr, "/succeed");
break;
case exactn:
mcnt = *p++;
- printf ("/exactn/%d", mcnt);
+ fprintf (stderr, "/exactn/%d", mcnt);
do
{
- putchar ('/');
- putchar (*p++);
+ fprintf (stderr, "/%c", *p++);
}
while (--mcnt);
break;
case start_memory:
- printf ("/start_memory/%d", *p++);
+ fprintf (stderr, "/start_memory/%d", *p++);
break;
case stop_memory:
- printf ("/stop_memory/%d", *p++);
+ fprintf (stderr, "/stop_memory/%d", *p++);
break;
case duplicate:
- printf ("/duplicate/%d", *p++);
+ fprintf (stderr, "/duplicate/%d", *p++);
break;
case anychar:
- printf ("/anychar");
+ fprintf (stderr, "/anychar");
break;
case charset:
int length = CHARSET_BITMAP_SIZE (p - 1);
int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
- printf ("/charset [%s",
+ fprintf (stderr, "/charset [%s",
(re_opcode_t) *(p - 1) == charset_not ? "^" : "");
assert (p + *p < pend);
/* Are we starting a range? */
if (last + 1 == c && ! in_range)
{
- putchar ('-');
+ fprintf (stderr, "-");
in_range = 1;
}
/* Have we broken a range? */
else if (last + 1 != c && in_range)
{
- putchar (last);
+ fprintf (stderr, "%c", last);
in_range = 0;
}
if (! in_range)
- putchar (c);
+ fprintf (stderr, "%c", c);
last = c;
}
if (in_range)
- putchar (last);
+ fprintf (stderr, "%c", last);
- putchar (']');
+ fprintf (stderr, "]");
p += 1 + length;
if (has_range_table)
{
int count;
- printf ("has-range-table");
+ fprintf (stderr, "has-range-table");
/* ??? Should print the range table; for now, just skip it. */
p += 2; /* skip range table bits */
break;
case begline:
- printf ("/begline");
+ fprintf (stderr, "/begline");
break;
case endline:
- printf ("/endline");
+ fprintf (stderr, "/endline");
break;
case on_failure_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump to %d", p + mcnt - start);
+ fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
break;
case on_failure_keep_string_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
+ fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
break;
case on_failure_jump_nastyloop:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
+ fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
break;
case on_failure_jump_loop:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump_loop to %d", p + mcnt - start);
+ fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
break;
case on_failure_jump_smart:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump_smart to %d", p + mcnt - start);
+ fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
break;
case jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/jump to %d", p + mcnt - start);
+ fprintf (stderr, "/jump to %d", p + mcnt - start);
break;
case succeed_n:
extract_number_and_incr (&mcnt, &p);
extract_number_and_incr (&mcnt2, &p);
- printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
+ fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
break;
case jump_n:
extract_number_and_incr (&mcnt, &p);
extract_number_and_incr (&mcnt2, &p);
- printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
+ fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
break;
case set_number_at:
extract_number_and_incr (&mcnt, &p);
extract_number_and_incr (&mcnt2, &p);
- printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
+ fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
break;
case wordbound:
- printf ("/wordbound");
+ fprintf (stderr, "/wordbound");
break;
case notwordbound:
- printf ("/notwordbound");
+ fprintf (stderr, "/notwordbound");
break;
case wordbeg:
- printf ("/wordbeg");
+ fprintf (stderr, "/wordbeg");
break;
case wordend:
- printf ("/wordend");
+ fprintf (stderr, "/wordend");
case syntaxspec:
- printf ("/syntaxspec");
+ fprintf (stderr, "/syntaxspec");
mcnt = *p++;
- printf ("/%d", mcnt);
+ fprintf (stderr, "/%d", mcnt);
break;
case notsyntaxspec:
- printf ("/notsyntaxspec");
+ fprintf (stderr, "/notsyntaxspec");
mcnt = *p++;
- printf ("/%d", mcnt);
+ fprintf (stderr, "/%d", mcnt);
break;
# ifdef emacs
case before_dot:
- printf ("/before_dot");
+ fprintf (stderr, "/before_dot");
break;
case at_dot:
- printf ("/at_dot");
+ fprintf (stderr, "/at_dot");
break;
case after_dot:
- printf ("/after_dot");
+ fprintf (stderr, "/after_dot");
break;
case categoryspec:
- printf ("/categoryspec");
+ fprintf (stderr, "/categoryspec");
mcnt = *p++;
- printf ("/%d", mcnt);
+ fprintf (stderr, "/%d", mcnt);
break;
case notcategoryspec:
- printf ("/notcategoryspec");
+ fprintf (stderr, "/notcategoryspec");
mcnt = *p++;
- printf ("/%d", mcnt);
+ fprintf (stderr, "/%d", mcnt);
break;
# endif /* emacs */
case begbuf:
- printf ("/begbuf");
+ fprintf (stderr, "/begbuf");
break;
case endbuf:
- printf ("/endbuf");
+ fprintf (stderr, "/endbuf");
break;
default:
- printf ("?%d", *(p-1));
+ fprintf (stderr, "?%d", *(p-1));
}
- putchar ('\n');
+ fprintf (stderr, "\n");
}
- printf ("%d:\tend of pattern.\n", p - start);
+ fprintf (stderr, "%d:\tend of pattern.\n", p - start);
}
/* Roughly the maximum number of failure points on the stack. Would be
exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
This is a variable only so users of regex can assign to it; we never
- change it ourselves. */
+ change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
+ before using it, so it should probably be a byte-count instead. */
# if defined MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. In order for a larger
if (reg == -1) \
{ \
/* It's a counter. */ \
- /* Here, we discard `const', which makes re_match non-reentrant. \
- Gcc gives a warning for it, which is good. */ \
- unsigned char *ptr = POP_FAILURE_POINTER (); \
+ /* Here, we discard `const', making re_match non-reentrant. */ \
+ unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
reg = POP_FAILURE_INT (); \
STORE_NUMBER (ptr, reg); \
DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
/* Check that we are not stuck in an infinite loop. */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
- int failure = TOP_FAILURE_HANDLE(); \
+ int failure = TOP_FAILURE_HANDLE (); \
/* Check for infinite matching loops */ \
- while (failure > 0 && \
- (FAILURE_STR (failure) == string_place \
- || FAILURE_STR (failure) == NULL)) \
+ while (failure > 0 \
+ && (FAILURE_STR (failure) == string_place \
+ || FAILURE_STR (failure) == NULL)) \
{ \
assert (FAILURE_PAT (failure) >= bufp->buffer \
&& FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
if (FAILURE_PAT (failure) == pat_cur) \
- goto fail; \
+ { \
+ cycle = 1; \
+ break; \
+ } \
DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
failure = NEXT_FAILURE_HANDLE(failure); \
} \
DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
} while (0)
-
+
/* Push the information about the state we will need
if we ever fail back to it.
/* Estimate the size of data pushed by a typical failure stack entry.
An estimate is all we need, because all we use this for
is to choose a limit for how big to make the failure stack. */
-
+/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
#define TYPICAL_FAILURE_SIZE 20
/* How many items can still be added to the stack without overflowing it. */
static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
char *fastmap, const int multibyte));
-/* Fetch the next character in the uncompiled pattern---translating it
- if necessary. */
-#define PATFETCH(c) \
- do { \
- PATFETCH_RAW (c); \
- c = TRANSLATE (c); \
- } while (0)
-
/* Fetch the next character in the uncompiled pattern, with no
translation. */
-#define PATFETCH_RAW(c) \
+#define PATFETCH(c) \
do { \
int len; \
if (p == pend) return REG_EEND; \
/* But patterns can have more than `MAX_REGNUM' registers. We just
ignore the excess. */
-typedef unsigned regnum_t;
+typedef int regnum_t;
/* Macros for the compile stack. */
/* The next available element. */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
-
+/* Explicit quit checking is only used on NTemacs.
+ When WINDOWSNT, emacs and QUIT are all defined, IMMEDIATE_QUIT_CHECK
+ invokes QUIT if immediate_quit is nonzero; in every other
+ configuration it expands to a no-op expression. */
+#if defined WINDOWSNT && defined emacs && defined QUIT
+extern int immediate_quit;
+# define IMMEDIATE_QUIT_CHECK \
+ do { \
+ if (immediate_quit) QUIT; \
+ } while (0)
+#else
+# define IMMEDIATE_QUIT_CHECK ((void)0)
+#endif
+\f
/* Structure to manage work area for range table. */
struct range_table_work_area
{
int bits; /* flag to record character classes */
};
-/* Make sure that WORK_AREA can hold more N multibyte characters. */
-#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
- do { \
- if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
- { \
- (work_area).allocated += 16 * sizeof (int); \
- if ((work_area).table) \
- (work_area).table \
- = (int *) realloc ((work_area).table, (work_area).allocated); \
- else \
- (work_area).table \
- = (int *) malloc ((work_area).allocated); \
- if ((work_area).table == 0) \
- FREE_STACK_RETURN (REG_ESPACE); \
- } \
+/* Make sure that WORK_AREA can hold more N multibyte characters.
+ This is used only in set_image_of_range and set_image_of_range_1.
+ It expects WORK_AREA to be a pointer.
+ If it can't get the space, it returns from the surrounding function. */
+
+#define EXTEND_RANGE_TABLE(work_area, n) \
+ do { \
+ if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
+ { \
+ extend_range_table_work_area (work_area); \
+ if ((work_area)->table == 0) \
+ return (REG_ESPACE); \
+ } \
} while (0)
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
+/* Set a range START..END to WORK_AREA.
+ The range is passed through TRANSLATE, so START and END
+ should be untranslated. */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
do { \
- EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
- (work_area).table[(work_area).used++] = (range_start); \
- (work_area).table[(work_area).used++] = (range_end); \
+ int tem; \
+ tem = set_image_of_range (&work_area, start, end, translate); \
+ if (tem > 0) \
+ FREE_STACK_RETURN (tem); \
} while (0)
/* Free allocated memory for WORK_AREA. */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
-
+\f
/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
do { if (p != pend) \
{ \
PATFETCH (c); \
+ if (c == ' ') \
+ FREE_STACK_RETURN (REG_BADBR); \
while ('0' <= c && c <= '9') \
{ \
+ int prev; \
if (num < 0) \
- num = 0; \
+ num = 0; \
+ prev = num; \
num = num * 10 + c - '0'; \
+ if (num / 10 != prev) \
+ FREE_STACK_RETURN (REG_BADBR); \
if (p == pend) \
- break; \
+ break; \
PATFETCH (c); \
} \
+ if (c == ' ') \
+ FREE_STACK_RETURN (REG_BADBR); \
} \
} while (0)
-
+\f
#if WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
and the functions from ISO C amendement 1. */
/* Map a string to the char class it names (if any). */
static re_wctype_t
-re_wctype (string)
- re_char *string;
+re_wctype (str)
+ re_char *str;
{
+ const char *string = str;
if (STREQ (string, "alnum")) return RECC_ALNUM;
else if (STREQ (string, "alpha")) return RECC_ALPHA;
else if (STREQ (string, "word")) return RECC_WORD;
int ch;
re_wctype_t cc;
{
- boolean ret = false;
-
switch (cc)
{
- case RECC_ALNUM: ret = ISALNUM (ch);
- case RECC_ALPHA: ret = ISALPHA (ch);
- case RECC_BLANK: ret = ISBLANK (ch);
- case RECC_CNTRL: ret = ISCNTRL (ch);
- case RECC_DIGIT: ret = ISDIGIT (ch);
- case RECC_GRAPH: ret = ISGRAPH (ch);
- case RECC_LOWER: ret = ISLOWER (ch);
- case RECC_PRINT: ret = ISPRINT (ch);
- case RECC_PUNCT: ret = ISPUNCT (ch);
- case RECC_SPACE: ret = ISSPACE (ch);
- case RECC_UPPER: ret = ISUPPER (ch);
- case RECC_XDIGIT: ret = ISXDIGIT (ch);
- case RECC_ASCII: ret = IS_REAL_ASCII (ch);
- case RECC_NONASCII: ret = !IS_REAL_ASCII (ch);
- case RECC_UNIBYTE: ret = ISUNIBYTE (ch);
- case RECC_MULTIBYTE: ret = !ISUNIBYTE (ch);
- case RECC_WORD: ret = ISWORD (ch);
- case RECC_ERROR: ret = false;
+ case RECC_ALNUM: return ISALNUM (ch);
+ case RECC_ALPHA: return ISALPHA (ch);
+ case RECC_BLANK: return ISBLANK (ch);
+ case RECC_CNTRL: return ISCNTRL (ch);
+ case RECC_DIGIT: return ISDIGIT (ch);
+ case RECC_GRAPH: return ISGRAPH (ch);
+ case RECC_LOWER: return ISLOWER (ch);
+ case RECC_PRINT: return ISPRINT (ch);
+ case RECC_PUNCT: return ISPUNCT (ch);
+ case RECC_SPACE: return ISSPACE (ch);
+ case RECC_UPPER: return ISUPPER (ch);
+ case RECC_XDIGIT: return ISXDIGIT (ch);
+ case RECC_ASCII: return IS_REAL_ASCII (ch);
+ case RECC_NONASCII: return !IS_REAL_ASCII (ch);
+ case RECC_UNIBYTE: return ISUNIBYTE (ch);
+ case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
+ case RECC_WORD: return ISWORD (ch);
+ case RECC_ERROR: return false;
+ default:
+ abort();
}
- return ret;
}
/* Return a bit-pattern to use in the range-table bits to match multibyte
re_wctype_to_bit (cc)
re_wctype_t cc;
{
- int ret = 0;
-
switch (cc)
{
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
- case RECC_MULTIBYTE: ret = BIT_MULTIBYTE;
- case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: ret = BIT_WORD;
- case RECC_LOWER: ret = BIT_LOWER;
- case RECC_UPPER: ret = BIT_UPPER;
- case RECC_PUNCT: ret = BIT_PUNCT;
- case RECC_SPACE: ret = BIT_SPACE;
+ case RECC_MULTIBYTE: return BIT_MULTIBYTE;
+ case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
+ case RECC_LOWER: return BIT_LOWER;
+ case RECC_UPPER: return BIT_UPPER;
+ case RECC_PUNCT: return BIT_PUNCT;
+ case RECC_SPACE: return BIT_SPACE;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
- case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: ret = 0;
+ case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
+ default:
+ abort();
}
- return ret;
}
#endif
+\f
+/* Filling in the work area of a range. */
-/* Explicit quit checking is only used on NTemacs. */
-#if defined WINDOWSNT && defined emacs && defined QUIT
-extern int immediate_quit;
-# define IMMEDIATE_QUIT_CHECK \
- do { \
- if (immediate_quit) QUIT; \
- } while (0)
-#else
-# define IMMEDIATE_QUIT_CHECK ((void)0)
+/* Actually extend the space in WORK_AREA.
+ The `allocated' byte count grows by room for 16 more ints; the
+ existing table is then reallocated to that size, or a fresh table
+ is allocated when there is none yet.
+ The result of the allocation is not checked here: the
+ EXTEND_RANGE_TABLE macro, which calls this, tests `table' against 0
+ afterwards. */
+
+static void
+extend_range_table_work_area (work_area)
+ struct range_table_work_area *work_area;
+{
+ work_area->allocated += 16 * sizeof (int);
+ if (work_area->table)
+ work_area->table
+ = (int *) realloc (work_area->table, work_area->allocated);
+ else
+ work_area->table
+ = (int *) malloc (work_area->allocated);
+}
+
+#ifdef emacs
+
+/* Carefully find the ranges of codes that are equivalent
+ under case conversion to the range start..end when passed through
+ TRANSLATE. Handle the case where non-letters can come in between
+ two upper-case letters (which happens in Latin-1).
+ Also handle the case of groups of more than 2 case-equivalent chars.
+
+ The basic method is to look at consecutive characters and see
+ if they can form a run that can be handled as one.
+
+ WORK_AREA receives the result: each range is appended to its table
+ as a (first, last) pair of character codes.
+ START and END are the inclusive bounds of the range to process.
+ TRANSLATE is the translation table; if RE_TRANSLATE_P says it is not
+ usable, the single pair START, END is recorded unchanged.
+
+ Returns -1 if successful, REG_ESPACE if ran out of space. */
+
+static int
+set_image_of_range_1 (work_area, start, end, translate)
+ RE_TRANSLATE_TYPE translate;
+ struct range_table_work_area *work_area;
+ re_wchar_t start, end;
+{
+ /* `one_case' indicates a character, or a run of characters,
+ each of which is an isolate (no case-equivalents).
+ This includes all ASCII non-letters.
+
+ `two_case' indicates a character, or a run of characters,
+ each of which has two case-equivalent forms.
+ This includes all ASCII letters.
+
+ `strange' indicates a character that has more than one
+ case-equivalent. */
+
+ enum case_type {one_case, two_case, strange};
+
+ /* Describe the run that is in progress,
+ which the next character can try to extend.
+ If run_type is strange, that means there really is no run.
+ If run_type is one_case, then run_start...run_end is the run.
+ If run_type is two_case, then the run is run_start...run_end,
+ and the case-equivalents end at run_eqv_end. */
+
+ enum case_type run_type = strange;
+ int run_start, run_end, run_eqv_end;
+
+ Lisp_Object eqv_table;
+
+ if (!RE_TRANSLATE_P (translate))
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (start);
+ work_area->table[work_area->used++] = (end);
+ return -1;
+ }
+
+ eqv_table = XCHAR_TABLE (translate)->extras[2];
+
+ for (; start <= end; start++)
+ {
+ enum case_type this_type;
+ int eqv = RE_TRANSLATE (eqv_table, start);
+ int minchar, maxchar;
+
+ /* Classify this character by following it through the
+ equivalence table: it maps to itself (one_case), it comes
+ back to itself after two steps (two_case), or neither
+ (strange). */
+ if (eqv == start)
+ this_type = one_case;
+ else if (RE_TRANSLATE (eqv_table, eqv) == start)
+ this_type = two_case;
+ else
+ this_type = strange;
+
+ if (start < eqv)
+ minchar = start, maxchar = eqv;
+ else
+ minchar = eqv, maxchar = start;
+
+ /* Can this character extend the run in progress?
+ It can only if it is not strange, has the same type as the run,
+ the smaller of the character and its equivalent directly follows
+ run_end, and (for a two_case run) the larger of the two directly
+ follows run_eqv_end. */
+ if (this_type == strange || this_type != run_type
+ || !(minchar == run_end + 1
+ && (run_type == two_case
+ ? maxchar == run_eqv_end + 1 : 1)))
+ {
+ /* No, end the run.
+ Record each of its equivalent ranges: the run itself and,
+ for a two_case run, the images of its two ends as well. */
+ if (run_type == one_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ }
+ else if (run_type == two_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 4);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_start);
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_end);
+ }
+ run_type = strange;
+ }
+
+ if (this_type == strange)
+ {
+ /* For a strange character, add each of its equivalents, one
+ by one. Don't start a range.
+ The loop follows the equivalence table from the first
+ equivalent until it arrives back at the character itself. */
+ do
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = eqv;
+ work_area->table[work_area->used++] = eqv;
+ eqv = RE_TRANSLATE (eqv_table, eqv);
+ }
+ while (eqv != start);
+ }
+
+ /* Add this char to the run, or start a new run. */
+ else if (run_type == strange)
+ {
+ /* Initialize a new range. */
+ run_type = this_type;
+ run_start = start;
+ run_end = start;
+ run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+ }
+ else
+ {
+ /* Extend a running range. */
+ run_end = minchar;
+ run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+ }
+ }
+
+ /* If a run is still in progress at the end, finish it now
+ by recording its equivalent ranges. */
+ if (run_type == one_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ }
+ else if (run_type == two_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 4);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_start);
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_end);
+ }
+
+ return -1;
+}
+
+#endif /* emacs */
+
+/* Record the image of the range start..end when passed through
+ TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+ and is not even necessarily contiguous.
+ Normally we approximate it with the smallest contiguous range that contains
+ all the chars we need. However, for Latin-1 we go to extra effort
+ to do a better job.
+
+ This function is not called for ASCII ranges.
+
+ WORK_AREA receives the result as (first, last) pairs appended to its
+ table: the range START..END itself and, when TRANSLATE maps any
+ character of that range outside of it, one further pair spanning the
+ smallest and largest such images.
+ START and END are the inclusive bounds of the range.
+ TRANSLATE is the translation table; it is consulted only when
+ RE_TRANSLATE_P accepts it.
+
+ Returns -1 if successful, REG_ESPACE if ran out of space. */
+
+static int
+set_image_of_range (work_area, start, end, translate)
+ RE_TRANSLATE_TYPE translate;
+ struct range_table_work_area *work_area;
+ re_wchar_t start, end;
+{
+ re_wchar_t cmin, cmax;
+
+#ifdef emacs
+ /* For Latin-1 ranges, use set_image_of_range_1
+ to get proper handling of ranges that include letters and nonletters.
+ For a range that includes the whole of Latin-1, this is not necessary.
+ For other character sets, we don't bother to get this right.
+ The part of the range up to 04377 is handed to set_image_of_range_1;
+ anything from 04400 on falls through to the generic code below. */
+ if (RE_TRANSLATE_P (translate) && start < 04400
+ && !(start < 04200 && end >= 04377))
+ {
+ int newend;
+ int tem;
+ newend = end;
+ if (newend > 04377)
+ newend = 04377;
+ tem = set_image_of_range_1 (work_area, start, newend, translate);
+ if (tem > 0)
+ return tem;
+
+ start = 04400;
+ if (end < 04400)
+ return -1;
+ }
#endif
+
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (start);
+ work_area->table[work_area->used++] = (end);
+
+ cmin = -1, cmax = -1;
+
+ if (RE_TRANSLATE_P (translate))
+ {
+ int ch;
+
+ for (ch = start; ch <= end; ch++)
+ {
+ re_wchar_t c = TRANSLATE (ch);
+ if (! (start <= c && c <= end))
+ {
+ if (cmin == -1)
+ cmin = c, cmax = c;
+ else
+ {
+ cmin = MIN (cmin, c);
+ cmax = MAX (cmax, c);
+ }
+ }
+ }
+
+ if (cmin != -1)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (cmin);
+ work_area->table[work_area->used++] = (cmax);
+ }
+ }
+
+ return -1;
+}
\f
#ifndef MATCH_MAY_ALLOCATE
unsigned int startoffset = 0;
re_opcode_t ofj =
/* Check if the loop can match the empty string. */
- (simple || !analyse_first (laststart, b, NULL, 0)) ?
- on_failure_jump : on_failure_jump_loop;
+ (simple || !analyse_first (laststart, b, NULL, 0))
+ ? on_failure_jump : on_failure_jump_loop;
assert (skip_one_char (laststart) <= b);
-
+
if (!zero_times_ok && simple)
{ /* Since simple * loops can be made faster by using
on_failure_keep_string_jump, we turn simple P+
{
boolean emptyp = analyse_first (laststart, b, NULL, 0);
- /* The non-greedy multiple match looks like a repeat..until:
- we only need a conditional jump at the end of the loop */
+ /* The non-greedy multiple match looks like
+ a repeat..until: we only need a conditional jump
+ at the end of the loop. */
if (emptyp) BUF_PUSH (no_op);
STORE_JUMP (emptyp ? on_failure_jump_nastyloop
: on_failure_jump, b, laststart);
{
/* The repeat...until naturally matches one or more.
To also match zero times, we need to first jump to
- the end of the loop (its conditional jump). */
+ the end of the loop (its conditional jump). */
INSERT_JUMP (jump, laststart, b);
b += 3;
}
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+ /* Don't translate yet. The range TRANSLATE(X..Y) cannot
+ always be determined from TRANSLATE(X) and TRANSLATE(Y).
+ So the translation is done later in a loop. Example:
+ (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
PATFETCH (c);
/* \ might escape characters inside [...] and [^...]. */
them). */
if (c == ':' && *p == ']')
{
- int ch;
+ re_wchar_t ch;
re_wctype_t cc;
cc = re_wctype (str);
starting at the smallest character in
the charset of C1 and ending at C1. */
int charset = CHAR_CHARSET (c1);
- int c2 = MAKE_CHAR (charset, 0, 0);
-
+ re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
+
SET_RANGE_TABLE_WORK_AREA (range_table_work,
c2, c1);
c1 = 0377;
/* ... into bitmap. */
{
re_wchar_t this_char;
- int range_start = c, range_end = c1;
+ re_wchar_t range_start = c, range_end = c1;
/* If the start is after the end, the range is empty. */
if (range_start > range_end)
/* Do not translate the character after the \, so that we can
distinguish, e.g., \B from \b, even if we normally would
translate, e.g., B to b. */
- PATFETCH_RAW (c);
+ PATFETCH (c);
switch (c)
{
goto unfetch_interval;
}
- if (upper_bound == 0)
- /* If the upper bound is zero, just drop the sub pattern
- altogether. */
- b = laststart;
- else if (lower_bound == 1 && upper_bound == 1)
- /* Just match it once: nothing to do here. */
- ;
-
- /* Otherwise, we have a nontrivial interval. When
- we're all done, the pattern will look like:
- set_number_at <jump count> <upper bound>
- set_number_at <succeed_n count> <lower bound>
- succeed_n <after jump addr> <succeed_n count>
- <body of loop>
- jump_n <succeed_n addr> <jump count>
- (The upper bound and `jump_n' are omitted if
- `upper_bound' is 1, though.) */
- else
- { /* If the upper bound is > 1, we need to insert
- more at the end of the loop. */
- unsigned int nbytes = (upper_bound < 0 ? 3
- : upper_bound > 1 ? 5 : 0);
- unsigned int startoffset = 0;
-
- GET_BUFFER_SPACE (20); /* We might use less. */
-
- if (lower_bound == 0)
- {
- /* A succeed_n that starts with 0 is really a
- a simple on_failure_jump_loop. */
- INSERT_JUMP (on_failure_jump_loop, laststart,
- b + 3 + nbytes);
- b += 3;
- }
- else
- {
- /* Initialize lower bound of the `succeed_n', even
- though it will be set during matching by its
- attendant `set_number_at' (inserted next),
- because `re_compile_fastmap' needs to know.
- Jump to the `jump_n' we might insert below. */
- INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + nbytes,
- lower_bound);
- b += 5;
-
- /* Code to initialize the lower bound. Insert
- before the `succeed_n'. The `5' is the last two
- bytes of this `set_number_at', plus 3 bytes of
- the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
- startoffset += 5;
- }
-
- if (upper_bound < 0)
- {
- /* A negative upper bound stands for infinity,
- in which case it degenerates to a plain jump. */
- STORE_JUMP (jump, b, laststart + startoffset);
- b += 3;
- }
- else if (upper_bound > 1)
- { /* More than one repetition is allowed, so
- append a backward jump to the `succeed_n'
- that starts this interval.
-
- When we've reached this during matching,
- we'll have matched the interval once, so
- jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + startoffset,
- upper_bound - 1);
- b += 5;
-
- /* The location we want to set is the second
- parameter of the `jump_n'; that is `b-2' as
- an absolute address. `laststart' will be
- the `set_number_at' we're about to insert;
- `laststart+3' the number to set, the source
- for the relative address. But we are
- inserting into the middle of the pattern --
- so everything is getting moved up by 5.
- Conclusion: (b - 2) - (laststart + 3) + 5,
- i.e., b - laststart.
-
- We insert this at the beginning of the loop
- so that if we fail during matching, we'll
- reinitialize the bounds. */
- insert_op2 (set_number_at, laststart, b - laststart,
- upper_bound - 1, b);
- b += 5;
- }
- }
+ if (upper_bound == 0)
+ /* If the upper bound is zero, just drop the sub pattern
+ altogether. */
+ b = laststart;
+ else if (lower_bound == 1 && upper_bound == 1)
+ /* Just match it once: nothing to do here. */
+ ;
+
+ /* Otherwise, we have a nontrivial interval. When
+ we're all done, the pattern will look like:
+ set_number_at <jump count> <upper bound>
+ set_number_at <succeed_n count> <lower bound>
+ succeed_n <after jump addr> <succeed_n count>
+ <body of loop>
+ jump_n <succeed_n addr> <jump count>
+ (The upper bound and `jump_n' are omitted if
+ `upper_bound' is 1, though.) */
+ else
+ { /* If the upper bound is > 1, we need to insert
+ more at the end of the loop. */
+ unsigned int nbytes = (upper_bound < 0 ? 3
+ : upper_bound > 1 ? 5 : 0);
+ unsigned int startoffset = 0;
+
+ GET_BUFFER_SPACE (20); /* We might use less. */
+
+ if (lower_bound == 0)
+ {
+ /* A succeed_n that starts with 0 is really
+ a simple on_failure_jump_loop. */
+ INSERT_JUMP (on_failure_jump_loop, laststart,
+ b + 3 + nbytes);
+ b += 3;
+ }
+ else
+ {
+ /* Initialize lower bound of the `succeed_n', even
+ though it will be set during matching by its
+ attendant `set_number_at' (inserted next),
+ because `re_compile_fastmap' needs to know.
+ Jump to the `jump_n' we might insert below. */
+ INSERT_JUMP2 (succeed_n, laststart,
+ b + 5 + nbytes,
+ lower_bound);
+ b += 5;
+
+ /* Code to initialize the lower bound. Insert
+ before the `succeed_n'. The `5' is the last two
+ bytes of this `set_number_at', plus 3 bytes of
+ the following `succeed_n'. */
+ insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+ b += 5;
+ startoffset += 5;
+ }
+
+ if (upper_bound < 0)
+ {
+ /* A negative upper bound stands for infinity,
+ in which case it degenerates to a plain jump. */
+ STORE_JUMP (jump, b, laststart + startoffset);
+ b += 3;
+ }
+ else if (upper_bound > 1)
+ { /* More than one repetition is allowed, so
+ append a backward jump to the `succeed_n'
+ that starts this interval.
+
+ When we've reached this during matching,
+ we'll have matched the interval once, so
+ jump back only `upper_bound - 1' times. */
+ STORE_JUMP2 (jump_n, b, laststart + startoffset,
+ upper_bound - 1);
+ b += 5;
+
+ /* The location we want to set is the second
+ parameter of the `jump_n'; that is `b-2' as
+ an absolute address. `laststart' will be
+ the `set_number_at' we're about to insert;
+ `laststart+3' the number to set, the source
+ for the relative address. But we are
+ inserting into the middle of the pattern --
+ so everything is getting moved up by 5.
+ Conclusion: (b - 2) - (laststart + 3) + 5,
+ i.e., b - laststart.
+
+ We insert this at the beginning of the loop
+ so that if we fail during matching, we'll
+ reinitialize the bounds. */
+ insert_op2 (set_number_at, laststart, b - laststart,
+ upper_bound - 1, b);
+ b += 5;
+ }
+ }
pending_exact = 0;
beg_interval = NULL;
}
case 'c':
laststart = b;
- PATFETCH_RAW (c);
+ PATFETCH (c);
BUF_PUSH_2 (categoryspec, c);
break;
case 'C':
laststart = b;
- PATFETCH_RAW (c);
+ PATFETCH (c);
BUF_PUSH_2 (notcategoryspec, c);
break;
#endif /* emacs */
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
- if (syntax & RE_NO_BK_REFS)
- goto normal_char;
+ {
+ regnum_t reg;
- c1 = c - '0';
+ if (syntax & RE_NO_BK_REFS)
+ goto normal_backslash;
- if (c1 > regnum)
- FREE_STACK_RETURN (REG_ESUBREG);
+ reg = c - '0';
- /* Can't back reference to a subexpression if inside of it. */
- if (group_in_compile_stack (compile_stack, (regnum_t) c1))
- goto normal_char;
+ /* Can't back reference to a subexpression before its end. */
+ if (reg > regnum || group_in_compile_stack (compile_stack, reg))
+ FREE_STACK_RETURN (REG_ESUBREG);
- laststart = b;
- BUF_PUSH_2 (duplicate, c1);
+ laststart = b;
+ BUF_PUSH_2 (duplicate, reg);
+ }
break;
/* You might think it would be useful for \ to mean
not to translate; but if we don't translate it
it will never match anything. */
- c = TRANSLATE (c);
goto normal_char;
}
break;
default:
/* Expects the character in `c'. */
normal_char:
- /* If no exactn currently being built. */
+ /* If no exactn currently being built. */
if (!pending_exact
/* If last exactn not at current position. */
{
int len;
+ c = TRANSLATE (c);
if (multibyte)
len = CHAR_STRING (c, b);
else
case has already been handled, so we only need to look at the
fallthrough case. */
continue;
-
+
case succeed_n:
/* If N == 0, it should be an on_failure_jump_loop instead. */
DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
}
WEAK_ALIAS (__re_search, re_search)
+/* Head address of virtual concatenation of string: the start of
+   whichever string holds offset P, i.e. string2 when P >= size1 and
+   string1 otherwise.  Relies on size1, string1 and string2 being in
+   scope at the point of use.  */
+#define HEAD_ADDR_VSTRING(P) \
+ (((P) >= size1 ? string2 : string1))
+
/* End address of virtual concatenation of string. */
#define STOP_ADDR_VSTRING(P) \
(((P) >= size1 ? string2 + size2 : string1 + size1))
/* Update STARTPOS to the previous character boundary. */
if (multibyte)
{
- re_char *p = POS_ADDR_VSTRING (startpos);
- int len = 0;
+ re_char *p = POS_ADDR_VSTRING (startpos) + 1;
+ re_char *p0 = p;
+ re_char *phead = HEAD_ADDR_VSTRING (startpos);
/* Find the head of multibyte form. */
- while (!CHAR_HEAD_P (*p))
- p--, len++;
-
- /* Adjust it. */
-#if 0 /* XXX */
- if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
- ;
- else
-#endif
- {
- range += len;
- if (range > 0)
- break;
+ PREV_CHAR_BOUNDARY (p, phead);
+ range += p0 - 1 - p;
+ if (range > 0)
+ break;
- startpos -= len;
- }
+ startpos -= p0 - 1 - p;
}
}
}
{
case anychar:
break;
-
+
case exactn:
p += *p + 1;
break;
else
p += 1 + CHARSET_BITMAP_SIZE (p - 1);
break;
-
+
case syntaxspec:
case notsyntaxspec:
#ifdef emacs
return 1;
}
break;
-
+
case endline:
case exactn:
{
register re_wchar_t c
= (re_opcode_t) *p2 == endline ? '\n'
- : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
+ : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
if ((re_opcode_t) *p1 == exactn)
{
break;
case charset:
- case charset_not:
{
if ((re_opcode_t) *p1 == exactn)
/* Reuse the code above. */
return mutually_exclusive_p (bufp, p2, p1);
-
/* It is hard to list up all the character in charset
P2 if it includes multibyte character. Give up in
such case. */
P2 is ASCII, it is enough to test only bitmap
table of P1. */
- if (*p1 == *p2)
+ if ((re_opcode_t) *p1 == charset)
{
int idx;
/* We win if the charset inside the loop
return 1;
}
}
- else if ((re_opcode_t) *p1 == charset
- || (re_opcode_t) *p1 == charset_not)
+ else if ((re_opcode_t) *p1 == charset_not)
{
int idx;
/* We win if the charset_not inside the loop lists
}
}
}
-
+ break;
+
+ case charset_not:
+ switch (SWITCH_ENUM_CAST (*p1))
+ {
+ case exactn:
+ case charset:
+ /* Reuse the code above. */
+ return mutually_exclusive_p (bufp, p2, p1);
+ case charset_not:
+ /* When we have two charset_not, it's very unlikely that
+ they don't overlap. For them to be disjoint, the union of
+ the two sets of excluded chars would have to cover all
+ possible chars, which, as a matter of fact, is virtually
+ impossible in multibyte buffers. */
+ break;
+ }
+ break;
+
case wordend:
case notsyntaxspec:
return ((re_opcode_t) *p1 == syntaxspec
assert (!REG_UNSET (regstart[*p]));
/* Strictly speaking, there should be code such as:
-
+
assert (REG_UNSET (regend[*p]));
PUSH_FAILURE_REGSTOP ((unsigned int)*p);
cycle detection cannot work. Worse yet, such a detection
can not only fail to detect a cycle, but it can also wrongly
detect a cycle (between different instantiations of the same
- loop.
+ loop).
So the method used for those nasty loops is a little different:
We use a special cycle-detection-stack-frame which is pushed
when the on_failure_jump_nastyloop failure-point is *popped*.
mcnt, p + mcnt);
assert ((re_opcode_t)p[-4] == no_op);
- CHECK_INFINITE_LOOP (p - 4, d);
- PUSH_FAILURE_POINT (p - 3, d);
+ {
+ int cycle = 0;
+ CHECK_INFINITE_LOOP (p - 4, d);
+ if (!cycle)
+ /* If there's a cycle, just continue without pushing
+ this failure point. The failure point is the "try again"
+ option, which shouldn't be tried.
+ We want (x?)*?y\1z to match both xxyz and xxyxz. */
+ PUSH_FAILURE_POINT (p - 3, d);
+ }
break;
-
/* Simple loop detecting on_failure_jump: just check on the
failure stack if the same spot was already hit earlier. */
case on_failure_jump_loop:
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
mcnt, p + mcnt);
-
- CHECK_INFINITE_LOOP (p - 3, d);
- PUSH_FAILURE_POINT (p - 3, d);
+ {
+ int cycle = 0;
+ CHECK_INFINITE_LOOP (p - 3, d);
+ if (cycle)
+ /* If there's a cycle, get out of the loop, as if the matching
+ had failed. We used to just `goto fail' here, but that was
+ aborting the search a bit too early: we want to keep the
+ empty-loop-match and keep matching after the loop.
+ We want (x?)*y\1z to match both xxyz and xxyxz. */
+ p += mcnt;
+ else
+ PUSH_FAILURE_POINT (p - 3, d);
+ }
break;
mcnt, p + mcnt);
{
re_char *p1 = p; /* Next operation. */
- /* Please don't add casts to try and shut up GCC. */
- unsigned char *p2 = p + mcnt; /* Destination of the jump. */
- unsigned char *p3 = p - 3; /* Location of the opcode. */
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
+ unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
p -= 3; /* Reset so that we will re-execute the
instruction once it's been changed. */
/* Originally, mcnt is how many times we HAVE to succeed. */
if (mcnt != 0)
{
- /* Please don't add a cast to try and shut up GCC. */
- unsigned char *p2 = p + 2; /* Location of the counter. */
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
mcnt--;
p += 4;
PUSH_NUMBER (p2, mcnt);
/* Originally, this is how many times we CAN jump. */
if (mcnt != 0)
{
- /* Please don't add a cast to try and shut up GCC. */
- unsigned char *p2 = p + 2; /* Location of the counter. */
+ /* Here, we discard `const', making re_match non-reentrant. */
+ unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
mcnt--;
PUSH_NUMBER (p2, mcnt);
goto unconditional_jump;
DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
EXTRACT_NUMBER_AND_INCR (mcnt, p);
- /* Please don't add a cast to try and shut up GCC. */
- p2 = p + mcnt;
+ /* Here, we discard `const', making re_match non-reentrant. */
+ p2 = (unsigned char*) p + mcnt;
/* Signedness doesn't matter since we only copy MCNT's bits . */
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
PREFETCH ();
c2 = RE_STRING_CHAR (d, dend - d);
s2 = SYNTAX (c2);
-
+
/* Case 2: S2 is not Sword. */
if (s2 != Sword)
goto fail;
int
regcomp (preg, pattern, cflags)
- regex_t *preg;
- const char *pattern;
+ regex_t *__restrict preg;
+ const char *__restrict pattern;
int cflags;
{
reg_errcode_t ret;
int
regexec (preg, string, nmatch, pmatch, eflags)
- const regex_t *preg;
- const char *string;
+ const regex_t *__restrict preg;
+ const char *__restrict string;
size_t nmatch;
- regmatch_t pmatch[];
+ regmatch_t pmatch[__restrict_arr];
int eflags;
{
int ret;
WEAK_ALIAS (__regfree, regfree)
#endif /* not emacs */
+
+/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
+ (do not change this comment) */