# define US_CHAR_TYPE wchar_t/* unsigned character type */
# define COMPILED_BUFFER_VAR wc_buffer
# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
+# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
# define PUT_CHAR(c) \
do { \
- if (MC_CUR_MAX == 1) \
+ if (MB_CUR_MAX == 1) \
putchar (c); \
else \
printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \
static char re_syntax_table[CHAR_SET_SIZE];
+static void init_syntax_once PARAMS ((void));
+
static void
init_syntax_once ()
{
static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
size_t len, int *offset_buffer,
- int *is_binary);
+ char *is_binary);
static size_t
convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
CHAR_TYPE *dest;
= {0, 3, 4, 6}
*/
int *offset_buffer;
- int *is_binary;
+ char *is_binary;
{
wchar_t *pdest = dest;
const unsigned char *psrc = src;
#ifndef TRANSLATE
# ifdef MBS_SUPPORT
# define TRANSLATE(d) \
- (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d))
+ ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
+ ? (char) translate[(unsigned char) (d)] : (d))
#else
# define TRANSLATE(d) \
(translate ? (char) translate[(unsigned char) (d)] : (d))
/* Get the next unsigned number in the uncompiled pattern. */
-#define GET_UNSIGNED_NUMBER(num) \
- { if (p != pend) \
- { \
- PATFETCH (c); \
- while ('0' <= c && c <= '9') \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- if (p == pend) \
- break; \
- PATFETCH (c); \
- } \
- } \
- }
+#define GET_UNSIGNED_NUMBER(num) \
+ { /* reads decimal digits from the pattern into num; uses p, pend and c from the expansion site */ \
+ while (p != pend) \
+ { \
+ PATFETCH (c); \
+ if (c < '0' || c > '9') /* first non-digit ends the number and is left in c */ \
+ break; \
+ if (num <= RE_DUP_MAX) /* once past the limit, digits are still consumed but no longer accumulated */ \
+ { \
+ if (num < 0) /* a negative num (callers start from -1) is reset on the first digit */ \
+ num = 0; \
+ num = num * 10 + c - '0'; \
+ } \
+ } \
+ }
#if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
/* offset buffer for optimizatoin. See convert_mbs_to_wc. */
int *mbs_offset = NULL;
/* It hold whether each wchar_t is binary data or not. */
- int *is_binary = NULL;
+ char *is_binary = NULL;
/* A flag whether exactn is handling binary data or not. */
- int is_exactn_bin = FALSE;
+ char is_exactn_bin = FALSE;
#endif /* MBS_SUPPORT */
/* A random temporary spot in PATTERN. */
/* Address of beginning of regexp, or inside of last group. */
US_CHAR_TYPE *begalt;
- /* Place in the uncompiled pattern (i.e., the {) to
- which to go back if the interval is invalid. */
-#ifdef MBS_SUPPORT
- const US_CHAR_TYPE *beg_interval;
-#else
- const char *beg_interval;
-#endif /* MBS_SUPPORT */
-
/* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or' -- except the
last -- ends with a forward jump of this sort. */
#ifdef MBS_SUPPORT
/* Initialize the wchar_t PATTERN and offset_buffer. */
- p = pend = pattern = TALLOC(csize, CHAR_TYPE);
+ p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
mbs_offset = TALLOC(csize + 1, int);
- is_binary = TALLOC(csize + 1, int);
+ is_binary = TALLOC(csize + 1, char);
if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
{
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
return REG_ESPACE;
}
+ pattern[csize] = L'\0'; /* sentinel */
size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
pend = p + size;
if (size < 0)
{
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
return REG_BADPAT;
}
#endif
if (compile_stack.stack == NULL)
{
#ifdef MBS_SUPPORT
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
#endif
return REG_ESPACE;
}
charset[5] = p (= length of chars)
charset[6] = char_class (wctype_t)
+ charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
...
charset[l+5] = char_class (wctype_t)
if (c == ':' && *p == ']')
{
wctype_t wt;
+ uintptr_t alignedp;
+
/* Query the character class as wctype_t. */
wt = IS_CHAR_CLASS (str);
if (wt == 0)
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
/* Allocate the space for character class. */
- GET_BUFFER_SPACE(1);
+ GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
/* Update the pointer to indicate end of buffer. */
- b++;
+ b += CHAR_CLASS_SIZE;
/* Move data which follow character classes
not to violate the data. */
- insert_space(1, laststart+6, b-1);
+ insert_space(CHAR_CLASS_SIZE,
+ laststart + 6 + laststart[1],
+ b - 1);
+ alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
+ + __alignof__(wctype_t) - 1)
+ & ~(uintptr_t)(__alignof__(wctype_t) - 1);
/* Store the character class. */
- laststart[6] = (CHAR_TYPE) wt;
- laststart[1]++; /* Update length of char_classes */
+ *((wctype_t*)alignedp) = wt;
+ /* Update length of char_classes */
+ laststart[1] += CHAR_CLASS_SIZE;
had_char_class = true;
}
/* Adjust for the alignment. */
idx = (idx + 3) & ~4;
- str[0] = (wchar_t) &extra[idx + 4];
+ str[0] = (wchar_t) idx + 4;
}
else if (symb_table[2 * elem] == 0 && c1 == 1)
{
/* At least (most) this many matches must be made. */
int lower_bound = -1, upper_bound = -1;
- beg_interval = p - 1;
+
+ /* Place in the uncompiled pattern (i.e., just after
+ the '{') to go back to if the interval is invalid. */
+ const CHAR_TYPE *beg_interval = p;
if (p == pend)
- {
- if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_EBRACE);
- }
+ goto invalid_interval;
GET_UNSIGNED_NUMBER (lower_bound);
if (c == ',')
{
GET_UNSIGNED_NUMBER (upper_bound);
- if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
- || ((syntax & RE_NO_BK_BRACES) && c != '}'))
- FREE_STACK_RETURN (REG_BADBR);
-
if (upper_bound < 0)
upper_bound = RE_DUP_MAX;
}
/* Interval such as `{1}' => match exactly once. */
upper_bound = lower_bound;
- if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound)
- {
- if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
- }
+ if (! (0 <= lower_bound && lower_bound <= upper_bound))
+ goto invalid_interval;
if (!(syntax & RE_NO_BK_BRACES))
{
- if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
-
+ if (c != '\\' || p == pend)
+ goto invalid_interval;
PATFETCH (c);
}
if (c != '}')
- {
- if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
- }
-
- /* We just parsed a valid interval. */
+ goto invalid_interval;
/* If it's invalid to have no preceding re. */
if (!laststart)
{
- if (syntax & RE_CONTEXT_INVALID_OPS)
+ if (syntax & RE_CONTEXT_INVALID_OPS
+ && !(syntax & RE_INVALID_INTERVAL_ORD))
FREE_STACK_RETURN (REG_BADRPT);
else if (syntax & RE_CONTEXT_INDEP_OPS)
laststart = b;
goto unfetch_interval;
}
+ /* We just parsed a valid interval. */
+
+ if (RE_DUP_MAX < upper_bound)
+ FREE_STACK_RETURN (REG_BADBR);
+
/* If the upper bound is zero, don't want to succeed at
all; jump from `laststart' to `b + 3', which will be
the end of the buffer after we insert the jump. */
}
}
pending_exact = 0;
- beg_interval = NULL;
- }
- break;
-
- unfetch_interval:
- /* If an invalid interval, match the characters as literals. */
- assert (beg_interval);
- p = beg_interval;
- beg_interval = NULL;
-
- /* normal_char and normal_backslash need `c'. */
- PATFETCH (c);
-
- if (!(syntax & RE_NO_BK_BRACES))
- {
- if (p > pattern && p[-1] == '\\')
- goto normal_backslash;
- }
- goto normal_char;
+ break;
+
+ invalid_interval:
+ if (!(syntax & RE_INVALID_INTERVAL_ORD))
+ FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
+ unfetch_interval:
+ /* Match the characters as literals. */
+ p = beg_interval;
+ c = '{';
+ if (syntax & RE_NO_BK_BRACES)
+ goto normal_char;
+ else
+ goto normal_backslash;
+ }
#ifdef emacs
/* There is no way to specify the before_dot and after_dot
}
#ifdef MBS_SUPPORT
-/* This insert space into the pattern. */
+/* Insert space of size "num" into the pattern at "loc".
+ "end" must point to the end of the allocated buffer. */
static void
insert_space (num, loc, end)
int num;
{
const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
_NL_COLLATE_COLLSEQWC);
+ const unsigned char *extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
if (range_start_char < -1)
{
/* range_start is a collating symbol. */
int32_t *wextra;
/* Retreive the index and get collation sequence value. */
- wextra = (int32_t*)char_set[-range_start_char];
+ wextra = (int32_t*)(extra + char_set[-range_start_char]);
start_val = wextra[1 + *wextra];
}
else
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ register CHAR_TYPE c = (size1 == 0 || startpos >= size1
+ ? string2[startpos - size1]
+ : string1[startpos]);
if (!fastmap[(unsigned char) TRANSLATE (c)])
goto advance;
FREE_VAR (string2); \
FREE_VAR (mbs_offset1); \
FREE_VAR (mbs_offset2); \
- FREE_VAR (is_binary1); \
- FREE_VAR (is_binary2); \
} while (0)
# else /* not MBS_SUPPORT */
# define FREE_VARIABLES() \
} while (0)
# endif /* MBS_SUPPORT */
#else
+# define FREE_VAR(var) if (var) free (var); var = NULL
# ifdef MBS_SUPPORT
# define FREE_VARIABLES() \
do { \
- if (string1) free (string1); \
- if (string2) free (string2); \
- if (mbs_offset1) free (mbs_offset1); \
- if (mbs_offset2) free (mbs_offset2); \
- if (is_binary1) free (is_binary1); \
- if (is_binary2) free (is_binary2); \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
} while (0)
-# eles
+# else
# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
# endif /* MBS_SUPPORT */
#endif /* not MATCH_MAY_ALLOCATE */
#endif
#ifdef MBS_SUPPORT
+
+static int count_mbs_length PARAMS ((int *, int));
+
/* This check the substring (from 0, to length) of the multibyte string,
to which offset_buffer correspond. And count how many wchar_t_characters
the substring occupy. We use offset_buffer to optimization.
See convert_mbs_to_wcs. */
+
static int
count_mbs_length(offset_buffer, length)
int *offset_buffer;
/* offset buffer for optimizatoin. See convert_mbs_to_wc. */
int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
/* They hold whether each wchar_t is binary data or not. */
- int *is_binary1 = NULL, *is_binary2 = NULL;
+ char *is_binary = NULL;
#endif /* MBS_SUPPORT */
/* Just past the end of the corresponding string. */
fill them with converted string. */
if (csize1 != 0)
{
- string1 = TALLOC (csize1 + 1, CHAR_TYPE);
- mbs_offset1 = TALLOC (csize1 + 1, int);
- is_binary1 = TALLOC (csize1 + 1, int);
- if (!string1 || !mbs_offset1 || !is_binary1)
+ string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
+ mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
+ is_binary = REGEX_TALLOC (csize1 + 1, char);
+ if (!string1 || !mbs_offset1 || !is_binary)
{
- if (string1) free(string1);
- if (mbs_offset1) free(mbs_offset1);
- if (is_binary1) free(is_binary1);
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (is_binary);
return -2;
}
size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
- mbs_offset1, is_binary1);
+ mbs_offset1, is_binary);
string1[size1] = L'\0'; /* for a sentinel */
+ FREE_VAR (is_binary);
}
if (csize2 != 0)
{
string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
- is_binary2 = TALLOC (csize2 + 1, int);
- if (!string2 || !mbs_offset2 || !is_binary2)
+ is_binary = REGEX_TALLOC (csize2 + 1, char);
+ if (!string2 || !mbs_offset2 || !is_binary)
{
- if (string1) free(string1);
- if (mbs_offset1) free(mbs_offset1);
- if (is_binary1) free(is_binary1);
- if (string2) free(string2);
- if (mbs_offset2) free(mbs_offset2);
- if (is_binary2) free(is_binary2);
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (string2);
+ FREE_VAR (mbs_offset2);
+ FREE_VAR (is_binary);
return -2;
}
size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
- mbs_offset2, is_binary2);
+ mbs_offset2, is_binary);
string2[size2] = L'\0'; /* for a sentinel */
+ FREE_VAR (is_binary);
}
/* We need to cast pattern to (wchar_t*), because we casted this compiled
size2 = size1;
string1 = 0;
size1 = 0;
+#ifdef MBS_SUPPORT
+ mbs_offset2 = mbs_offset1;
+ csize2 = csize1;
+ mbs_offset1 = NULL;
+ csize1 = 0;
+#endif
}
end1 = string1 + size1;
end2 = string2 + size2;
}
else
{
+ if (stop > csize1 + csize2)
+ stop = csize1 + csize2;
end_match_1 = end1;
mcnt = count_mbs_length(mbs_offset2, stop-csize1);
end_match_2 = string2 + mcnt;
2*ranges_length + chars_length;
/* match with char_class? */
- for (i = 0; i < char_class_length ; i++)
- if (iswctype((wint_t)c, (wctype_t)(*workp++)))
- goto char_set_matched;
+ for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
+ {
+ wctype_t wctype;
+ uintptr_t alignedp = ((uintptr_t)workp
+ + __alignof__(wctype_t) - 1)
+ & ~(uintptr_t)(__alignof__(wctype_t) - 1);
+ wctype = *((wctype_t*)alignedp);
+ workp += CHAR_CLASS_SIZE;
+ if (iswctype((wint_t)c, wctype))
+ goto char_set_matched;
+ }
/* match with collating_symbol? */
# ifdef _LIBC
if (nrules != 0)
{
+ const unsigned char *extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
workp++)
{
int32_t *wextra;
- wextra = (int32_t*) *workp++;
+ wextra = (int32_t*)(extra + *workp++);
for (i = 0; i < *wextra; ++i)
if (TRANSLATE(d[i]) != wextra[1 + i])
break;
/* Update d, however d will be incremented at
char_set_matched:, we decrement d here. */
- d = backup_d + (wint_t)cp - (wint_t)str_buf - 1;
+ d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
if (d >= dend)
{
if (dend == end_match_2)
case wordbeg:
DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
- if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
+ if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
+ && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
break;
goto fail;
case wordend:
DEBUG_PRINT1 ("EXECUTING wordend.\n");
if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
- && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
+ && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
break;
goto fail;