X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregex.c;h=c0ded3905ef335bf3dbe5ccbe7170be8b3dcb5a9;hb=468e57cb4025983239ff40bb1c05b7d8e4328185;hp=54b9249db4fa000415eaff9eb278efa61227cb03;hpb=8a0e870178bec0eec6c4d0760829da51cad381f2;p=gnulib.git diff --git a/lib/regex.c b/lib/regex.c index 54b9249db..c0ded3905 100644 --- a/lib/regex.c +++ b/lib/regex.c @@ -62,9 +62,10 @@ # define US_CHAR_TYPE wchar_t/* unsigned character type */ # define COMPILED_BUFFER_VAR wc_buffer # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ +# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1) # define PUT_CHAR(c) \ do { \ - if (MC_CUR_MAX == 1) \ + if (MB_CUR_MAX == 1) \ putchar (c); \ else \ printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \ @@ -288,6 +289,8 @@ extern char *re_syntax_table; static char re_syntax_table[CHAR_SET_SIZE]; +static void init_syntax_once PARAMS ((void)); + static void init_syntax_once () { @@ -1172,7 +1175,7 @@ printchar (c) static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src, size_t len, int *offset_buffer, - int *is_binary); + char *is_binary); static size_t convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary) CHAR_TYPE *dest; @@ -1189,7 +1192,7 @@ convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary) = {0, 3, 4, 6} */ int *offset_buffer; - int *is_binary; + char *is_binary; { wchar_t *pdest = dest; const unsigned char *psrc = src; @@ -1904,7 +1907,8 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start, #ifndef TRANSLATE # ifdef MBS_SUPPORT # define TRANSLATE(d) \ - (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d)) + ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \ + ? (char) translate[(unsigned char) (d)] : (d)) #else # define TRANSLATE(d) \ (translate ? (char) translate[(unsigned char) (d)] : (d)) @@ -2133,21 +2137,21 @@ typedef struct /* Get the next unsigned number in the uncompiled pattern. */ -#define GET_UNSIGNED_NUMBER(num) \ - { if (p != pend) \ - { \ - PATFETCH (c); \ - while ('0' <= c && c <= '9') \ - { \ - if (num < 0) \ - num = 0; \ - num = num * 10 + c - '0'; \ - if (p == pend) \ - break; \ - PATFETCH (c); \ - } \ - } \ - } +#define GET_UNSIGNED_NUMBER(num) \ + { \ + while (p != pend) \ + { \ + PATFETCH (c); \ + if (c < '0' || c > '9') \ + break; \ + if (num <= RE_DUP_MAX) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + } \ + } \ + } #if defined _LIBC || WIDE_CHAR_SUPPORT /* The GNU C library provides support for user-defined character classes @@ -2281,9 +2285,9 @@ regex_compile (pattern, size, syntax, bufp) /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ int *mbs_offset = NULL; /* It hold whether each wchar_t is binary data or not. */ - int *is_binary = NULL; + char *is_binary = NULL; /* A flag whether exactn is handling binary data or not. */ - int is_exactn_bin = FALSE; + char is_exactn_bin = FALSE; #endif /* MBS_SUPPORT */ /* A random temporary spot in PATTERN. */ @@ -2321,14 +2325,6 @@ regex_compile (pattern, size, syntax, bufp) /* Address of beginning of regexp, or inside of last group. */ US_CHAR_TYPE *begalt; - /* Place in the uncompiled pattern (i.e., the {) to - which to go back if the interval is invalid. */ -#ifdef MBS_SUPPORT - const US_CHAR_TYPE *beg_interval; -#else - const char *beg_interval; -#endif /* MBS_SUPPORT */ - /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ @@ -2341,23 +2337,24 @@ regex_compile (pattern, size, syntax, bufp) #ifdef MBS_SUPPORT /* Initialize the wchar_t PATTERN and offset_buffer. */ - p = pend = pattern = TALLOC(csize, CHAR_TYPE); + p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE); mbs_offset = TALLOC(csize + 1, int); - is_binary = TALLOC(csize + 1, int); + is_binary = TALLOC(csize + 1, char); if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) { - if (pattern) free(pattern); - if (mbs_offset) free(mbs_offset); - if (is_binary) free(is_binary); + free(pattern); + free(mbs_offset); + free(is_binary); return REG_ESPACE; } + pattern[csize] = L'\0'; /* sentinel */ size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); pend = p + size; if (size < 0) { - if (pattern) free(pattern); - if (mbs_offset) free(mbs_offset); - if (is_binary) free(is_binary); + free(pattern); + free(mbs_offset); + free(is_binary); return REG_BADPAT; } #endif @@ -2379,9 +2376,9 @@ regex_compile (pattern, size, syntax, bufp) if (compile_stack.stack == NULL) { #ifdef MBS_SUPPORT - if (pattern) free(pattern); - if (mbs_offset) free(mbs_offset); - if (is_binary) free(is_binary); + free(pattern); + free(mbs_offset); + free(is_binary); #endif return REG_ESPACE; } @@ -2642,6 +2639,7 @@ regex_compile (pattern, size, syntax, bufp) charset[5] = p (= length of chars) charset[6] = char_class (wctype_t) + charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) ... charset[l+5] = char_class (wctype_t) @@ -2804,6 +2802,8 @@ regex_compile (pattern, size, syntax, bufp) if (c == ':' && *p == ']') { wctype_t wt; + uintptr_t alignedp; + /* Query the character class as wctype_t. */ wt = IS_CHAR_CLASS (str); if (wt == 0) @@ -2816,15 +2816,21 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) FREE_STACK_RETURN (REG_EBRACK); /* Allocate the space for character class. */ - GET_BUFFER_SPACE(1); + GET_BUFFER_SPACE(CHAR_CLASS_SIZE); /* Update the pointer to indicate end of buffer. */ - b++; + b += CHAR_CLASS_SIZE; /* Move data which follow character classes not to violate the data. */ - insert_space(1, laststart+6, b-1); + insert_space(CHAR_CLASS_SIZE, + laststart + 6 + laststart[1], + b - 1); + alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) + + __alignof__(wctype_t) - 1) + & ~(uintptr_t)(__alignof__(wctype_t) - 1); /* Store the character class. */ - laststart[6] = (CHAR_TYPE) wt; - laststart[1]++; /* Update length of char_classes */ + *((wctype_t*)alignedp) = wt; + /* Update length of char_classes */ + laststart[1] += CHAR_CLASS_SIZE; had_char_class = true; } @@ -2990,7 +2996,7 @@ regex_compile (pattern, size, syntax, bufp) /* Adjust for the alignment. */ idx = (idx + 3) & ~4; - str[0] = (wchar_t) &extra[idx + 4]; + str[0] = (wchar_t) idx + 4; } else if (symb_table[2 * elem] == 0 && c1 == 1) { @@ -3813,25 +3819,19 @@ regex_compile (pattern, size, syntax, bufp) /* At least (most) this many matches must be made. */ int lower_bound = -1, upper_bound = -1; - beg_interval = p - 1; + + /* Place in the uncompiled pattern (i.e., just after + the '{') to go back to if the interval is invalid. */ + const CHAR_TYPE *beg_interval = p; if (p == pend) - { - if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_EBRACE); - } + goto invalid_interval; GET_UNSIGNED_NUMBER (lower_bound); if (c == ',') { GET_UNSIGNED_NUMBER (upper_bound); - if ((!(syntax & RE_NO_BK_BRACES) && c != '\\') - || ((syntax & RE_NO_BK_BRACES) && c != '}')) - FREE_STACK_RETURN (REG_BADBR); - if (upper_bound < 0) upper_bound = RE_DUP_MAX; } @@ -3839,36 +3839,24 @@ regex_compile (pattern, size, syntax, bufp) /* Interval such as `{1}' => match exactly once. */ upper_bound = lower_bound; - if (lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound) - { - if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_BADBR); - } + if (! (0 <= lower_bound && lower_bound <= upper_bound)) + goto invalid_interval; if (!(syntax & RE_NO_BK_BRACES)) { - if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); - + if (c != '\\' || p == pend) + goto invalid_interval; PATFETCH (c); } if (c != '}') - { - if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_BADBR); - } - - /* We just parsed a valid interval. */ + goto invalid_interval; /* If it's invalid to have no preceding re. */ if (!laststart) { - if (syntax & RE_CONTEXT_INVALID_OPS) + if (syntax & RE_CONTEXT_INVALID_OPS + && !(syntax & RE_INVALID_INTERVAL_ORD)) FREE_STACK_RETURN (REG_BADRPT); else if (syntax & RE_CONTEXT_INDEP_OPS) laststart = b; @@ -3876,6 +3864,11 @@ regex_compile (pattern, size, syntax, bufp) goto unfetch_interval; } + /* We just parsed a valid interval. */ + + if (RE_DUP_MAX < upper_bound) + FREE_STACK_RETURN (REG_BADBR); + /* If the upper bound is zero, don't want to succeed at all; jump from `laststart' to `b + 3', which will be the end of the buffer after we insert the jump. */ @@ -3961,25 +3954,20 @@ regex_compile (pattern, size, syntax, bufp) } } pending_exact = 0; - beg_interval = NULL; - } - break; - - unfetch_interval: - /* If an invalid interval, match the characters as literals. */ - assert (beg_interval); - p = beg_interval; - beg_interval = NULL; - - /* normal_char and normal_backslash need `c'. */ - PATFETCH (c); - - if (!(syntax & RE_NO_BK_BRACES)) - { - if (p > pattern && p[-1] == '\\') - goto normal_backslash; - } - goto normal_char; + break; + + invalid_interval: + if (!(syntax & RE_INVALID_INTERVAL_ORD)) + FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR); + unfetch_interval: + /* Match the characters as literals. */ + p = beg_interval; + c = '{'; + if (syntax & RE_NO_BK_BRACES) + goto normal_char; + else + goto normal_backslash; + } #ifdef emacs /* There is no way to specify the before_dot and after_dot @@ -4355,7 +4343,8 @@ group_in_compile_stack (compile_stack, regnum) } #ifdef MBS_SUPPORT -/* This insert space into the pattern. */ +/* This insert space, which size is "num", into the pattern at "loc". + "end" must point the end of the allocated buffer. */ static void insert_space (num, loc, end) int num; @@ -4396,13 +4385,15 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b, { const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC); + const unsigned char *extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); if (range_start_char < -1) { /* range_start is a collating symbol. */ int32_t *wextra; /* Retreive the index and get collation sequence value. */ - wextra = (int32_t*)char_set[-range_start_char]; + wextra = (int32_t*)(extra + char_set[-range_start_char]); start_val = wextra[1 + *wextra]; } else @@ -5044,9 +5035,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); + register CHAR_TYPE c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); if (!fastmap[(unsigned char) TRANSLATE (c)]) goto advance; @@ -5180,8 +5171,6 @@ weak_alias (__re_search_2, re_search_2) FREE_VAR (string2); \ FREE_VAR (mbs_offset1); \ FREE_VAR (mbs_offset2); \ - FREE_VAR (is_binary1); \ - FREE_VAR (is_binary2); \ } while (0) # else /* not MBS_SUPPORT */ # define FREE_VARIABLES() \ @@ -5199,17 +5188,16 @@ weak_alias (__re_search_2, re_search_2) } while (0) # endif /* MBS_SUPPORT */ #else +# define FREE_VAR(var) if (var) free (var); var = NULL # ifdef MBS_SUPPORT # define FREE_VARIABLES() \ do { \ - if (string1) free (string1); \ - if (string2) free (string2); \ - if (mbs_offset1) free (mbs_offset1); \ - if (mbs_offset2) free (mbs_offset2); \ - if (is_binary1) free (is_binary1); \ - if (is_binary2) free (is_binary2); \ + FREE_VAR (string1); \ + FREE_VAR (string2); \ + FREE_VAR (mbs_offset1); \ + FREE_VAR (mbs_offset2); \ } while (0) -# eles +# else # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ # endif /* MBS_SUPPORT */ #endif /* not MATCH_MAY_ALLOCATE */ @@ -5298,10 +5286,14 @@ weak_alias (__re_match_2, re_match_2) #endif #ifdef MBS_SUPPORT + +static int count_mbs_length PARAMS ((int *, int)); + /* This check the substring (from 0, to length) of the multibyte string, to which offset_buffer correspond. And count how many wchar_t_characters the substring occupy. We use offset_buffer to optimization. See convert_mbs_to_wcs. */ + static int count_mbs_length(offset_buffer, length) int *offset_buffer; @@ -5359,7 +5351,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ int *mbs_offset1 = NULL, *mbs_offset2 = NULL; /* They hold whether each wchar_t is binary data or not. */ - int *is_binary1 = NULL, *is_binary2 = NULL; + char *is_binary = NULL; #endif /* MBS_SUPPORT */ /* Just past the end of the corresponding string. */ @@ -5538,38 +5530,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) fill them with converted string. */ if (csize1 != 0) { - string1 = TALLOC (csize1 + 1, CHAR_TYPE); - mbs_offset1 = TALLOC (csize1 + 1, int); - is_binary1 = TALLOC (csize1 + 1, int); - if (!string1 || !mbs_offset1 || !is_binary1) + string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE); + mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); + is_binary = REGEX_TALLOC (csize1 + 1, char); + if (!string1 || !mbs_offset1 || !is_binary) { - if (string1) free(string1); - if (mbs_offset1) free(mbs_offset1); - if (is_binary1) free(is_binary1); + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (is_binary); return -2; } size1 = convert_mbs_to_wcs(string1, cstring1, csize1, - mbs_offset1, is_binary1); + mbs_offset1, is_binary); string1[size1] = L'\0'; /* for a sentinel */ + FREE_VAR (is_binary); } if (csize2 != 0) { string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE); mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); - is_binary2 = TALLOC (csize2 + 1, int); - if (!string2 || !mbs_offset2 || !is_binary2) + is_binary = REGEX_TALLOC (csize2 + 1, char); + if (!string2 || !mbs_offset2 || !is_binary) { - if (string1) free(string1); - if (mbs_offset1) free(mbs_offset1); - if (is_binary1) free(is_binary1); - if (string2) free(string2); - if (mbs_offset2) free(mbs_offset2); - if (is_binary2) free(is_binary2); + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (string2); + FREE_VAR (mbs_offset2); + FREE_VAR (is_binary); return -2; } size2 = convert_mbs_to_wcs(string2, cstring2, csize2, - mbs_offset2, is_binary2); + mbs_offset2, is_binary); string2[size2] = L'\0'; /* for a sentinel */ + FREE_VAR (is_binary); } /* We need to cast pattern to (wchar_t*), because we casted this compiled @@ -5601,6 +5594,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) size2 = size1; string1 = 0; size1 = 0; +#ifdef MBS_SUPPORT + mbs_offset2 = mbs_offset1; + csize2 = csize1; + mbs_offset1 = NULL; + csize1 = 0; +#endif } end1 = string1 + size1; end2 = string2 + size2; @@ -5615,6 +5614,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } else { + if (stop > csize1 + csize2) + stop = csize1 + csize2; end_match_1 = end1; mcnt = count_mbs_length(mbs_offset2, stop-csize1); end_match_2 = string2 + mcnt; @@ -5992,19 +5993,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) 2*ranges_length + chars_length; /* match with char_class? */ - for (i = 0; i < char_class_length ; i++) - if (iswctype((wint_t)c, (wctype_t)(*workp++))) - goto char_set_matched; + for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) + { + wctype_t wctype; + uintptr_t alignedp = ((uintptr_t)workp + + __alignof__(wctype_t) - 1) + & ~(uintptr_t)(__alignof__(wctype_t) - 1); + wctype = *((wctype_t*)alignedp); + workp += CHAR_CLASS_SIZE; + if (iswctype((wint_t)c, wctype)) + goto char_set_matched; + } /* match with collating_symbol? */ # ifdef _LIBC if (nrules != 0) { + const unsigned char *extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); + for (workp2 = workp + coll_symbol_length ; workp < workp2 ; workp++) { int32_t *wextra; - wextra = (int32_t*) *workp++; + wextra = (int32_t*)(extra + *workp++); for (i = 0; i < *wextra; ++i) if (TRANSLATE(d[i]) != wextra[1 + i]) break; @@ -6124,7 +6136,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Update d, however d will be incremented at char_set_matched:, we decrement d here. */ - d = backup_d + (wint_t)cp - (wint_t)str_buf - 1; + d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1); if (d >= dend) { if (dend == end_match_2) @@ -7081,14 +7093,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case wordbeg: DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); - if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) + if (!AT_STRINGS_END (d) && WORDCHAR_P (d) + && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) break; goto fail; case wordend: DEBUG_PRINT1 ("EXECUTING wordend.\n"); if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) - && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) + && (AT_STRINGS_END (d) || !WORDCHAR_P (d))) break; goto fail;