X-Git-Url: https://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregcomp.c;h=0fdc3bcaef3bb13e2b3ffa89bb46a7e28828a91a;hb=84d1749c4b5ad026522be96537320556ec6f7a57;hp=6e87a48aff5316432bf42d8c3b36361bdecda307;hpb=6bd173f7ffcd7f02646ad1086035e8ba9cbddcba;p=gnulib.git diff --git a/lib/regcomp.c b/lib/regcomp.c index 6e87a48af..0fdc3bcae 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,22 +1,21 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free - Software Foundation, Inc. + Copyright (C) 2002-2013 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, + The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax); @@ -95,20 +94,20 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, re_charset_t *mbcset, Idx *char_class_alloc, - const unsigned char *class_name, + const char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ static reg_errcode_t build_equiv_class (bitset_t sbcset, const unsigned char *name); static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, - const unsigned char *class_name, + const char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, - const unsigned char *class_name, - const unsigned char *extra, + const char *class_name, + const char *extra, bool non_match, reg_errcode_t *err); static bin_tree_t *create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, @@ -207,7 +206,7 @@ static const size_t __re_error_msgid_idx[] = compiles PATTERN (of length LENGTH) and puts the result in BUFP. Returns 0 if the pattern was valid, otherwise an error string. - Assumes the `allocated' (and perhaps `buffer') and `translate' fields + Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields are set in BUFP on entry. */ #ifdef _LIBC @@ -242,7 +241,7 @@ re_compile_pattern (const char *pattern, size_t length, weak_alias (__re_compile_pattern, re_compile_pattern) #endif -/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can +/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ /* This has no initializer because initialized variables in Emacs @@ -274,7 +273,7 @@ int re_compile_fastmap (bufp) struct re_pattern_buffer *bufp; { - re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + re_dfa_t *dfa = bufp->buffer; char *fastmap = bufp->fastmap; memset (fastmap, '\0', sizeof (char) * SBC_MAX); @@ -308,7 +307,7 @@ static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap) { - re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + re_dfa_t *dfa = bufp->buffer; Idx node_cnt; bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) @@ -440,15 +439,15 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, PREG is a regex_t *. We do not expect any fields to be initialized, since POSIX says we shouldn't. Thus, we set - `buffer' to the compiled pattern; - `used' to the length of the compiled pattern; - `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + 'buffer' to the compiled pattern; + 'used' to the length of the compiled pattern; + 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the REG_EXTENDED bit in CFLAGS is set; otherwise, to RE_SYNTAX_POSIX_BASIC; - `newline_anchor' to REG_NEWLINE being set in CFLAGS; - `fastmap' to an allocated space for the fastmap; - `fastmap_accurate' to zero; - `re_nsub' to the number of subexpressions in PATTERN. + 'newline_anchor' to REG_NEWLINE being set in CFLAGS; + 'fastmap' to an allocated space for the fastmap; + 'fastmap_accurate' to zero; + 're_nsub' to the number of subexpressions in PATTERN. PATTERN is the address of the pattern string. @@ -587,19 +586,23 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +# ifdef __GNUC__ + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX +# else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) +# endif }; #endif @@ -658,7 +661,7 @@ void regfree (preg) regex_t *preg; { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; if (BE (dfa != NULL, 1)) free_dfa_content (dfa); preg->buffer = NULL; @@ -719,7 +722,7 @@ re_comp (s) + __re_error_msgid_idx[(int) REG_ESPACE]); } - /* Since `re_exec' always passes NULL for the `regs' argument, we + /* Since 're_exec' always passes NULL for the 'regs' argument, we don't need to initialize the pattern buffer fields which affect it. */ /* Match anchors at newlines. */ @@ -730,7 +733,7 @@ re_comp (s) if (!ret) return NULL; - /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ + /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } @@ -765,7 +768,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, preg->regs_allocated = REGS_UNALLOCATED; /* Initialize the dfa. */ - dfa = (re_dfa_t *) preg->buffer; + dfa = preg->buffer; if (BE (preg->allocated < sizeof (re_dfa_t), 0)) { /* If zero allocated, but buffer is non-null, try to realloc @@ -776,7 +779,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, if (dfa == NULL) return REG_ESPACE; preg->allocated = sizeof (re_dfa_t); - preg->buffer = (unsigned char *) dfa; + preg->buffer = dfa; } preg->used = sizeof (re_dfa_t); @@ -851,7 +854,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) { __re_size_t table_size; #ifndef _LIBC - char *codeset_name; + const char *codeset_name; #endif #ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); @@ -874,7 +877,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) calculation below, and for similar doubling calculations elsewhere. And it's <= rather than <, because some of the doubling calculations add 1 afterwards. */ - if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) return REG_ESPACE; dfa->nodes_alloc = pat_len + 1; @@ -897,8 +900,10 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) != 0); #else codeset_name = nl_langinfo (CODESET); - if (strcasecmp (codeset_name, "UTF-8") == 0 - || strcasecmp (codeset_name, "UTF8") == 0) + if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') + && (codeset_name[1] == 'T' || codeset_name[1] == 't') + && (codeset_name[2] == 'F' || codeset_name[2] == 'f') + && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) dfa->is_utf8 = 1; /* We check exhaustively in the loop below if this charset is a @@ -948,9 +953,43 @@ static void internal_function init_word_char (re_dfa_t *dfa) { - int i, j, ch; + int i = 0; + int j; + int ch = 0; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + if (BE (dfa->map_notascii == 0, 1)) + { + bitset_word_t bits0 = 0x00000000; + bitset_word_t bits1 = 0x03ff0000; + bitset_word_t bits2 = 0x87fffffe; + bitset_word_t bits3 = 0x07fffffe; + if (BITSET_WORD_BITS == 64) + { + dfa->word_char[0] = bits1 << 31 << 1 | bits0; + dfa->word_char[1] = bits3 << 31 << 1 | bits2; + i = 2; + } + else if (BITSET_WORD_BITS == 32) + { + dfa->word_char[0] = bits0; + dfa->word_char[1] = bits1; + dfa->word_char[2] = bits2; + dfa->word_char[3] = bits3; + i = 4; + } + else + goto general_case; + ch = 128; + + if (BE (dfa->is_utf8, 1)) + { + memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); + return; + } + } + + general_case: + for (; i < BITSET_WORDS; ++i) for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') dfa->word_char[i] |= (bitset_word_t) 1 << j; @@ -961,7 +1000,7 @@ init_word_char (re_dfa_t *dfa) static void free_workarea_compile (regex_t *preg) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_storage_t *storage, *next; for (storage = dfa->str_tree_storage; storage; storage = next) { @@ -1021,11 +1060,10 @@ create_initial_state (re_dfa_t *dfa) Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - reg_errcode_t err = re_node_set_merge (&init_nodes, - dfa->eclosures - + dest_idx); - if (err != REG_NOERROR) - return err; + reg_errcode_t merge_err + = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + if (merge_err != REG_NOERROR) + return merge_err; i = 0; } } @@ -1146,7 +1184,7 @@ optimize_utf8 (re_dfa_t *dfa) static reg_errcode_t analyze (regex_t *preg) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; reg_errcode_t ret; /* Allocate arrays. */ @@ -1327,7 +1365,7 @@ lower_subexps (void *extra, bin_tree_t *node) static bin_tree_t * lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *body = node->left; bin_tree_t *op, *cls, *tree1, *tree; @@ -1661,7 +1699,7 @@ calc_eclosure (re_dfa_t *dfa) /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; - /* Calculate epsilon closure of `node_idx'. */ + /* Calculate epsilon closure of 'node_idx'. */ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); if (BE (err != REG_NOERROR, 0)) return err; @@ -1682,10 +1720,9 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { reg_errcode_t err; Idx i; - bool incomplete; - bool ok; re_node_set eclosure; - incomplete = false; + bool ok; + bool incomplete = false; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); if (BE (err != REG_NOERROR, 0)) return err; @@ -1712,14 +1749,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { re_node_set eclosure_elem; Idx edest = dfa->edests[node].elems[i]; - /* If calculating the epsilon closure of `edest' is in progress, + /* If calculating the epsilon closure of 'edest' is in progress, return intermediate result. */ if (dfa->eclosures[edest].nelem == REG_MISSING) { incomplete = true; continue; } - /* If we haven't calculated the epsilon closure of `edest' yet, + /* If we haven't calculated the epsilon closure of 'edest' yet, calculate now. Otherwise use calculated epsilon closure. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1729,11 +1766,11 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } else eclosure_elem = dfa->eclosures[edest]; - /* Merge the epsilon closure of `edest'. */ + /* Merge the epsilon closure of 'edest'. */ err = re_node_set_merge (&eclosure, &eclosure_elem); if (BE (err != REG_NOERROR, 0)) return err; - /* If the epsilon closure of `edest' is incomplete, + /* If the epsilon closure of 'edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1742,7 +1779,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } } - /* Epsilon closures include itself. */ + /* An epsilon closure includes itself. */ ok = re_node_set_insert (&eclosure, node); if (BE (! ok, 0)) return REG_ESPACE; @@ -2095,7 +2132,7 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) /* Entry point of the parser. Parse the regular expression REGEXP and return the structure tree. - If an error is occured, ERR is set by error code, and return NULL. + If an error occurs, ERR is set by error code, and return NULL. This function build the following tree, from regular expression : CAT / \ @@ -2109,7 +2146,7 @@ static bin_tree_t * parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree, *eor, *root; re_token_t current_token; dfa->syntax = syntax; @@ -2137,13 +2174,13 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, / \ - ALT means alternative, which represents the operator `|'. */ + ALT means alternative, which represents the operator '|'. */ static bin_tree_t * parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree, *branch = NULL; tree = parse_branch (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) @@ -2185,7 +2222,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { bin_tree_t *tree, *expr; - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; tree = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2196,16 +2233,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, expr = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && expr == NULL, 0)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); return NULL; } if (tree != NULL && expr != NULL) { - tree = create_tree (dfa, tree, expr, CONCAT); - if (tree == NULL) + bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); + if (newtree == NULL) { + postorder (expr, free_tree, NULL); + postorder (tree, free_tree, NULL); *err = REG_ESPACE; return NULL; } + tree = newtree; } else if (tree == NULL) tree = expr; @@ -2224,7 +2266,7 @@ static bin_tree_t * parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree; switch (token->type) { @@ -2380,8 +2422,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, case OP_WORD: case OP_NOTWORD: tree = build_charclass_op (dfa, regexp->trans, - (const unsigned char *) "alnum", - (const unsigned char *) "_", + "alnum", + "_", token->type == OP_NOTWORD, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2389,8 +2431,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, case OP_SPACE: case OP_NOTSPACE: tree = build_charclass_op (dfa, regexp->trans, - (const unsigned char *) "space", - (const unsigned char *) "", + "space", + "", token->type == OP_NOTSPACE, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2440,7 +2482,7 @@ static bin_tree_t * parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree; size_t cur_nsub; cur_nsub = preg->re_nsub++; @@ -2454,7 +2496,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + *err = REG_EPAREN; + } if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -2532,6 +2578,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, *err = REG_BADBR; return NULL; } + + if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) + { + *err = REG_ESIZE; + return NULL; + } } else { @@ -2572,17 +2624,24 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, old_tree = NULL; if (elem->token.type == SUBEXP) - postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); + { + uintptr_t subidx = elem->token.opr.idx; + postorder (elem, mark_opt_subexp, (void *) subidx); + } tree = create_tree (dfa, elem, NULL, (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); if (BE (tree == NULL, 0)) goto parse_dup_op_espace; +/* From gnulib's "intprops.h": + True if the arithmetic type T is signed. */ +#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) + /* This loop is actually executed only when end != REG_MISSING, to rewrite {0,n} as ((...?)?)?... We have already created the start+1-th copy. */ - if ((Idx) -1 < 0 || end != REG_MISSING) + if (TYPE_SIGNED (Idx) || end != REG_MISSING) for (i = start + 2; i <= end; ++i) { elem = duplicate_tree (elem, dfa); @@ -2614,17 +2673,23 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ static reg_errcode_t internal_function # ifdef RE_ENABLE_I18N -build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + re_charset_t *mbcset, + Idx *range_alloc, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # else /* not RE_ENABLE_I18N */ -build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, - bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; @@ -2647,7 +2712,6 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, wchar_t wc; wint_t start_wc; wint_t end_wc; - wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] @@ -2661,9 +2725,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, ? __btowc (end_ch) : end_elem->opr.wch); if (start_wc == WEOF || end_wc == WEOF) return REG_ECOLLATE; - cmp_buf[0] = start_wc; - cmp_buf[4] = end_wc; - if (wcscoll (cmp_buf, cmp_buf + 4) > 0) + else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2704,9 +2766,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, /* Build the table for single byte characters. */ for (wc = 0; wc < SBC_MAX; ++wc) { - cmp_buf[2] = wc; - if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) + if (start_wc <= wc && wc <= end_wc) bitset_set (sbcset, wc); } } @@ -2740,11 +2800,12 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, static reg_errcode_t internal_function -build_collating_symbol (bitset_t sbcset, # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, Idx *coll_sym_alloc, -# endif - const unsigned char *name) +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name) +# else /* not RE_ENABLE_I18N */ +build_collating_symbol (bitset_t sbcset, const unsigned char *name) +# endif /* not RE_ENABLE_I18N */ { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -2772,8 +2833,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, const int32_t *symb_table; const unsigned char *extra; - /* Local function for parse_bracket_exp used in _LIBC environement. - Seek the collating symbol entry correspondings to NAME. + /* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. Return the index of the symbol in the SYMB_TABLE. */ auto inline int32_t @@ -2876,11 +2937,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return UINT_MAX; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ auto inline reg_errcode_t @@ -2902,6 +2963,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 0)) return REG_ERANGE; + /* FIXME: Implement rational ranges here, too. */ start_collseq = lookup_collation_sequence_value (start_elem); end_collseq = lookup_collation_sequence_value (end_elem); /* Check start/end collation sequence values. */ @@ -2960,11 +3022,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return REG_NOERROR; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the collating element which is represented by NAME. The result are written to MBCSET and SBCSET. COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument sinse we may update it. */ + pointer argument since we may update it. */ auto inline reg_errcode_t __attribute ((always_inline)) @@ -3066,6 +3128,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (BE (sbcset == NULL, 0)) #endif /* RE_ENABLE_I18N */ { + re_free (sbcset); +#ifdef RE_ENABLE_I18N + re_free (mbcset); +#endif *err = REG_ESPACE; return NULL; } @@ -3166,11 +3232,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N - *err = build_range_exp (sbcset, + *err = build_range_exp (syntax, sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, &range_alloc, &start_elem, &end_elem); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) @@ -3404,7 +3470,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, Build the equivalence class which is represented by NAME. The result are written to MBCSET and SBCSET. EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N @@ -3435,19 +3501,18 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp); - if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) + idx1 = findidx (&cp, -1); + if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; - /* Build single byte matcing table for this equivalence class. */ - char_buf[1] = (unsigned char) '\0'; + /* Build single byte matching table for this equivalence class. */ len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp); + idx2 = findidx (&cp, 1); /* idx2 = table[ch]; */ @@ -3500,20 +3565,20 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) Build the character class which is represented by NAME. The result are written to MBCSET and SBCSET. CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, re_charset_t *mbcset, Idx *char_class_alloc, - const unsigned char *class_name, reg_syntax_t syntax) + const char *class_name, reg_syntax_t syntax) #else /* not RE_ENABLE_I18N */ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, - const unsigned char *class_name, reg_syntax_t syntax) + const char *class_name, reg_syntax_t syntax) #endif /* not RE_ENABLE_I18N */ { int i; - const char *name = (const char *) class_name; + const char *name = class_name; /* In case of REG_ICASE "upper" and "lower" match the both of upper and lower cases. */ @@ -3587,8 +3652,8 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, static bin_tree_t * build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, - const unsigned char *class_name, - const unsigned char *extra, bool non_match, + const char *class_name, + const char *extra, bool non_match, reg_errcode_t *err) { re_bitset_ptr_t sbcset; @@ -3694,8 +3759,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, } /* This is intended for the expressions like "a{1,3}". - Fetch a number from `input', and return the number. + Fetch a number from 'input', and return the number. Return REG_MISSING if the number field is empty like "{,1}". + Return RE_DUP_MAX + 1 if the number field is too large. Return REG_ERROR if an error occurred. */ static Idx @@ -3714,8 +3780,9 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) num = ((token->type != CHARACTER || c < '0' || '9' < c || num == REG_ERROR) ? REG_ERROR - : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); - num = (num > RE_DUP_MAX) ? REG_ERROR : num; + : num == REG_MISSING + ? c - '0' + : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); } return num; } @@ -3789,7 +3856,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node) { - Idx idx = (Idx) (long) extra; + Idx idx = (uintptr_t) extra; if (node->token.type == SUBEXP && node->token.opr.idx == idx) node->token.opt_subexp = 1;