X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregcomp.c;h=6d5525a6f0956ba7d94b0aa04081cc84047f241a;hb=991d8a3a009e937f6ad6f5df6207969294f96b13;hp=629f97142f6bd16ccd1ef7f6594cefc5741c871d;hpb=7f206b679ba1a86cc09f34516ff9dd5dce6e1f60;p=gnulib.git diff --git a/lib/regcomp.c b/lib/regcomp.c index 629f97142..6d5525a6f 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,6 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free - Software Foundation, Inc. + Copyright (C) 2002-2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -15,8 +14,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + with this program; if not, see . */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax); @@ -207,7 +205,7 @@ static const size_t __re_error_msgid_idx[] = compiles PATTERN (of length LENGTH) and puts the result in BUFP. Returns 0 if the pattern was valid, otherwise an error string. - Assumes the `allocated' (and perhaps `buffer') and `translate' fields + Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields are set in BUFP on entry. */ #ifdef _LIBC @@ -242,7 +240,7 @@ re_compile_pattern (const char *pattern, size_t length, weak_alias (__re_compile_pattern, re_compile_pattern) #endif -/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can +/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ /* This has no initializer because initialized variables in Emacs @@ -274,7 +272,7 @@ int re_compile_fastmap (bufp) struct re_pattern_buffer *bufp; { - re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + re_dfa_t *dfa = bufp->buffer; char *fastmap = bufp->fastmap; memset (fastmap, '\0', sizeof (char) * SBC_MAX); @@ -308,7 +306,7 @@ static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap) { - re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + re_dfa_t *dfa = bufp->buffer; Idx node_cnt; bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) @@ -440,15 +438,15 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, PREG is a regex_t *. We do not expect any fields to be initialized, since POSIX says we shouldn't. Thus, we set - `buffer' to the compiled pattern; - `used' to the length of the compiled pattern; - `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + 'buffer' to the compiled pattern; + 'used' to the length of the compiled pattern; + 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the REG_EXTENDED bit in CFLAGS is set; otherwise, to RE_SYNTAX_POSIX_BASIC; - `newline_anchor' to REG_NEWLINE being set in CFLAGS; - `fastmap' to an allocated space for the fastmap; - `fastmap_accurate' to zero; - `re_nsub' to the number of subexpressions in PATTERN. + 'newline_anchor' to REG_NEWLINE being set in CFLAGS; + 'fastmap' to an allocated space for the fastmap; + 'fastmap_accurate' to zero; + 're_nsub' to the number of subexpressions in PATTERN. PATTERN is the address of the pattern string. @@ -587,19 +585,23 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +# ifdef __GNUC__ + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX +# else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) +# endif }; #endif @@ -636,7 +638,7 @@ free_dfa_content (re_dfa_t *dfa) re_dfastate_t *state = entry->array[j]; free_state (state); } - re_free (entry->array); + re_free (entry->array); } re_free (dfa->state_table); #ifdef RE_ENABLE_I18N @@ -658,7 +660,7 @@ void regfree (preg) regex_t *preg; { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; if (BE (dfa != NULL, 1)) free_dfa_content (dfa); preg->buffer = NULL; @@ -719,7 +721,7 @@ re_comp (s) + __re_error_msgid_idx[(int) REG_ESPACE]); } - /* Since `re_exec' always passes NULL for the `regs' argument, we + /* Since 're_exec' always passes NULL for the 'regs' argument, we don't need to initialize the pattern buffer fields which affect it. */ /* Match anchors at newlines. */ @@ -730,7 +732,7 @@ re_comp (s) if (!ret) return NULL; - /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ + /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } @@ -765,7 +767,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, preg->regs_allocated = REGS_UNALLOCATED; /* Initialize the dfa. */ - dfa = (re_dfa_t *) preg->buffer; + dfa = preg->buffer; if (BE (preg->allocated < sizeof (re_dfa_t), 0)) { /* If zero allocated, but buffer is non-null, try to realloc @@ -776,7 +778,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, if (dfa == NULL) return REG_ESPACE; preg->allocated = sizeof (re_dfa_t); - preg->buffer = (unsigned char *) dfa; + preg->buffer = dfa; } preg->used = sizeof (re_dfa_t); @@ -851,7 +853,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) { __re_size_t table_size; #ifndef _LIBC - char *codeset_name; + const char *codeset_name; #endif #ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); @@ -874,7 +876,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) calculation below, and for similar doubling calculations elsewhere. And it's <= rather than <, because some of the doubling calculations add 1 afterwards. */ - if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) return REG_ESPACE; dfa->nodes_alloc = pat_len + 1; @@ -897,8 +899,10 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) != 0); #else codeset_name = nl_langinfo (CODESET); - if (strcasecmp (codeset_name, "UTF-8") == 0 - || strcasecmp (codeset_name, "UTF8") == 0) + if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') + && (codeset_name[1] == 'T' || codeset_name[1] == 't') + && (codeset_name[2] == 'F' || codeset_name[2] == 'f') + && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) dfa->is_utf8 = 1; /* We check exhaustively in the loop below if this charset is a @@ -948,9 +952,43 @@ static void internal_function init_word_char (re_dfa_t *dfa) { - int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + int i = 0; + int j; + int ch = 0; + if (BE (dfa->map_notascii == 0, 1)) + { + bitset_word_t bits0 = 0x00000000; + bitset_word_t bits1 = 0x03ff0000; + bitset_word_t bits2 = 0x87fffffe; + bitset_word_t bits3 = 0x07fffffe; + if (BITSET_WORD_BITS == 64) + { + dfa->word_char[0] = bits1 << 31 << 1 | bits0; + dfa->word_char[1] = bits3 << 31 << 1 | bits2; + i = 2; + } + else if (BITSET_WORD_BITS == 32) + { + dfa->word_char[0] = bits0; + dfa->word_char[1] = bits1; + dfa->word_char[2] = bits2; + dfa->word_char[3] = bits3; + i = 4; + } + else + goto general_case; + ch = 128; + + if (BE (dfa->is_utf8, 1)) + { + memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); + return; + } + } + + general_case: + for (; i < BITSET_WORDS; ++i) for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') dfa->word_char[i] |= (bitset_word_t) 1 << j; @@ -961,7 +999,7 @@ init_word_char (re_dfa_t *dfa) static void free_workarea_compile (regex_t *preg) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_storage_t *storage, *next; for (storage = dfa->str_tree_storage; storage; storage = next) { @@ -1021,7 +1059,10 @@ create_initial_state (re_dfa_t *dfa) Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + reg_errcode_t merge_err + = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + if (merge_err != REG_NOERROR) + return merge_err; i = 0; } } @@ -1090,8 +1131,8 @@ optimize_utf8 (re_dfa_t *dfa) } break; case OP_PERIOD: - has_period = true; - break; + has_period = true; + break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: @@ -1142,7 +1183,7 @@ optimize_utf8 (re_dfa_t *dfa) static reg_errcode_t analyze (regex_t *preg) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; reg_errcode_t ret; /* Allocate arrays. */ @@ -1192,7 +1233,7 @@ analyze (regex_t *preg) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1214,16 +1255,16 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), if that's the only child). */ while (node->left || node->right) if (node->left) - node = node->left; - else - node = node->right; + node = node->left; + else + node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; - if (node->parent == NULL) + if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; @@ -1257,7 +1298,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), prev = node; node = node->parent; if (!node) - return REG_NOERROR; + return REG_NOERROR; } node = node->right; } @@ -1280,13 +1321,13 @@ optimize_subexps (void *extra, bin_tree_t *node) } else if (node->token.type == SUBEXP - && node->left && node->left->token.type == SUBEXP) + && node->left && node->left->token.type == SUBEXP) { Idx other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) - node->left->parent = node; + node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; if (other_idx < BITSET_WORD_BITS) @@ -1323,7 +1364,7 @@ lower_subexps (void *extra, bin_tree_t *node) static bin_tree_t * lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *body = node->left; bin_tree_t *op, *cls, *tree1, *tree; @@ -1371,9 +1412,9 @@ calc_first (void *extra, bin_tree_t *node) node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); if (BE (node->node_idx == REG_MISSING, 0)) - return REG_ESPACE; + return REG_ESPACE; if (node->token.type == ANCHOR) - dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; + dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; } return REG_NOERROR; } @@ -1395,7 +1436,7 @@ calc_next (void *extra, bin_tree_t *node) if (node->left) node->left->next = node->next; if (node->right) - node->right->next = node->next; + node->right->next = node->next; break; } return REG_NOERROR; @@ -1446,7 +1487,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) case OP_BACK_REF: dfa->nexts[idx] = node->next->node_idx; if (node->token.type == OP_BACK_REF) - re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); + err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); break; default: @@ -1657,7 +1698,7 @@ calc_eclosure (re_dfa_t *dfa) /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; - /* Calculate epsilon closure of `node_idx'. */ + /* Calculate epsilon closure of 'node_idx'. */ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); if (BE (err != REG_NOERROR, 0)) return err; @@ -1678,10 +1719,9 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { reg_errcode_t err; Idx i; - bool incomplete; - bool ok; re_node_set eclosure; - incomplete = false; + bool ok; + bool incomplete = false; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); if (BE (err != REG_NOERROR, 0)) return err; @@ -1708,14 +1748,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { re_node_set eclosure_elem; Idx edest = dfa->edests[node].elems[i]; - /* If calculating the epsilon closure of `edest' is in progress, + /* If calculating the epsilon closure of 'edest' is in progress, return intermediate result. */ if (dfa->eclosures[edest].nelem == REG_MISSING) { incomplete = true; continue; } - /* If we haven't calculated the epsilon closure of `edest' yet, + /* If we haven't calculated the epsilon closure of 'edest' yet, calculate now. Otherwise use calculated epsilon closure. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1725,9 +1765,11 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } else eclosure_elem = dfa->eclosures[edest]; - /* Merge the epsilon closure of `edest'. */ - re_node_set_merge (&eclosure, &eclosure_elem); - /* If the epsilon closure of `edest' is incomplete, + /* Merge the epsilon closure of 'edest'. */ + err = re_node_set_merge (&eclosure, &eclosure_elem); + if (BE (err != REG_NOERROR, 0)) + return err; + /* If the epsilon closure of 'edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1736,7 +1778,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } } - /* Epsilon closures include itself. */ + /* An epsilon closure includes itself. */ ok = re_node_set_insert (&eclosure, node); if (BE (! ok, 0)) return REG_ESPACE; @@ -2089,7 +2131,7 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) /* Entry point of the parser. Parse the regular expression REGEXP and return the structure tree. - If an error is occured, ERR is set by error code, and return NULL. + If an error occurs, ERR is set by error code, and return NULL. This function build the following tree, from regular expression : CAT / \ @@ -2103,7 +2145,7 @@ static bin_tree_t * parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree, *eor, *root; re_token_t current_token; dfa->syntax = syntax; @@ -2131,13 +2173,13 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, / \ - ALT means alternative, which represents the operator `|'. */ + ALT means alternative, which represents the operator '|'. */ static bin_tree_t * parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree, *branch = NULL; tree = parse_branch (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) @@ -2179,7 +2221,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { bin_tree_t *tree, *expr; - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; tree = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2190,16 +2232,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, expr = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && expr == NULL, 0)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); return NULL; } if (tree != NULL && expr != NULL) { - tree = create_tree (dfa, tree, expr, CONCAT); - if (tree == NULL) + bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); + if (newtree == NULL) { + postorder (expr, free_tree, NULL); + postorder (tree, free_tree, NULL); *err = REG_ESPACE; return NULL; } + tree = newtree; } else if (tree == NULL) tree = expr; @@ -2218,7 +2265,7 @@ static bin_tree_t * parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree; switch (token->type) { @@ -2323,7 +2370,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, && dfa->word_ops_used == 0) init_word_char (dfa); if (token->opr.ctx_type == WORD_DELIM - || token->opr.ctx_type == NOT_WORD_DELIM) + || token->opr.ctx_type == NOT_WORD_DELIM) { bin_tree_t *tree_first, *tree_last; if (token->opr.ctx_type == WORD_DELIM) @@ -2331,13 +2378,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, token->opr.ctx_type = WORD_FIRST; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = WORD_LAST; - } - else - { + } + else + { token->opr.ctx_type = INSIDE_WORD; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = INSIDE_NOTWORD; - } + } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) @@ -2434,7 +2481,7 @@ static bin_tree_t * parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfa_t *dfa = preg->buffer; bin_tree_t *tree; size_t cur_nsub; cur_nsub = preg->re_nsub++; @@ -2448,7 +2495,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + *err = REG_EPAREN; + } if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -2526,6 +2577,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, *err = REG_BADBR; return NULL; } + + if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) + { + *err = REG_ESIZE; + return NULL; + } } else { @@ -2566,17 +2623,24 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, old_tree = NULL; if (elem->token.type == SUBEXP) - postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); + { + uintptr_t subidx = elem->token.opr.idx; + postorder (elem, mark_opt_subexp, (void *) subidx); + } tree = create_tree (dfa, elem, NULL, (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); if (BE (tree == NULL, 0)) goto parse_dup_op_espace; +/* From gnulib's "intprops.h": + True if the arithmetic type T is signed. */ +#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) + /* This loop is actually executed only when end != REG_MISSING, to rewrite {0,n} as ((...?)?)?... We have already created the start+1-th copy. */ - if ((Idx) -1 < 0 || end != REG_MISSING) + if (TYPE_SIGNED (Idx) || end != REG_MISSING) for (i = start + 2; i <= end; ++i) { elem = duplicate_tree (elem, dfa); @@ -2608,17 +2672,23 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ static reg_errcode_t internal_function # ifdef RE_ENABLE_I18N -build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + re_charset_t *mbcset, + Idx *range_alloc, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # else /* not RE_ENABLE_I18N */ -build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, - bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; @@ -2657,7 +2727,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, return REG_ECOLLATE; cmp_buf[0] = start_wc; cmp_buf[4] = end_wc; - if (wcscoll (cmp_buf, cmp_buf + 4) > 0) + + if (BE ((syntax & RE_NO_EMPTY_RANGES) + && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2667,9 +2739,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, no MBCSET if dfa->mb_cur_max == 1. */ if (mbcset) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) + { /* There is not enough space, need realloc. */ wchar_t *new_array_start, *new_array_end; Idx new_nranges; @@ -2679,9 +2751,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, /* Use realloc since mbcset->range_starts and mbcset->range_ends are NULL if *range_alloc == 0. */ new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); + new_nranges); new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) return REG_ESPACE; @@ -2689,10 +2761,10 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; - } + } - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; } /* Build the table for single byte characters. */ @@ -2734,11 +2806,12 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, static reg_errcode_t internal_function -build_collating_symbol (bitset_t sbcset, # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, Idx *coll_sym_alloc, -# endif - const unsigned char *name) +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name) +# else /* not RE_ENABLE_I18N */ +build_collating_symbol (bitset_t sbcset, const unsigned char *name) +# endif /* not RE_ENABLE_I18N */ { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -2766,8 +2839,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, const int32_t *symb_table; const unsigned char *extra; - /* Local function for parse_bracket_exp used in _LIBC environement. - Seek the collating symbol entry correspondings to NAME. + /* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. Return the index of the symbol in the SYMB_TABLE. */ auto inline int32_t @@ -2804,7 +2877,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. */ @@ -2828,7 +2901,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { @@ -2869,11 +2943,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return UINT_MAX; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ auto inline reg_errcode_t @@ -2909,8 +2983,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, build below suffices. */ if (nrules > 0 || dfa->mb_cur_max > 1) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) { /* There is not enough space, need realloc. */ uint32_t *new_array_start; @@ -2922,18 +2996,18 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, new_array_start = re_realloc (mbcset->range_starts, uint32_t, new_nranges); new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } /* Build the table for single byte characters. */ @@ -2953,11 +3027,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return REG_NOERROR; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the collating element which is represented by NAME. The result are written to MBCSET and SBCSET. COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument sinse we may update it. */ + pointer argument since we may update it. */ auto inline reg_errcode_t __attribute ((always_inline)) @@ -3059,6 +3133,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (BE (sbcset == NULL, 0)) #endif /* RE_ENABLE_I18N */ { + re_free (sbcset); +#ifdef RE_ENABLE_I18N + re_free (mbcset); +#endif *err = REG_ESPACE; return NULL; } @@ -3159,11 +3237,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N - *err = build_range_exp (sbcset, + *err = build_range_exp (syntax, sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, &range_alloc, &start_elem, &end_elem); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) @@ -3267,17 +3345,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ if (sbc_idx < BITSET_WORDS) { - /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; - work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + work_tree = create_token_tree (dfa, NULL, NULL, &br_token); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; - /* Then join them by ALT node. */ - work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; } else { @@ -3296,7 +3374,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + goto parse_bracket_exp_espace; } return work_tree; @@ -3397,7 +3475,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, Build the equivalence class which is represented by NAME. The result are written to MBCSET and SBCSET. EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N @@ -3428,30 +3506,33 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp); - if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) + idx1 = findidx (&cp, -1); + if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; - /* Build single byte matcing table for this equivalence class. */ - char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + /* Build single byte matching table for this equivalence class. */ + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp); + idx2 = findidx (&cp, 1); /* idx2 = table[ch]; */ if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. */ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) @@ -3489,7 +3570,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) Build the character class which is represented by NAME. The result are written to MBCSET and SBCSET. CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N @@ -3683,8 +3764,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, } /* This is intended for the expressions like "a{1,3}". - Fetch a number from `input', and return the number. + Fetch a number from 'input', and return the number. Return REG_MISSING if the number field is empty like "{,1}". + Return RE_DUP_MAX + 1 if the number field is too large. Return REG_ERROR if an error occurred. */ static Idx @@ -3703,8 +3785,9 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) num = ((token->type != CHARACTER || c < '0' || '9' < c || num == REG_ERROR) ? REG_ERROR - : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); - num = (num > RE_DUP_MAX) ? REG_ERROR : num; + : num == REG_MISSING + ? c - '0' + : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); } return num; } @@ -3778,7 +3861,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node) { - Idx idx = (Idx) (long) extra; + Idx idx = (uintptr_t) extra; if (node->token.type == SUBEXP && node->token.opr.idx == idx) node->token.opt_subexp = 1; @@ -3847,7 +3930,7 @@ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) node = node->parent; dup_node = dup_node->parent; if (!node) - return dup_root; + return dup_root; } node = node->right; p_new = &dup_node->right;