X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregcomp.c;h=e6d9c999d91110b378867e7bbaad9fd35859f2a2;hb=9b71dd21dd3425978a8b121335bfca267a45a617;hp=6546be47e846365d94cb9641e1d78923588012f1;hpb=f7328f9d0aed9f61e4f38b731ac9fa88a3c55622;p=gnulib.git diff --git a/lib/regcomp.c b/lib/regcomp.c index 6546be47e..e6d9c999d 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc. + Copyright (C) 2002-2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,8 +14,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + with this program; if not, see . */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax); @@ -206,7 +205,7 @@ static const size_t __re_error_msgid_idx[] = compiles PATTERN (of length LENGTH) and puts the result in BUFP. Returns 0 if the pattern was valid, otherwise an error string. - Assumes the `allocated' (and perhaps `buffer') and `translate' fields + Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields are set in BUFP on entry. */ #ifdef _LIBC @@ -241,7 +240,7 @@ re_compile_pattern (const char *pattern, size_t length, weak_alias (__re_compile_pattern, re_compile_pattern) #endif -/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can +/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ /* This has no initializer because initialized variables in Emacs @@ -333,8 +332,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; memset (&state, '\0', sizeof (state)); - if (mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf + if (__mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) != (size_t) -1)) re_set_fastmap (fastmap, false, buf[0]); @@ -356,45 +355,65 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { - Idx i; re_charset_t *cset = dfa->nodes[node].opr.mbcset; - if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes - || cset->nranges || cset->nchar_classes) - { + Idx i; + # ifdef _LIBC - if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + /* See if we have to try all bytes which start multiple collation + elements. + e.g. In da_DK, we want to catch 'a' since "aa" is a valid + collation element, and don't catch 'b' since 'b' is + the only collation element which starts from 'b' (and + it is caught by SIMPLE_BRACKET). */ + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 + && (cset->ncoll_syms || cset->nranges)) { - /* In this case we want to catch the bytes which are - the first byte of any collation elements. - e.g. In da_DK, we want to catch 'a' since "aa" - is a valid collation element, and don't catch - 'b' since 'b' is the only collation element - which starts from 'b'. */ const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); for (i = 0; i < SBC_MAX; ++i) if (table[i] < 0) re_set_fastmap (fastmap, icase, i); } -# else - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - re_set_fastmap (fastmap, icase, i); -# endif /* not _LIBC */ +# endif /* _LIBC */ + + /* See if we have to start the match at all multibyte characters, + i.e. where we would not find an invalid sequence. This only + applies to multibyte character sets; for single byte character + sets, the SIMPLE_BRACKET again suffices. */ + if (dfa->mb_cur_max > 1 + && (cset->nchar_classes || cset->non_match || cset->nranges +# ifdef _LIBC + || cset->nequiv_classes +# endif /* _LIBC */ + )) + { + unsigned char c = 0; + do + { + mbstate_t mbs; + memset (&mbs, 0, sizeof (mbs)); + if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) + re_set_fastmap (fastmap, false, (int) c); + } + while (++c != 0); } - for (i = 0; i < cset->nmbchars; ++i) + + else { - char buf[256]; - mbstate_t state; - memset (&state, '\0', sizeof (state)); - if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + /* ... Else catch all bytes which can start the mbchars. */ + for (i = 0; i < cset->nmbchars; ++i) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, false, *(unsigned char *) buf); + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, false, *(unsigned char *) buf); + } } } } @@ -419,15 +438,15 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, PREG is a regex_t *. We do not expect any fields to be initialized, since POSIX says we shouldn't. Thus, we set - `buffer' to the compiled pattern; - `used' to the length of the compiled pattern; - `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + 'buffer' to the compiled pattern; + 'used' to the length of the compiled pattern; + 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the REG_EXTENDED bit in CFLAGS is set; otherwise, to RE_SYNTAX_POSIX_BASIC; - `newline_anchor' to REG_NEWLINE being set in CFLAGS; - `fastmap' to an allocated space for the fastmap; - `fastmap_accurate' to zero; - `re_nsub' to the number of subexpressions in PATTERN. + 'newline_anchor' to REG_NEWLINE being set in CFLAGS; + 'fastmap' to an allocated space for the fastmap; + 'fastmap_accurate' to zero; + 're_nsub' to the number of subexpressions in PATTERN. PATTERN is the address of the pattern string. @@ -566,19 +585,23 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +# ifdef __GNUC__ + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX +# else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) +# endif }; #endif @@ -615,7 +638,7 @@ free_dfa_content (re_dfa_t *dfa) re_dfastate_t *state = entry->array[j]; free_state (state); } - re_free (entry->array); + re_free (entry->array); } re_free (dfa->state_table); #ifdef RE_ENABLE_I18N @@ -698,7 +721,7 @@ re_comp (s) + __re_error_msgid_idx[(int) REG_ESPACE]); } - /* Since `re_exec' always passes NULL for the `regs' argument, we + /* Since 're_exec' always passes NULL for the 'regs' argument, we don't need to initialize the pattern buffer fields which affect it. */ /* Match anchors at newlines. */ @@ -709,7 +732,7 @@ re_comp (s) if (!ret) return NULL; - /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ + /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } @@ -776,7 +799,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, __libc_lock_init (dfa->lock); err = re_string_construct (®exp, pattern, length, preg->translate, - syntax & RE_ICASE, dfa); + (syntax & RE_ICASE) != 0, dfa); if (BE (err != REG_NOERROR, 0)) { re_compile_internal_free_return: @@ -853,7 +876,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) calculation below, and for similar doubling calculations elsewhere. And it's <= rather than <, because some of the doubling calculations add 1 afterwards. */ - if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) return REG_ESPACE; dfa->nodes_alloc = pat_len + 1; @@ -875,20 +898,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); #else -# ifdef HAVE_LANGINFO_CODESET codeset_name = nl_langinfo (CODESET); -# else - codeset_name = getenv ("LC_ALL"); - if (codeset_name == NULL || codeset_name[0] == '\0') - codeset_name = getenv ("LC_CTYPE"); - if (codeset_name == NULL || codeset_name[0] == '\0') - codeset_name = getenv ("LANG"); - if (codeset_name == NULL) - codeset_name = ""; - else if (strchr (codeset_name, '.') != NULL) - codeset_name = strchr (codeset_name, '.') + 1; -# endif - if (strcasecmp (codeset_name, "UTF-8") == 0 || strcasecmp (codeset_name, "UTF8") == 0) dfa->is_utf8 = 1; @@ -940,9 +950,39 @@ static void internal_function init_word_char (re_dfa_t *dfa) { - int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + int i = 0; + int j; + int ch = 0; + if (BE (dfa->map_notascii == 0, 1)) + { + if (BITSET_WORD_BITS == 64) + { + dfa->word_char[0] = UINT64_C (0x03ff000000000000); + dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe); + i = 2; + } + else if (BITSET_WORD_BITS == 32) + { + dfa->word_char[0] = UINT32_C (0x00000000); + dfa->word_char[1] = UINT32_C (0x03ff0000); + dfa->word_char[2] = UINT32_C (0x87fffffe); + dfa->word_char[3] = UINT32_C (0x07fffffe); + i = 4; + } + else + goto general_case; + ch = 128; + + if (BE (dfa->is_utf8, 1)) + { + memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); + return; + } + } + + general_case: + for (; i < BITSET_WORDS; ++i) for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') dfa->word_char[i] |= (bitset_word_t) 1 << j; @@ -1013,7 +1053,10 @@ create_initial_state (re_dfa_t *dfa) Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + reg_errcode_t merge_err + = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + if (merge_err != REG_NOERROR) + return merge_err; i = 0; } } @@ -1067,7 +1110,7 @@ optimize_utf8 (re_dfa_t *dfa) mb_chars = true; break; case ANCHOR: - switch (dfa->nodes[node].opr.idx) + switch (dfa->nodes[node].opr.ctx_type) { case LINE_FIRST: case LINE_LAST: @@ -1075,13 +1118,15 @@ optimize_utf8 (re_dfa_t *dfa) case BUF_LAST: break; default: - /* Word anchors etc. cannot be handled. */ + /* Word anchors etc. cannot be handled. It's okay to test + opr.ctx_type since constraints (for all DFA nodes) are + created by ORing one or more opr.ctx_type values. */ return; } break; case OP_PERIOD: - has_period = true; - break; + has_period = true; + break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: @@ -1182,7 +1227,7 @@ analyze (regex_t *preg) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1204,16 +1249,16 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), if that's the only child). */ while (node->left || node->right) if (node->left) - node = node->left; - else - node = node->right; + node = node->left; + else + node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; - if (node->parent == NULL) + if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; @@ -1247,7 +1292,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), prev = node; node = node->parent; if (!node) - return REG_NOERROR; + return REG_NOERROR; } node = node->right; } @@ -1270,13 +1315,13 @@ optimize_subexps (void *extra, bin_tree_t *node) } else if (node->token.type == SUBEXP - && node->left && node->left->token.type == SUBEXP) + && node->left && node->left->token.type == SUBEXP) { Idx other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) - node->left->parent = node; + node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; if (other_idx < BITSET_WORD_BITS) @@ -1361,7 +1406,9 @@ calc_first (void *extra, bin_tree_t *node) node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); if (BE (node->node_idx == REG_MISSING, 0)) - return REG_ESPACE; + return REG_ESPACE; + if (node->token.type == ANCHOR) + dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; } return REG_NOERROR; } @@ -1383,7 +1430,7 @@ calc_next (void *extra, bin_tree_t *node) if (node->left) node->left->next = node->next; if (node->right) - node->right->next = node->next; + node->right->next = node->next; break; } return REG_NOERROR; @@ -1434,7 +1481,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) case OP_BACK_REF: dfa->nexts[idx] = node->next->node_idx; if (node->token.type == OP_BACK_REF) - re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); + err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); break; default: @@ -1491,21 +1538,17 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, destination. */ org_dest = dfa->edests[org_node].elems[0]; re_node_set_empty (dfa->edests + clone_node); - if (dfa->nodes[org_node].type == ANCHOR) + /* If the node is root_node itself, it means the epsilon closure + has a loop. Then tie it to the destination of the root_node. */ + if (org_node == root_node && clone_node != org_node) { - /* In case of the node has another constraint, append it. */ - if (org_node == root_node && clone_node != org_node) - { - /* ...but if the node is root_node itself, it means the - epsilon closure have a loop, then tie it to the - destination of the root_node. */ - ok = re_node_set_insert (dfa->edests + clone_node, org_dest); - if (BE (! ok, 0)) - return REG_ESPACE; - break; - } - constraint |= dfa->nodes[org_node].opr.ctx_type; + ok = re_node_set_insert (dfa->edests + clone_node, org_dest); + if (BE (! ok, 0)) + return REG_ESPACE; + break; } + /* In case the node has another constraint, append it. */ + constraint |= dfa->nodes[org_node].constraint; clone_dest = duplicate_node (dfa, org_dest, constraint); if (BE (clone_dest == REG_MISSING, 0)) return REG_ESPACE; @@ -1523,7 +1566,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, clone_dest = search_duplicated_node (dfa, org_dest, constraint); if (clone_dest == REG_MISSING) { - /* There are no such a duplicated node, create a new one. */ + /* There is no such duplicated node, create a new one. */ reg_errcode_t err; clone_dest = duplicate_node (dfa, org_dest, constraint); if (BE (clone_dest == REG_MISSING, 0)) @@ -1538,7 +1581,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, } else { - /* There are a duplicated node which satisfy the constraint, + /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (! ok, 0)) @@ -1587,8 +1630,7 @@ duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) if (BE (dup_idx != REG_MISSING, 1)) { dfa->nodes[dup_idx].constraint = constraint; - if (dfa->nodes[org_idx].type == ANCHOR) - dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; + dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; dfa->nodes[dup_idx].duplicated = 1; /* Store the index of the original node. */ @@ -1650,7 +1692,7 @@ calc_eclosure (re_dfa_t *dfa) /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; - /* Calculate epsilon closure of `node_idx'. */ + /* Calculate epsilon closure of 'node_idx'. */ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); if (BE (err != REG_NOERROR, 0)) return err; @@ -1670,12 +1712,10 @@ static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { reg_errcode_t err; - unsigned int constraint; Idx i; - bool incomplete; - bool ok; re_node_set eclosure; - incomplete = false; + bool ok; + bool incomplete = false; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); if (BE (err != REG_NOERROR, 0)) return err; @@ -1684,15 +1724,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) We reference this value to avoid infinite loop. */ dfa->eclosures[node].nelem = REG_MISSING; - constraint = ((dfa->nodes[node].type == ANCHOR) - ? dfa->nodes[node].opr.ctx_type : 0); - /* If the current node has constraints, duplicate all nodes. - Since they must inherit the constraints. */ - if (constraint + /* If the current node has constraints, duplicate all nodes + since they must inherit the constraints. */ + if (dfa->nodes[node].constraint && dfa->edests[node].nelem && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) { - err = duplicate_node_closure (dfa, node, node, node, constraint); + err = duplicate_node_closure (dfa, node, node, node, + dfa->nodes[node].constraint); if (BE (err != REG_NOERROR, 0)) return err; } @@ -1703,14 +1742,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { re_node_set eclosure_elem; Idx edest = dfa->edests[node].elems[i]; - /* If calculating the epsilon closure of `edest' is in progress, + /* If calculating the epsilon closure of 'edest' is in progress, return intermediate result. */ if (dfa->eclosures[edest].nelem == REG_MISSING) { incomplete = true; continue; } - /* If we haven't calculated the epsilon closure of `edest' yet, + /* If we haven't calculated the epsilon closure of 'edest' yet, calculate now. Otherwise use calculated epsilon closure. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1720,9 +1759,11 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } else eclosure_elem = dfa->eclosures[edest]; - /* Merge the epsilon closure of `edest'. */ - re_node_set_merge (&eclosure, &eclosure_elem); - /* If the epsilon closure of `edest' is incomplete, + /* Merge the epsilon closure of 'edest'. */ + err = re_node_set_merge (&eclosure, &eclosure_elem); + if (BE (err != REG_NOERROR, 0)) + return err; + /* If the epsilon closure of 'edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) { @@ -1731,7 +1772,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } } - /* Epsilon closures include itself. */ + /* An epsilon closure includes itself. */ ok = re_node_set_insert (&eclosure, node); if (BE (! ok, 0)) return REG_ESPACE; @@ -2084,7 +2125,7 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) /* Entry point of the parser. Parse the regular expression REGEXP and return the structure tree. - If an error is occured, ERR is set by error code, and return NULL. + If an error occurs, ERR is set by error code, and return NULL. This function build the following tree, from regular expression : CAT / \ @@ -2126,7 +2167,7 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, / \ - ALT means alternative, which represents the operator `|'. */ + ALT means alternative, which represents the operator '|'. */ static bin_tree_t * parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, @@ -2185,16 +2226,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, expr = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && expr == NULL, 0)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); return NULL; } if (tree != NULL && expr != NULL) { - tree = create_tree (dfa, tree, expr, CONCAT); - if (tree == NULL) + bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); + if (newtree == NULL) { + postorder (expr, free_tree, NULL); + postorder (tree, free_tree, NULL); *err = REG_ESPACE; return NULL; } + tree = newtree; } else if (tree == NULL) tree = expr; @@ -2318,7 +2364,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, && dfa->word_ops_used == 0) init_word_char (dfa); if (token->opr.ctx_type == WORD_DELIM - || token->opr.ctx_type == NOT_WORD_DELIM) + || token->opr.ctx_type == NOT_WORD_DELIM) { bin_tree_t *tree_first, *tree_last; if (token->opr.ctx_type == WORD_DELIM) @@ -2326,13 +2372,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, token->opr.ctx_type = WORD_FIRST; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = WORD_LAST; - } - else - { + } + else + { token->opr.ctx_type = INSIDE_WORD; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = INSIDE_NOTWORD; - } + } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) @@ -2443,7 +2489,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + *err = REG_EPAREN; + } if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -2514,7 +2564,8 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return elem; } - if (BE (end != REG_MISSING && start > end, 0)) + if (BE ((end != REG_MISSING && start > end) + || token->type != OP_CLOSE_DUP_NUM, 0)) { /* First number greater than second. */ *err = REG_BADBR; @@ -2567,10 +2618,14 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, if (BE (tree == NULL, 0)) goto parse_dup_op_espace; +/* From gnulib's "intprops.h": + True if the arithmetic type T is signed. */ +#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) + /* This loop is actually executed only when end != REG_MISSING, to rewrite {0,n} as ((...?)?)?... We have already created the start+1-th copy. */ - if ((Idx) -1 < 0 || end != REG_MISSING) + if (TYPE_SIGNED (Idx) || end != REG_MISSING) for (i = start + 2; i <= end; ++i) { elem = duplicate_tree (elem, dfa); @@ -2602,17 +2657,23 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ static reg_errcode_t internal_function # ifdef RE_ENABLE_I18N -build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + re_charset_t *mbcset, + Idx *range_alloc, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # else /* not RE_ENABLE_I18N */ -build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, - bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; @@ -2651,7 +2712,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, return REG_ECOLLATE; cmp_buf[0] = start_wc; cmp_buf[4] = end_wc; - if (wcscoll (cmp_buf, cmp_buf + 4) > 0) + + if (BE ((syntax & RE_NO_EMPTY_RANGES) + && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2661,9 +2724,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, no MBCSET if dfa->mb_cur_max == 1. */ if (mbcset) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) + { /* There is not enough space, need realloc. */ wchar_t *new_array_start, *new_array_end; Idx new_nranges; @@ -2673,9 +2736,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, /* Use realloc since mbcset->range_starts and mbcset->range_ends are NULL if *range_alloc == 0. */ new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); + new_nranges); new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) return REG_ESPACE; @@ -2683,10 +2746,10 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; - } + } - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; } /* Build the table for single byte characters. */ @@ -2728,11 +2791,12 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, static reg_errcode_t internal_function -build_collating_symbol (bitset_t sbcset, # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, Idx *coll_sym_alloc, -# endif - const unsigned char *name) +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name) +# else /* not RE_ENABLE_I18N */ +build_collating_symbol (bitset_t sbcset, const unsigned char *name) +# endif /* not RE_ENABLE_I18N */ { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -2760,8 +2824,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, const int32_t *symb_table; const unsigned char *extra; - /* Local function for parse_bracket_exp used in _LIBC environement. - Seek the collating symbol entry correspondings to NAME. + /* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. Return the index of the symbol in the SYMB_TABLE. */ auto inline int32_t @@ -2798,7 +2862,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. */ @@ -2822,7 +2886,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { @@ -2863,11 +2928,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return UINT_MAX; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the range expression which starts from START_ELEM, and ends at END_ELEM. The result are written to MBCSET and SBCSET. RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument sinse we may + mbcset->range_ends, is a pointer argument since we may update it. */ auto inline reg_errcode_t @@ -2903,8 +2968,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, build below suffices. */ if (nrules > 0 || dfa->mb_cur_max > 1) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) { /* There is not enough space, need realloc. */ uint32_t *new_array_start; @@ -2916,18 +2981,18 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, new_array_start = re_realloc (mbcset->range_starts, uint32_t, new_nranges); new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } /* Build the table for single byte characters. */ @@ -2947,11 +3012,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return REG_NOERROR; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Build the collating element which is represented by NAME. The result are written to MBCSET and SBCSET. COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument sinse we may update it. */ + pointer argument since we may update it. */ auto inline reg_errcode_t __attribute ((always_inline)) @@ -3053,6 +3118,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (BE (sbcset == NULL, 0)) #endif /* RE_ENABLE_I18N */ { + re_free (sbcset); +#ifdef RE_ENABLE_I18N + re_free (mbcset); +#endif *err = REG_ESPACE; return NULL; } @@ -3070,7 +3139,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #endif /* not RE_ENABLE_I18N */ non_match = true; if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set (sbcset, '\0'); + bitset_set (sbcset, '\n'); re_string_skip_bytes (regexp, token_len); /* Skip a token. */ token_len = peek_token_bracket (token, regexp, syntax); if (BE (token->type == END_OF_RE, 0)) @@ -3153,11 +3222,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N - *err = build_range_exp (sbcset, + *err = build_range_exp (syntax, sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, &range_alloc, &start_elem, &end_elem); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) @@ -3261,17 +3330,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ if (sbc_idx < BITSET_WORDS) { - /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; - work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + work_tree = create_token_tree (dfa, NULL, NULL, &br_token); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; - /* Then join them by ALT node. */ - work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; } else { @@ -3290,7 +3359,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + goto parse_bracket_exp_espace; } return work_tree; @@ -3391,7 +3460,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, Build the equivalence class which is represented by NAME. The result are written to MBCSET and SBCSET. EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N @@ -3422,30 +3491,33 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp); - if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) + idx1 = findidx (&cp, -1); + if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; - /* Build single byte matcing table for this equivalence class. */ - char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + /* Build single byte matching table for this equivalence class. */ + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp); + idx2 = findidx (&cp, 1); /* idx2 = table[ch]; */ if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. */ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) @@ -3483,7 +3555,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) Build the character class which is represented by NAME. The result are written to MBCSET and SBCSET. CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, - is a pointer argument sinse we may update it. */ + is a pointer argument since we may update it. */ static reg_errcode_t #ifdef RE_ENABLE_I18N @@ -3601,10 +3673,6 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, if (non_match) { #ifdef RE_ENABLE_I18N - /* - if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set(cset->sbcset, '\0'); - */ mbcset->non_match = 1; #endif /* not RE_ENABLE_I18N */ } @@ -3681,7 +3749,7 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, } /* This is intended for the expressions like "a{1,3}". - Fetch a number from `input', and return the number. + Fetch a number from 'input', and return the number. Return REG_MISSING if the number field is empty like "{,1}". Return REG_ERROR if an error occurred. */ @@ -3845,7 +3913,7 @@ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) node = node->parent; dup_node = dup_node->parent; if (!node) - return dup_root; + return dup_root; } node = node->right; p_new = &dup_node->right;