X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;ds=sidebyside;f=lib%2Fregcomp.c;h=6e87a48aff5316432bf42d8c3b36361bdecda307;hb=6bd173f7ffcd7f02646ad1086035e8ba9cbddcba;hp=a3a745db863b2fe77048b36c1a68d0050b0a9938;hpb=39fc05fb065833938aad7ab0285f017e616a60f2;p=gnulib.git diff --git a/lib/regcomp.c b/lib/regcomp.c index a3a745db8..6e87a48af 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,5 +1,6 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Free Software Foundation, Inc. + Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free + Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -333,8 +334,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; memset (&state, '\0', sizeof (state)); - if (mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf + if (__mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) != (size_t) -1)) re_set_fastmap (fastmap, false, buf[0]); @@ -356,45 +357,65 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { - Idx i; re_charset_t *cset = dfa->nodes[node].opr.mbcset; - if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes - || cset->nranges || cset->nchar_classes) - { + Idx i; + # ifdef _LIBC - if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + /* See if we have to try all bytes which start multiple collation + elements. + e.g. In da_DK, we want to catch 'a' since "aa" is a valid + collation element, and don't catch 'b' since 'b' is + the only collation element which starts from 'b' (and + it is caught by SIMPLE_BRACKET). */ + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 + && (cset->ncoll_syms || cset->nranges)) { - /* In this case we want to catch the bytes which are - the first byte of any collation elements. - e.g. In da_DK, we want to catch 'a' since "aa" - is a valid collation element, and don't catch - 'b' since 'b' is the only collation element - which starts from 'b'. */ const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); for (i = 0; i < SBC_MAX; ++i) if (table[i] < 0) re_set_fastmap (fastmap, icase, i); } -# else - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - re_set_fastmap (fastmap, icase, i); -# endif /* not _LIBC */ +# endif /* _LIBC */ + + /* See if we have to start the match at all multibyte characters, + i.e. where we would not find an invalid sequence. This only + applies to multibyte character sets; for single byte character + sets, the SIMPLE_BRACKET again suffices. */ + if (dfa->mb_cur_max > 1 + && (cset->nchar_classes || cset->non_match || cset->nranges +# ifdef _LIBC + || cset->nequiv_classes +# endif /* _LIBC */ + )) + { + unsigned char c = 0; + do + { + mbstate_t mbs; + memset (&mbs, 0, sizeof (mbs)); + if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) + re_set_fastmap (fastmap, false, (int) c); + } + while (++c != 0); } - for (i = 0; i < cset->nmbchars; ++i) + + else { - char buf[256]; - mbstate_t state; - memset (&state, '\0', sizeof (state)); - if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + /* ... Else catch all bytes which can start the mbchars. */ + for (i = 0; i < cset->nmbchars; ++i) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, false, *(unsigned char *) buf); + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, false, *(unsigned char *) buf); + } } } } @@ -615,7 +636,7 @@ free_dfa_content (re_dfa_t *dfa) re_dfastate_t *state = entry->array[j]; free_state (state); } - re_free (entry->array); + re_free (entry->array); } re_free (dfa->state_table); #ifdef RE_ENABLE_I18N @@ -829,6 +850,9 @@ static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len) { __re_size_t table_size; +#ifndef _LIBC + char *codeset_name; +#endif #ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); #else @@ -872,7 +896,9 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); #else - if (strcmp (locale_charset (), "UTF-8") == 0) + codeset_name = nl_langinfo (CODESET); + if (strcasecmp (codeset_name, "UTF-8") == 0 + || strcasecmp (codeset_name, "UTF8") == 0) dfa->is_utf8 = 1; /* We check exhaustively in the loop below if this charset is a @@ -995,7 +1021,11 @@ create_initial_state (re_dfa_t *dfa) Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + reg_errcode_t err = re_node_set_merge (&init_nodes, + dfa->eclosures + + dest_idx); + if (err != REG_NOERROR) + return err; i = 0; } } @@ -1064,8 +1094,8 @@ optimize_utf8 (re_dfa_t *dfa) } break; case OP_PERIOD: - has_period = true; - break; + has_period = true; + break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: @@ -1166,7 +1196,7 @@ analyze (regex_t *preg) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1188,16 +1218,16 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), if that's the only child). */ while (node->left || node->right) if (node->left) - node = node->left; - else - node = node->right; + node = node->left; + else + node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; - if (node->parent == NULL) + if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; @@ -1231,7 +1261,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), prev = node; node = node->parent; if (!node) - return REG_NOERROR; + return REG_NOERROR; } node = node->right; } @@ -1254,13 +1284,13 @@ optimize_subexps (void *extra, bin_tree_t *node) } else if (node->token.type == SUBEXP - && node->left && node->left->token.type == SUBEXP) + && node->left && node->left->token.type == SUBEXP) { Idx other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) - node->left->parent = node; + node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; if (other_idx < BITSET_WORD_BITS) @@ -1345,9 +1375,9 @@ calc_first (void *extra, bin_tree_t *node) node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); if (BE (node->node_idx == REG_MISSING, 0)) - return REG_ESPACE; + return REG_ESPACE; if (node->token.type == ANCHOR) - dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; + dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; } return REG_NOERROR; } @@ -1369,7 +1399,7 @@ calc_next (void *extra, bin_tree_t *node) if (node->left) node->left->next = node->next; if (node->right) - node->right->next = node->next; + node->right->next = node->next; break; } return REG_NOERROR; @@ -1420,7 +1450,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) case OP_BACK_REF: dfa->nexts[idx] = node->next->node_idx; if (node->token.type == OP_BACK_REF) - re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); + err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); break; default: @@ -1477,7 +1507,6 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, destination. */ org_dest = dfa->edests[org_node].elems[0]; re_node_set_empty (dfa->edests + clone_node); - clone_dest = search_duplicated_node (dfa, org_dest, constraint); /* If the node is root_node itself, it means the epsilon closure has a loop. Then tie it to the destination of the root_node. */ if (org_node == root_node && clone_node != org_node) @@ -1521,7 +1550,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, } else { - /* There is a duplicated node which satisfy the constraint, + /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (! ok, 0)) @@ -1701,7 +1730,9 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) else eclosure_elem = dfa->eclosures[edest]; /* Merge the epsilon closure of `edest'. */ - re_node_set_merge (&eclosure, &eclosure_elem); + err = re_node_set_merge (&eclosure, &eclosure_elem); + if (BE (err != REG_NOERROR, 0)) + return err; /* If the epsilon closure of `edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) @@ -2298,7 +2329,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, && dfa->word_ops_used == 0) init_word_char (dfa); if (token->opr.ctx_type == WORD_DELIM - || token->opr.ctx_type == NOT_WORD_DELIM) + || token->opr.ctx_type == NOT_WORD_DELIM) { bin_tree_t *tree_first, *tree_last; if (token->opr.ctx_type == WORD_DELIM) @@ -2306,13 +2337,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, token->opr.ctx_type = WORD_FIRST; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = WORD_LAST; - } - else - { + } + else + { token->opr.ctx_type = INSIDE_WORD; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = INSIDE_NOTWORD; - } + } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) @@ -2423,7 +2454,7 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + *err = REG_EPAREN; if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -2494,7 +2525,8 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return elem; } - if (BE (end != REG_MISSING && start > end, 0)) + if (BE ((end != REG_MISSING && start > end) + || token->type != OP_CLOSE_DUP_NUM, 0)) { /* First number greater than second. */ *err = REG_BADBR; @@ -2641,9 +2673,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, no MBCSET if dfa->mb_cur_max == 1. */ if (mbcset) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) + { /* There is not enough space, need realloc. */ wchar_t *new_array_start, *new_array_end; Idx new_nranges; @@ -2653,9 +2685,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, /* Use realloc since mbcset->range_starts and mbcset->range_ends are NULL if *range_alloc == 0. */ new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); + new_nranges); new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) return REG_ESPACE; @@ -2663,10 +2695,10 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; - } + } - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; } /* Build the table for single byte characters. */ @@ -2778,7 +2810,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. */ @@ -2802,7 +2834,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { @@ -2883,8 +2916,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, build below suffices. */ if (nrules > 0 || dfa->mb_cur_max > 1) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) { /* There is not enough space, need realloc. */ uint32_t *new_array_start; @@ -2896,18 +2929,18 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, new_array_start = re_realloc (mbcset->range_starts, uint32_t, new_nranges); new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } /* Build the table for single byte characters. */ @@ -3241,17 +3274,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ if (sbc_idx < BITSET_WORDS) { - /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; - work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + work_tree = create_token_tree (dfa, NULL, NULL, &br_token); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; - /* Then join them by ALT node. */ - work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; } else { @@ -3270,7 +3303,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + goto parse_bracket_exp_espace; } return work_tree; @@ -3409,7 +3442,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) /* Build single byte matcing table for this equivalence class. */ char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; @@ -3421,11 +3454,15 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. */ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) @@ -3821,7 +3858,7 @@ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) node = node->parent; dup_node = dup_node->parent; if (!node) - return dup_root; + return dup_root; } node = node->right; p_new = &dup_node->right;