X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregcomp.c;h=81c5d4a53398973458ec50af1c17da2ec0f5ff0d;hb=f9e83fd48cb5c5a0b5a65751710138d42c814402;hp=ade91595ea158bdc2002dbb1ecae452c1db6b1ac;hpb=620e0d080536f8d8655e24b2965b05b95c00396d;p=gnulib.git diff --git a/lib/regcomp.c b/lib/regcomp.c index ade91595e..81c5d4a53 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002-2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -18,12 +18,11 @@ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, - int length, reg_syntax_t syntax); + size_t length, reg_syntax_t syntax); static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); -static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); -static void init_word_char (re_dfa_t *dfa); +static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); #ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); #endif /* RE_ENABLE_I18N */ @@ -46,38 +45,31 @@ static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, static reg_errcode_t calc_first (void *extra, bin_tree_t *node); static reg_errcode_t calc_next (void *extra, bin_tree_t *node); static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); -static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, - int top_clone_node, int root_node, - unsigned int constraint); -static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); -static int search_duplicated_node (re_dfa_t *dfa, int org_node, +static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); +static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, unsigned int constraint); static reg_errcode_t calc_eclosure (re_dfa_t *dfa); static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, - int node, int root); + Idx node, bool root); static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); -static int fetch_number (re_string_t *input, re_token_t *token, - reg_syntax_t syntax); -static void fetch_token (re_token_t *result, re_string_t *input, +static Idx fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax); static int peek_token (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); -static int peek_token_bracket (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); + reg_syntax_t syntax) internal_function; static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err); static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, - int nest, reg_errcode_t *err); + Idx nest, reg_errcode_t *err); static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, - int nest, reg_errcode_t *err); + Idx nest, reg_errcode_t *err); static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, - int nest, reg_errcode_t *err); + Idx nest, reg_errcode_t *err); static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, reg_syntax_t syntax, - int nest, reg_errcode_t *err); + Idx nest, reg_errcode_t *err); static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err); @@ -89,52 +81,34 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, re_token_t *token, int token_len, re_dfa_t *dfa, reg_syntax_t syntax, - int accept_hyphen); + bool accept_hyphen); static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); -#ifndef _LIBC -# ifdef RE_ENABLE_I18N -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, - int *coll_sym_alloc, - const unsigned char *name); -# else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - const unsigned char *name); -# endif /* not RE_ENABLE_I18N */ -#endif /* not _LIBC */ #ifdef RE_ENABLE_I18N -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, - int *equiv_class_alloc, + Idx *equiv_class_alloc, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, re_charset_t *mbcset, - int *char_class_alloc, + Idx *char_class_alloc, const unsigned char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, const unsigned char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ static bin_tree_t *build_charclass_op (re_dfa_t *dfa, - unsigned RE_TRANSLATE_TYPE trans, + RE_TRANSLATE_TYPE trans, const unsigned char *class_name, const unsigned char *extra, - int non_match, reg_errcode_t *err); + bool non_match, reg_errcode_t *err); static bin_tree_t *create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, re_token_type_t type); @@ -151,7 +125,7 @@ static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); POSIX doesn't require that we do anything for REG_NOERROR, but why not be nice? */ -const char __re_error_msgid[] attribute_hidden = +static const char __re_error_msgid[] = { #define REG_NOERROR_IDX 0 gettext_noop ("Success") /* REG_NOERROR */ @@ -205,7 +179,7 @@ const char __re_error_msgid[] attribute_hidden = gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ }; -const size_t __re_error_msgid_idx[] attribute_hidden = +static const size_t __re_error_msgid_idx[] = { REG_NOERROR_IDX, REG_NOMATCH_IDX, @@ -235,9 +209,17 @@ const size_t __re_error_msgid_idx[] attribute_hidden = Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. */ +#ifdef _LIBC +const char * +re_compile_pattern (pattern, length, bufp) + const char *pattern; + size_t length; + struct re_pattern_buffer *bufp; +#else /* size_t might promote */ const char * re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp) +#endif { reg_errcode_t ret; @@ -275,7 +257,8 @@ reg_syntax_t re_syntax_options; defined in regex.h. We return the old syntax. */ reg_syntax_t -re_set_syntax (reg_syntax_t syntax) +re_set_syntax (syntax) + reg_syntax_t syntax; { reg_syntax_t ret = re_syntax_options; @@ -287,7 +270,8 @@ weak_alias (__re_set_syntax, re_set_syntax) #endif int -re_compile_fastmap (struct re_pattern_buffer *bufp) +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; { re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; char *fastmap = bufp->fastmap; @@ -309,7 +293,7 @@ weak_alias (__re_compile_fastmap, re_compile_fastmap) static inline void __attribute ((always_inline)) -re_set_fastmap (char *fastmap, int icase, int ch) +re_set_fastmap (char *fastmap, bool icase, int ch) { fastmap[ch] = 1; if (icase) @@ -324,11 +308,11 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap) { re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; - int node_cnt; - int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); + Idx node_cnt; + bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) { - int node = init_state->nodes.elems[node_cnt]; + Idx node = init_state->nodes.elems[node_cnt]; re_token_type_t type = dfa->nodes[node].type; if (type == CHARACTER) @@ -337,7 +321,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, #ifdef RE_ENABLE_I18N if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { - unsigned char *buf = alloca (dfa->mb_cur_max), *p; + unsigned char buf[MB_LEN_MAX]; + unsigned char *p; wchar_t wc; mbstate_t state; @@ -347,67 +332,89 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, && dfa->nodes[node].type == CHARACTER && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; - memset (&state, 0, sizeof (state)); - if (mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf + memset (&state, '\0', sizeof (state)); + if (__mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) != (size_t) -1)) - re_set_fastmap (fastmap, 0, buf[0]); + re_set_fastmap (fastmap, false, buf[0]); } #endif } else if (type == SIMPLE_BRACKET) { - int i, j, ch; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) - re_set_fastmap (fastmap, icase, ch); + int i, ch; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + { + int j; + bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (w & ((bitset_word_t) 1 << j)) + re_set_fastmap (fastmap, icase, ch); + } } #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { - int i; re_charset_t *cset = dfa->nodes[node].opr.mbcset; - if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes - || cset->nranges || cset->nchar_classes) - { + Idx i; + # ifdef _LIBC - if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + /* See if we have to try all bytes which start multiple collation + elements. + e.g. In da_DK, we want to catch 'a' since "aa" is a valid + collation element, and don't catch 'b' since 'b' is + the only collation element which starts from 'b' (and + it is caught by SIMPLE_BRACKET). */ + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 + && (cset->ncoll_syms || cset->nranges)) { - /* In this case we want to catch the bytes which are - the first byte of any collation elements. - e.g. In da_DK, we want to catch 'a' since "aa" - is a valid collation element, and don't catch - 'b' since 'b' is the only collation element - which starts from 'b'. */ - int j, ch; const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (table[ch] < 0) - re_set_fastmap (fastmap, icase, ch); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); } -# else - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - re_set_fastmap (fastmap, icase, i); -# endif /* not _LIBC */ +# endif /* _LIBC */ + + /* See if we have to start the match at all multibyte characters, + i.e. where we would not find an invalid sequence. This only + applies to multibyte character sets; for single byte character + sets, the SIMPLE_BRACKET again suffices. */ + if (dfa->mb_cur_max > 1 + && (cset->nchar_classes || cset->non_match || cset->nranges +# ifdef _LIBC + || cset->nequiv_classes +# endif /* _LIBC */ + )) + { + unsigned char c = 0; + do + { + mbstate_t mbs; + memset (&mbs, 0, sizeof (mbs)); + if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) + re_set_fastmap (fastmap, false, (int) c); + } + while (++c != 0); } - for (i = 0; i < cset->nmbchars; ++i) + + else { - char buf[256]; - mbstate_t state; - memset (&state, '\0', sizeof (state)); - if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + /* ... Else catch all bytes which can start the mbchars. */ + for (i = 0; i < cset->nmbchars; ++i) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, 0, *(unsigned char *) buf); + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, false, *(unsigned char *) buf); + } } } } @@ -463,7 +470,10 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, the return codes and their meanings.) */ int -regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags) +regcomp (preg, pattern, cflags) + regex_t *_Restrict_ preg; + const char *_Restrict_ pattern; + int cflags; { reg_errcode_t ret; reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED @@ -521,8 +531,18 @@ weak_alias (__regcomp, regcomp) /* Returns a message corresponding to an error code, ERRCODE, returned from either regcomp or regexec. We don't use PREG here. */ +#ifdef _LIBC +size_t +regerror (errcode, preg, errbuf, errbuf_size) + int errcode; + const regex_t *_Restrict_ preg; + char *_Restrict_ errbuf; + size_t errbuf_size; +#else /* size_t might promote */ size_t -regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +regerror (int errcode, const regex_t *_Restrict_ preg, + char *_Restrict_ errbuf, size_t errbuf_size) +#endif { const char *msg; size_t msg_size; @@ -542,17 +562,13 @@ regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) if (BE (errbuf_size != 0, 1)) { + size_t cpy_size = msg_size; if (BE (msg_size > errbuf_size, 0)) { -#if defined HAVE_MEMPCPY || defined _LIBC - *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; -#else - memcpy (errbuf, msg, errbuf_size - 1); - errbuf[errbuf_size - 1] = 0; -#endif + cpy_size = errbuf_size - 1; + errbuf[cpy_size] = '\0'; } - else - memcpy (errbuf, msg, msg_size); + memcpy (errbuf, msg, cpy_size); } return msg_size; @@ -567,14 +583,22 @@ weak_alias (__regerror, regerror) UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is a worthwhile optimization. */ -static const bitset utf8_sb_map = +static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if UINT_MAX == 0xffffffff - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff -# else -# error "Add case for new unsigned int size" +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS + BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS + BITSET_WORD_MAX, BITSET_WORD_MAX, +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS + BITSET_WORD_MAX, # endif + (BITSET_WORD_MAX + >> (SBC_MAX % BITSET_WORD_BITS == 0 + ? 0 + : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) }; #endif @@ -582,7 +606,7 @@ static const bitset utf8_sb_map = static void free_dfa_content (re_dfa_t *dfa) { - int i, j; + Idx i, j; if (dfa->nodes) for (i = 0; i < dfa->nodes_len; ++i) @@ -611,7 +635,7 @@ free_dfa_content (re_dfa_t *dfa) re_dfastate_t *state = entry->array[j]; free_state (state); } - re_free (entry->array); + re_free (entry->array); } re_free (dfa->state_table); #ifdef RE_ENABLE_I18N @@ -630,7 +654,8 @@ free_dfa_content (re_dfa_t *dfa) /* Free dynamically allocated space used by PREG. */ void -regfree (regex_t *preg) +regfree (preg) + regex_t *preg; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; if (BE (dfa != NULL, 1)) @@ -722,7 +747,7 @@ libc_freeres_fn (free_mem) SYNTAX indicate regular expression's syntax. */ static reg_errcode_t -re_compile_internal (regex_t *preg, const char * pattern, int length, +re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax) { reg_errcode_t err = REG_NOERROR; @@ -754,8 +779,6 @@ re_compile_internal (regex_t *preg, const char * pattern, int length, } preg->used = sizeof (re_dfa_t); - __libc_lock_init (dfa->lock); - err = init_dfa (dfa, length); if (BE (err != REG_NOERROR, 0)) { @@ -765,12 +788,15 @@ re_compile_internal (regex_t *preg, const char * pattern, int length, return err; } #ifdef DEBUG + /* Note: length+1 will not overflow since it is checked in init_dfa. */ dfa->re_str = re_malloc (char, length + 1); strncpy (dfa->re_str, pattern, length + 1); #endif + __libc_lock_init (dfa->lock); + err = re_string_construct (®exp, pattern, length, preg->translate, - syntax & RE_ICASE, dfa); + (syntax & RE_ICASE) != 0, dfa); if (BE (err != REG_NOERROR, 0)) { re_compile_internal_free_return: @@ -820,25 +846,41 @@ re_compile_internal (regex_t *preg, const char * pattern, int length, as the initial length of some arrays. */ static reg_errcode_t -init_dfa (re_dfa_t *dfa, int pat_len) +init_dfa (re_dfa_t *dfa, size_t pat_len) { - int table_size; + __re_size_t table_size; #ifndef _LIBC char *codeset_name; #endif +#ifdef RE_ENABLE_I18N + size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); +#else + size_t max_i18n_object_size = 0; +#endif + size_t max_object_size = + MAX (sizeof (struct re_state_table_entry), + MAX (sizeof (re_token_t), + MAX (sizeof (re_node_set), + MAX (sizeof (regmatch_t), + max_i18n_object_size)))); memset (dfa, '\0', sizeof (re_dfa_t)); /* Force allocation of str_tree_storage the first time. */ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + /* Avoid overflows. The extra "/ 2" is for the table_size doubling + calculation below, and for similar doubling calculations + elsewhere. And it's <= rather than <, because some of the + doubling calculations add 1 afterwards. */ + if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) + return REG_ESPACE; + dfa->nodes_alloc = pat_len + 1; dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); - dfa->states_alloc = pat_len + 1; - /* table_size = 2 ^ ceil(log pat_len) */ - for (table_size = 1; table_size > 0; table_size <<= 1) + for (table_size = 1; ; table_size <<= 1) if (table_size > pat_len) break; @@ -853,20 +895,7 @@ init_dfa (re_dfa_t *dfa, int pat_len) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); #else -# ifdef HAVE_LANGINFO_CODESET codeset_name = nl_langinfo (CODESET); -# else - codeset_name = getenv ("LC_ALL"); - if (codeset_name == NULL || codeset_name[0] == '\0') - codeset_name = getenv ("LC_CTYPE"); - if (codeset_name == NULL || codeset_name[0] == '\0') - codeset_name = getenv ("LANG"); - if (codeset_name == NULL) - codeset_name = ""; - else if (strchr (codeset_name, '.') != NULL) - codeset_name = strchr (codeset_name, '.') + 1; -# endif - if (strcasecmp (codeset_name, "UTF-8") == 0 || strcasecmp (codeset_name, "UTF8") == 0) dfa->is_utf8 = 1; @@ -885,20 +914,17 @@ init_dfa (re_dfa_t *dfa, int pat_len) { int i, j, ch; - dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); if (BE (dfa->sb_char == NULL, 0)) return REG_ESPACE; - /* Clear all bits by, then set those corresponding to single - byte chars. */ - bitset_empty (dfa->sb_char); - - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) { wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1 << j; + dfa->sb_char[i] |= (bitset_word_t) 1 << j; # ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; @@ -918,14 +944,15 @@ init_dfa (re_dfa_t *dfa, int pat_len) character used by some operators like "\<", "\>", etc. */ static void +internal_function init_word_char (re_dfa_t *dfa) { int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1 << j; + dfa->word_char[i] |= (bitset_word_t) 1 << j; } /* Free the work area which are only used while compiling. */ @@ -952,7 +979,7 @@ free_workarea_compile (regex_t *preg) static reg_errcode_t create_initial_state (re_dfa_t *dfa) { - int first, i; + Idx first, i; reg_errcode_t err; re_node_set init_nodes; @@ -971,10 +998,10 @@ create_initial_state (re_dfa_t *dfa) if (dfa->nbackref > 0) for (i = 0; i < init_nodes.nelem; ++i) { - int node_idx = init_nodes.elems[i]; + Idx node_idx = init_nodes.elems[i]; re_token_type_t type = dfa->nodes[node_idx].type; - int clexp_idx; + Idx clexp_idx; if (type != OP_BACK_REF) continue; for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) @@ -990,10 +1017,13 @@ create_initial_state (re_dfa_t *dfa) if (type == OP_BACK_REF) { - int dest_idx = dfa->edests[node_idx].elems[0]; + Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + reg_errcode_t merge_err + = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + if (merge_err != REG_NOERROR) + return merge_err; i = 0; } } @@ -1034,17 +1064,20 @@ create_initial_state (re_dfa_t *dfa) static void optimize_utf8 (re_dfa_t *dfa) { - int node, i, mb_chars = 0, has_period = 0; + Idx node; + int i; + bool mb_chars = false; + bool has_period = false; for (node = 0; node < dfa->nodes_len; ++node) switch (dfa->nodes[node].type) { case CHARACTER: - if (dfa->nodes[node].opr.c >= 0x80) - mb_chars = 1; + if (dfa->nodes[node].opr.c >= ASCII_CHARS) + mb_chars = true; break; case ANCHOR: - switch (dfa->nodes[node].opr.idx) + switch (dfa->nodes[node].opr.ctx_type) { case LINE_FIRST: case LINE_LAST: @@ -1052,13 +1085,15 @@ optimize_utf8 (re_dfa_t *dfa) case BUF_LAST: break; default: - /* Word anchors etc. cannot be handled. */ + /* Word anchors etc. cannot be handled. It's okay to test + opr.ctx_type since constraints (for all DFA nodes) are + created by ORing one or more opr.ctx_type values. */ return; } break; case OP_PERIOD: - has_period = 1; - break; + has_period = true; + break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: @@ -1070,9 +1105,17 @@ optimize_utf8 (re_dfa_t *dfa) return; case SIMPLE_BRACKET: /* Just double check. */ - for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) - if (dfa->nodes[node].opr.sbcset[i]) - return; + { + int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 + ? 0 + : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); + for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) + { + if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) + return; + rshift = 0; + } + } break; default: abort (); @@ -1082,7 +1125,7 @@ optimize_utf8 (re_dfa_t *dfa) for (node = 0; node < dfa->nodes_len; ++node) { if (dfa->nodes[node].type == CHARACTER - && dfa->nodes[node].opr.c >= 0x80) + && dfa->nodes[node].opr.c >= ASCII_CHARS) dfa->nodes[node].mb_partial = 0; else if (dfa->nodes[node].type == OP_PERIOD) dfa->nodes[node].type = OP_UTF8_PERIOD; @@ -1105,18 +1148,18 @@ analyze (regex_t *preg) reg_errcode_t ret; /* Allocate arrays. */ - dfa->nexts = re_malloc (int, dfa->nodes_alloc); - dfa->org_indices = re_malloc (int, dfa->nodes_alloc); + dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); + dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL || dfa->eclosures == NULL, 0)) return REG_ESPACE; - dfa->subexp_map = re_malloc (int, preg->re_nsub); + dfa->subexp_map = re_malloc (Idx, preg->re_nsub); if (dfa->subexp_map != NULL) { - int i; + Idx i; for (i = 0; i < preg->re_nsub; i++) dfa->subexp_map[i] = i; preorder (dfa->str_tree, optimize_subexps, dfa); @@ -1151,7 +1194,7 @@ analyze (regex_t *preg) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1173,16 +1216,16 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), if that's the only child). */ while (node->left || node->right) if (node->left) - node = node->left; - else - node = node->right; + node = node->left; + else + node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; - if (node->parent == NULL) + if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; @@ -1216,7 +1259,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), prev = node; node = node->parent; if (!node) - return REG_NOERROR; + return REG_NOERROR; } node = node->right; } @@ -1239,17 +1282,17 @@ optimize_subexps (void *extra, bin_tree_t *node) } else if (node->token.type == SUBEXP - && node->left && node->left->token.type == SUBEXP) + && node->left && node->left->token.type == SUBEXP) { - int other_idx = node->left->token.opr.idx; + Idx other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) - node->left->parent = node; + node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < 8 * sizeof (dfa->used_bkref_map)) - dfa->used_bkref_map &= ~(1 << other_idx); + if (other_idx < BITSET_WORD_BITS) + dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); } return REG_NOERROR; @@ -1292,8 +1335,9 @@ lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL - && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) - || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) + && (node->token.opr.idx >= BITSET_WORD_BITS + || !(dfa->used_bkref_map + & ((bitset_word_t) 1 << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -1328,8 +1372,10 @@ calc_first (void *extra, bin_tree_t *node) { node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); - if (BE (node->node_idx == -1, 0)) - return REG_ESPACE; + if (BE (node->node_idx == REG_MISSING, 0)) + return REG_ESPACE; + if (node->token.type == ANCHOR) + dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; } return REG_NOERROR; } @@ -1351,7 +1397,7 @@ calc_next (void *extra, bin_tree_t *node) if (node->left) node->left->next = node->next; if (node->right) - node->right->next = node->next; + node->right->next = node->next; break; } return REG_NOERROR; @@ -1362,7 +1408,7 @@ static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; - int idx = node->node_idx; + Idx idx = node->node_idx; reg_errcode_t err = REG_NOERROR; switch (node->token.type) @@ -1377,7 +1423,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) case OP_DUP_ASTERISK: case OP_ALT: { - int left, right; + Idx left, right; dfa->has_plural_match = 1; if (node->left != NULL) left = node->left->first->node_idx; @@ -1387,8 +1433,8 @@ link_nfa_nodes (void *extra, bin_tree_t *node) right = node->right->first->node_idx; else right = node->next->node_idx; - assert (left > -1); - assert (right > -1); + assert (REG_VALID_INDEX (left)); + assert (REG_VALID_INDEX (right)); err = re_node_set_init_2 (dfa->edests + idx, left, right); } break; @@ -1402,7 +1448,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) case OP_BACK_REF: dfa->nexts[idx] = node->next->node_idx; if (node->token.type == OP_BACK_REF) - re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); + err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); break; default: @@ -1419,14 +1465,16 @@ link_nfa_nodes (void *extra, bin_tree_t *node) to their own constraint. */ static reg_errcode_t -duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, - int root_node, unsigned int init_constraint) +internal_function +duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, + Idx root_node, unsigned int init_constraint) { - int org_node, clone_node, ret; + Idx org_node, clone_node; + bool ok; unsigned int constraint = init_constraint; for (org_node = top_org_node, clone_node = top_clone_node;;) { - int org_dest, clone_dest; + Idx org_dest, clone_dest; if (dfa->nodes[org_node].type == OP_BACK_REF) { /* If the back reference epsilon-transit, its destination must @@ -1436,11 +1484,11 @@ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, org_dest = dfa->nexts[org_node]; re_node_set_empty (dfa->edests + clone_node); clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == -1, 0)) + if (BE (clone_dest == REG_MISSING, 0)) return REG_ESPACE; dfa->nexts[clone_node] = dfa->nexts[org_node]; - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) + ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (! ok, 0)) return REG_ESPACE; } else if (dfa->edests[org_node].nelem == 0) @@ -1457,27 +1505,22 @@ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, destination. */ org_dest = dfa->edests[org_node].elems[0]; re_node_set_empty (dfa->edests + clone_node); - if (dfa->nodes[org_node].type == ANCHOR) + /* If the node is root_node itself, it means the epsilon closure + has a loop. Then tie it to the destination of the root_node. */ + if (org_node == root_node && clone_node != org_node) { - /* In case of the node has another constraint, append it. */ - if (org_node == root_node && clone_node != org_node) - { - /* ...but if the node is root_node itself, it means the - epsilon closure have a loop, then tie it to the - destination of the root_node. */ - ret = re_node_set_insert (dfa->edests + clone_node, - org_dest); - if (BE (ret < 0, 0)) - return REG_ESPACE; - break; - } - constraint |= dfa->nodes[org_node].opr.ctx_type; + ok = re_node_set_insert (dfa->edests + clone_node, org_dest); + if (BE (! ok, 0)) + return REG_ESPACE; + break; } + /* In case the node has another constraint, append it. */ + constraint |= dfa->nodes[org_node].constraint; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == -1, 0)) + if (BE (clone_dest == REG_MISSING, 0)) return REG_ESPACE; - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) + ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (! ok, 0)) return REG_ESPACE; } else /* dfa->edests[org_node].nelem == 2 */ @@ -1488,15 +1531,15 @@ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, re_node_set_empty (dfa->edests + clone_node); /* Search for a duplicated node which satisfies the constraint. */ clone_dest = search_duplicated_node (dfa, org_dest, constraint); - if (clone_dest == -1) + if (clone_dest == REG_MISSING) { - /* There are no such a duplicated node, create a new one. */ + /* There is no such duplicated node, create a new one. */ reg_errcode_t err; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == -1, 0)) + if (BE (clone_dest == REG_MISSING, 0)) return REG_ESPACE; - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) + ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (! ok, 0)) return REG_ESPACE; err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node, constraint); @@ -1505,19 +1548,19 @@ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, } else { - /* There are a duplicated node which satisfy the constraint, + /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) + ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (! ok, 0)) return REG_ESPACE; } org_dest = dfa->edests[org_node].elems[1]; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == -1, 0)) + if (BE (clone_dest == REG_MISSING, 0)) return REG_ESPACE; - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) + ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (! ok, 0)) return REG_ESPACE; } org_node = org_dest; @@ -1529,32 +1572,32 @@ duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, /* Search for a node which is duplicated from the node ORG_NODE, and satisfies the constraint CONSTRAINT. */ -static int -search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint) +static Idx +search_duplicated_node (const re_dfa_t *dfa, Idx org_node, + unsigned int constraint) { - int idx; + Idx idx; for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) { if (org_node == dfa->org_indices[idx] && constraint == dfa->nodes[idx].constraint) return idx; /* Found. */ } - return -1; /* Not found. */ + return REG_MISSING; /* Not found. */ } /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. - Return the index of the new node, or -1 if insufficient storage is + Return the index of the new node, or REG_MISSING if insufficient storage is available. */ -static int -duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) +static Idx +duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) { - int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); - if (BE (dup_idx != -1, 1)) + Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); + if (BE (dup_idx != REG_MISSING, 1)) { dfa->nodes[dup_idx].constraint = constraint; - if (dfa->nodes[org_idx].type == ANCHOR) - dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; + dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; dfa->nodes[dup_idx].duplicated = 1; /* Store the index of the original node. */ @@ -1566,17 +1609,18 @@ duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) static reg_errcode_t calc_inveclosure (re_dfa_t *dfa) { - int src, idx, ret; + Idx src, idx; + bool ok; for (idx = 0; idx < dfa->nodes_len; ++idx) re_node_set_init_empty (dfa->inveclosures + idx); for (src = 0; src < dfa->nodes_len; ++src) { - int *elems = dfa->eclosures[src].elems; + Idx *elems = dfa->eclosures[src].elems; for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) { - ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); - if (BE (ret == -1, 0)) + ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); + if (BE (! ok, 0)) return REG_ESPACE; } } @@ -1589,11 +1633,12 @@ calc_inveclosure (re_dfa_t *dfa) static reg_errcode_t calc_eclosure (re_dfa_t *dfa) { - int node_idx, incomplete; + Idx node_idx; + bool incomplete; #ifdef DEBUG assert (dfa->nodes_len > 0); #endif - incomplete = 0; + incomplete = false; /* For each nodes, calculate epsilon closure. */ for (node_idx = 0; ; ++node_idx) { @@ -1603,25 +1648,25 @@ calc_eclosure (re_dfa_t *dfa) { if (!incomplete) break; - incomplete = 0; + incomplete = false; node_idx = 0; } #ifdef DEBUG - assert (dfa->eclosures[node_idx].nelem != -1); + assert (dfa->eclosures[node_idx].nelem != REG_MISSING); #endif /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; /* Calculate epsilon closure of `node_idx'. */ - err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); + err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); if (BE (err != REG_NOERROR, 0)) return err; if (dfa->eclosures[node_idx].nelem == 0) { - incomplete = 1; + incomplete = true; re_node_set_free (&eclosure_elem); } } @@ -1631,32 +1676,29 @@ calc_eclosure (re_dfa_t *dfa) /* Calculate epsilon closure of NODE. */ static reg_errcode_t -calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) +calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { reg_errcode_t err; - unsigned int constraint; - int i, incomplete; + Idx i; re_node_set eclosure; - incomplete = 0; + bool ok; + bool incomplete = false; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); if (BE (err != REG_NOERROR, 0)) return err; /* This indicates that we are calculating this node now. We reference this value to avoid infinite loop. */ - dfa->eclosures[node].nelem = -1; + dfa->eclosures[node].nelem = REG_MISSING; - constraint = ((dfa->nodes[node].type == ANCHOR) - ? dfa->nodes[node].opr.ctx_type : 0); - /* If the current node has constraints, duplicate all nodes. - Since they must inherit the constraints. */ - if (constraint + /* If the current node has constraints, duplicate all nodes + since they must inherit the constraints. */ + if (dfa->nodes[node].constraint && dfa->edests[node].nelem && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) { - int org_node, cur_node; - org_node = cur_node = node; - err = duplicate_node_closure (dfa, node, node, node, constraint); + err = duplicate_node_closure (dfa, node, node, node, + dfa->nodes[node].constraint); if (BE (err != REG_NOERROR, 0)) return err; } @@ -1666,37 +1708,41 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) for (i = 0; i < dfa->edests[node].nelem; ++i) { re_node_set eclosure_elem; - int edest = dfa->edests[node].elems[i]; + Idx edest = dfa->edests[node].elems[i]; /* If calculating the epsilon closure of `edest' is in progress, return intermediate result. */ - if (dfa->eclosures[edest].nelem == -1) + if (dfa->eclosures[edest].nelem == REG_MISSING) { - incomplete = 1; + incomplete = true; continue; } /* If we haven't calculated the epsilon closure of `edest' yet, calculate now. Otherwise use calculated epsilon closure. */ if (dfa->eclosures[edest].nelem == 0) { - err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); + err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); if (BE (err != REG_NOERROR, 0)) return err; } else eclosure_elem = dfa->eclosures[edest]; /* Merge the epsilon closure of `edest'. */ - re_node_set_merge (&eclosure, &eclosure_elem); + err = re_node_set_merge (&eclosure, &eclosure_elem); + if (BE (err != REG_NOERROR, 0)) + return err; /* If the epsilon closure of `edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) { - incomplete = 1; + incomplete = true; re_node_set_free (&eclosure_elem); } } - /* Epsilon closures include itself. */ - re_node_set_insert (&eclosure, node); + /* An epsilon closure includes itself. */ + ok = re_node_set_insert (&eclosure, node); + if (BE (! ok, 0)) + return REG_ESPACE; if (incomplete && !root) dfa->eclosures[node].nelem = 0; else @@ -1711,6 +1757,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) We must not use this function inside bracket expressions. */ static void +internal_function fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) { re_string_skip_bytes (input, peek_token (result, input, syntax)); @@ -1720,6 +1767,7 @@ fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) We must not use this function inside bracket expressions. */ static int +internal_function peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -1958,6 +2006,7 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) We must not use this function out of bracket expressions. */ static int +internal_function peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -2089,7 +2138,7 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, static bin_tree_t * parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, - reg_syntax_t syntax, int nest, reg_errcode_t *err) + reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree, *branch = NULL; @@ -2130,9 +2179,9 @@ parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, static bin_tree_t * parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, - reg_syntax_t syntax, int nest, reg_errcode_t *err) + reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { - bin_tree_t *tree, *exp; + bin_tree_t *tree, *expr; re_dfa_t *dfa = (re_dfa_t *) preg->buffer; tree = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) @@ -2141,14 +2190,14 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, while (token->type != OP_ALT && token->type != END_OF_RE && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) { - exp = parse_expression (regexp, preg, token, syntax, nest, err); - if (BE (*err != REG_NOERROR && exp == NULL, 0)) + expr = parse_expression (regexp, preg, token, syntax, nest, err); + if (BE (*err != REG_NOERROR && expr == NULL, 0)) { return NULL; } - if (tree != NULL && exp != NULL) + if (tree != NULL && expr != NULL) { - tree = create_tree (dfa, tree, exp, CONCAT); + tree = create_tree (dfa, tree, expr, CONCAT); if (tree == NULL) { *err = REG_ESPACE; @@ -2156,8 +2205,8 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, } } else if (tree == NULL) - tree = exp; - /* Otherwise exp == NULL, we don't need to create new tree. */ + tree = expr; + /* Otherwise expr == NULL, we don't need to create new tree. */ } return tree; } @@ -2170,7 +2219,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, static bin_tree_t * parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, - reg_syntax_t syntax, int nest, reg_errcode_t *err) + reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2277,7 +2326,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, && dfa->word_ops_used == 0) init_word_char (dfa); if (token->opr.ctx_type == WORD_DELIM - || token->opr.ctx_type == NOT_WORD_DELIM) + || token->opr.ctx_type == NOT_WORD_DELIM) { bin_tree_t *tree_first, *tree_last; if (token->opr.ctx_type == WORD_DELIM) @@ -2285,13 +2334,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, token->opr.ctx_type = WORD_FIRST; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = WORD_LAST; - } - else - { + } + else + { token->opr.ctx_type = INSIDE_WORD; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = INSIDE_NOTWORD; - } + } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) @@ -2386,7 +2435,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, static bin_tree_t * parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, - reg_syntax_t syntax, int nest, reg_errcode_t *err) + reg_syntax_t syntax, Idx nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2402,11 +2451,13 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + *err = REG_EPAREN; if (BE (*err != REG_NOERROR, 0)) return NULL; } - dfa->completed_bkref_map |= 1 << cur_nsub; + + if (cur_nsub <= '9' - '1') + dfa->completed_bkref_map |= 1 << cur_nsub; tree = create_tree (dfa, tree, NULL, SUBEXP); if (BE (tree == NULL, 0)) @@ -2425,14 +2476,14 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) { bin_tree_t *tree = NULL, *old_tree = NULL; - int i, start, end, start_idx = re_string_cur_idx (regexp); + Idx i, start, end, start_idx = re_string_cur_idx (regexp); re_token_t start_token = *token; if (token->type == OP_OPEN_DUP_NUM) { end = 0; start = fetch_number (regexp, token, syntax); - if (start == -1) + if (start == REG_MISSING) { if (token->type == CHARACTER && token->opr.c == ',') start = 0; /* We treat "{,m}" as "{0,m}". */ @@ -2442,14 +2493,14 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return NULL; } } - if (BE (start != -2, 1)) + if (BE (start != REG_ERROR, 1)) { /* We treat "{n}" as "{n,n}". */ end = ((token->type == OP_CLOSE_DUP_NUM) ? start : ((token->type == CHARACTER && token->opr.c == ',') - ? fetch_number (regexp, token, syntax) : -2)); + ? fetch_number (regexp, token, syntax) : REG_ERROR)); } - if (BE (start == -2 || end == -2, 0)) + if (BE (start == REG_ERROR || end == REG_ERROR, 0)) { /* Invalid sequence. */ if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) @@ -2471,7 +2522,8 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return elem; } - if (BE (end != -1 && start > end, 0)) + if (BE ((end != REG_MISSING && start > end) + || token->type != OP_CLOSE_DUP_NUM, 0)) { /* First number greater than second. */ *err = REG_BADBR; @@ -2481,7 +2533,7 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, else { start = (token->type == OP_DUP_PLUS) ? 1 : 0; - end = (token->type == OP_DUP_QUESTION) ? 1 : -1; + end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; } fetch_token (token, regexp, syntax); @@ -2519,24 +2571,30 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, if (elem->token.type == SUBEXP) postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); - tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); + tree = create_tree (dfa, elem, NULL, + (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); if (BE (tree == NULL, 0)) goto parse_dup_op_espace; - /* This loop is actually executed only when end != -1, +/* From gnulib's "intprops.h": + True if the arithmetic type T is signed. */ +#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) + + /* This loop is actually executed only when end != REG_MISSING, to rewrite {0,n} as ((...?)?)?... We have already created the start+1-th copy. */ - for (i = start + 2; i <= end; ++i) - { - elem = duplicate_tree (elem, dfa); - tree = create_tree (dfa, tree, elem, CONCAT); - if (BE (elem == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - - tree = create_tree (dfa, tree, NULL, OP_ALT); - if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; - } + if (TYPE_SIGNED (Idx) || end != REG_MISSING) + for (i = start + 2; i <= end; ++i) + { + elem = duplicate_tree (elem, dfa); + tree = create_tree (dfa, tree, elem, CONCAT); + if (BE (elem == NULL || tree == NULL, 0)) + goto parse_dup_op_espace; + + tree = create_tree (dfa, tree, NULL, OP_ALT); + if (BE (tree == NULL, 0)) + goto parse_dup_op_espace; + } if (old_tree) tree = create_tree (dfa, old_tree, tree, CONCAT); @@ -2561,11 +2619,20 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, update it. */ static reg_errcode_t -build_range_exp (re_bitset_ptr_t sbcset, +internal_function # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, int *range_alloc, -# endif - bracket_elem_t *start_elem, bracket_elem_t *end_elem) +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + re_charset_t *mbcset, + Idx *range_alloc, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) +# else /* not RE_ENABLE_I18N */ +build_range_exp (const reg_syntax_t syntax, + bitset_t sbcset, + const bracket_elem_t *start_elem, + const bracket_elem_t *end_elem) +# endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; /* Equivalence Classes and Character Classes can't be a range start/end. */ @@ -2585,7 +2652,8 @@ build_range_exp (re_bitset_ptr_t sbcset, # ifdef RE_ENABLE_I18N { wchar_t wc; - wint_t start_wc, end_wc; + wint_t start_wc; + wint_t end_wc; wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch @@ -2602,7 +2670,9 @@ build_range_exp (re_bitset_ptr_t sbcset, return REG_ECOLLATE; cmp_buf[0] = start_wc; cmp_buf[4] = end_wc; - if (wcscoll (cmp_buf, cmp_buf + 4) > 0) + + if (BE ((syntax & RE_NO_EMPTY_RANGES) + && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2612,21 +2682,21 @@ build_range_exp (re_bitset_ptr_t sbcset, no MBCSET if dfa->mb_cur_max == 1. */ if (mbcset) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) + { /* There is not enough space, need realloc. */ wchar_t *new_array_start, *new_array_end; - int new_nranges; + Idx new_nranges; /* +1 in case of mbcset->nranges is 0. */ new_nranges = 2 * mbcset->nranges + 1; /* Use realloc since mbcset->range_starts and mbcset->range_ends are NULL if *range_alloc == 0. */ new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); + new_nranges); new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) return REG_ESPACE; @@ -2634,10 +2704,10 @@ build_range_exp (re_bitset_ptr_t sbcset, mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; - } + } - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; } /* Build the table for single byte characters. */ @@ -2678,9 +2748,10 @@ build_range_exp (re_bitset_ptr_t sbcset, pointer argument since we may update it. */ static reg_errcode_t -build_collating_symbol (re_bitset_ptr_t sbcset, +internal_function +build_collating_symbol (bitset_t sbcset, # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, int *coll_sym_alloc, + re_charset_t *mbcset, Idx *coll_sym_alloc, # endif const unsigned char *name) { @@ -2722,28 +2793,33 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { int32_t hash = elem_hash ((const char *) name, name_len); int32_t elem = hash % table_size; - int32_t second = hash % (table_size - 2); - while (symb_table[2 * elem] != 0) - { - /* First compare the hashing value. */ - if (symb_table[2 * elem] == hash - /* Compare the length of the name. */ - && name_len == extra[symb_table[2 * elem + 1]] - /* Compare the name. */ - && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], - name_len) == 0) + if (symb_table[2 * elem] != 0) + { + int32_t second = hash % (table_size - 2) + 1; + + do { - /* Yep, this is the entry. */ - break; - } + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + /* Compare the length of the name. */ + && name_len == extra[symb_table[2 * elem + 1]] + /* Compare the name. */ + && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], + name_len) == 0) + { + /* Yep, this is the entry. */ + break; + } - /* Next entry. */ - elem += second; + /* Next entry. */ + elem += second; + } + while (symb_table[2 * elem] != 0); } return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. */ @@ -2767,7 +2843,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { @@ -2819,8 +2896,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, __attribute ((always_inline)) build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) re_charset_t *mbcset; - int *range_alloc; - re_bitset_ptr_t sbcset; + Idx *range_alloc; + bitset_t sbcset; bracket_elem_t *start_elem, *end_elem; { unsigned int ch; @@ -2848,31 +2925,31 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, build below suffices. */ if (nrules > 0 || dfa->mb_cur_max > 1) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) { /* There is not enough space, need realloc. */ uint32_t *new_array_start; uint32_t *new_array_end; - int new_nranges; + Idx new_nranges; /* +1 in case of mbcset->nranges is 0. */ new_nranges = 2 * mbcset->nranges + 1; new_array_start = re_realloc (mbcset->range_starts, uint32_t, new_nranges); new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } /* Build the table for single byte characters. */ @@ -2902,8 +2979,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, __attribute ((always_inline)) build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) re_charset_t *mbcset; - int *coll_sym_alloc; - re_bitset_ptr_t sbcset; + Idx *coll_sym_alloc; + bitset_t sbcset; const unsigned char *name; { int32_t elem, idx; @@ -2934,7 +3011,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { /* Not enough, realloc it. */ /* +1 in case of mbcset->ncoll_syms is 0. */ - int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; + Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; /* Use realloc since mbcset->coll_syms is NULL if *alloc == 0. */ int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, @@ -2964,13 +3041,13 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, re_bitset_ptr_t sbcset; #ifdef RE_ENABLE_I18N re_charset_t *mbcset; - int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; - int equiv_class_alloc = 0, char_class_alloc = 0; + Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; + Idx equiv_class_alloc = 0, char_class_alloc = 0; #endif /* not RE_ENABLE_I18N */ - int non_match = 0; + bool non_match = false; bin_tree_t *work_tree; int token_len; - int first_round = 1; + bool first_round = true; #ifdef _LIBC collseqmb = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); @@ -2980,7 +3057,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, /* if (MB_CUR_MAX > 1) */ - collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); + collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); @@ -2988,7 +3065,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, _NL_COLLATE_SYMB_EXTRAMB); } #endif - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3013,9 +3090,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef RE_ENABLE_I18N mbcset->non_match = 1; #endif /* not RE_ENABLE_I18N */ - non_match = 1; + non_match = true; if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set (sbcset, '\0'); + bitset_set (sbcset, '\n'); re_string_skip_bytes (regexp, token_len); /* Skip a token. */ token_len = peek_token_bracket (token, regexp, syntax); if (BE (token->type == END_OF_RE, 0)) @@ -3035,7 +3112,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; reg_errcode_t ret; - int token_len2 = 0, is_range_exp = 0; + int token_len2 = 0; + bool is_range_exp = false; re_token_t token2; start_elem.opr.name = start_name_buf; @@ -3046,7 +3124,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, *err = ret; goto parse_bracket_exp_free_return; } - first_round = 0; + first_round = false; /* Get information about the next token. We need it in any case. */ token_len = peek_token_bracket (token, regexp, syntax); @@ -3075,15 +3153,15 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, token->type = CHARACTER; } else - is_range_exp = 1; + is_range_exp = true; } } - if (is_range_exp == 1) + if (is_range_exp == true) { end_elem.opr.name = end_name_buf; ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, - dfa, syntax, 1); + dfa, syntax, true); if (BE (ret != REG_NOERROR, 0)) { *err = ret; @@ -3097,11 +3175,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N - *err = build_range_exp (sbcset, + *err = build_range_exp (syntax, sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, &range_alloc, &start_elem, &end_elem); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) @@ -3198,24 +3276,24 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (mbc_tree == NULL, 0)) goto parse_bracket_exp_espace; - for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) + for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) break; /* If there are no bits set in sbcset, there is no point of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ - if (sbc_idx < BITSET_UINTS) + if (sbc_idx < BITSET_WORDS) { - /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; - work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + work_tree = create_token_tree (dfa, NULL, NULL, &br_token); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; - /* Then join them by ALT node. */ - work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; } else { @@ -3234,7 +3312,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + goto parse_bracket_exp_espace; } return work_tree; @@ -3253,7 +3331,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token, int token_len, re_dfa_t *dfa, - reg_syntax_t syntax, int accept_hyphen) + reg_syntax_t syntax, bool accept_hyphen) { #ifdef RE_ENABLE_I18N int cur_char_size; @@ -3338,13 +3416,14 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, is a pointer argument sinse we may update it. */ static reg_errcode_t -build_equiv_class (re_bitset_ptr_t sbcset, #ifdef RE_ENABLE_I18N - re_charset_t *mbcset, int *equiv_class_alloc, -#endif - const unsigned char *name) +build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, + Idx *equiv_class_alloc, const unsigned char *name) +#else /* not RE_ENABLE_I18N */ +build_equiv_class (bitset_t sbcset, const unsigned char *name) +#endif /* not RE_ENABLE_I18N */ { -#if defined _LIBC +#ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { @@ -3372,7 +3451,7 @@ build_equiv_class (re_bitset_ptr_t sbcset, /* Build single byte matcing table for this equivalence class. */ char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; @@ -3384,11 +3463,15 @@ build_equiv_class (re_bitset_ptr_t sbcset, if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. */ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) @@ -3400,7 +3483,7 @@ build_equiv_class (re_bitset_ptr_t sbcset, { /* Not enough, realloc it. */ /* +1 in case of mbcset->nequiv_classes is 0. */ - int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; + Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; /* Use realloc since the array is NULL if *alloc == 0. */ int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, int32_t, @@ -3429,11 +3512,14 @@ build_equiv_class (re_bitset_ptr_t sbcset, is a pointer argument sinse we may update it. */ static reg_errcode_t -build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset, #ifdef RE_ENABLE_I18N - re_charset_t *mbcset, int *char_class_alloc, -#endif +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, + re_charset_t *mbcset, Idx *char_class_alloc, + const unsigned char *class_name, reg_syntax_t syntax) +#else /* not RE_ENABLE_I18N */ +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, const unsigned char *class_name, reg_syntax_t syntax) +#endif /* not RE_ENABLE_I18N */ { int i; const char *name = (const char *) class_name; @@ -3450,7 +3536,7 @@ build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset, { /* Not enough, realloc it. */ /* +1 in case of mbcset->nchar_classes is 0. */ - int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; + Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; /* Use realloc since array is NULL if *alloc == 0. */ wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, new_char_class_alloc); @@ -3463,39 +3549,45 @@ build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset, #endif /* RE_ENABLE_I18N */ #define BUILD_CHARCLASS_LOOP(ctype_func) \ - for (i = 0; i < SBC_MAX; ++i) \ + do { \ + if (BE (trans != NULL, 0)) \ { \ - if (ctype_func (i)) \ - { \ - int ch = trans ? trans[i] : i; \ - bitset_set (sbcset, ch); \ - } \ - } + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, trans[i]); \ + } \ + else \ + { \ + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, i); \ + } \ + } while (0) if (strcmp (name, "alnum") == 0) - BUILD_CHARCLASS_LOOP (isalnum) + BUILD_CHARCLASS_LOOP (isalnum); else if (strcmp (name, "cntrl") == 0) - BUILD_CHARCLASS_LOOP (iscntrl) + BUILD_CHARCLASS_LOOP (iscntrl); else if (strcmp (name, "lower") == 0) - BUILD_CHARCLASS_LOOP (islower) + BUILD_CHARCLASS_LOOP (islower); else if (strcmp (name, "space") == 0) - BUILD_CHARCLASS_LOOP (isspace) + BUILD_CHARCLASS_LOOP (isspace); else if (strcmp (name, "alpha") == 0) - BUILD_CHARCLASS_LOOP (isalpha) + BUILD_CHARCLASS_LOOP (isalpha); else if (strcmp (name, "digit") == 0) - BUILD_CHARCLASS_LOOP (isdigit) + BUILD_CHARCLASS_LOOP (isdigit); else if (strcmp (name, "print") == 0) - BUILD_CHARCLASS_LOOP (isprint) + BUILD_CHARCLASS_LOOP (isprint); else if (strcmp (name, "upper") == 0) - BUILD_CHARCLASS_LOOP (isupper) + BUILD_CHARCLASS_LOOP (isupper); else if (strcmp (name, "blank") == 0) - BUILD_CHARCLASS_LOOP (isblank) + BUILD_CHARCLASS_LOOP (isblank); else if (strcmp (name, "graph") == 0) - BUILD_CHARCLASS_LOOP (isgraph) + BUILD_CHARCLASS_LOOP (isgraph); else if (strcmp (name, "punct") == 0) - BUILD_CHARCLASS_LOOP (ispunct) + BUILD_CHARCLASS_LOOP (ispunct); else if (strcmp (name, "xdigit") == 0) - BUILD_CHARCLASS_LOOP (isxdigit) + BUILD_CHARCLASS_LOOP (isxdigit); else return REG_ECTYPE; @@ -3503,21 +3595,21 @@ build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset, } static bin_tree_t * -build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans, +build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, const unsigned char *class_name, - const unsigned char *extra, - int non_match, reg_errcode_t *err) + const unsigned char *extra, bool non_match, + reg_errcode_t *err) { re_bitset_ptr_t sbcset; #ifdef RE_ENABLE_I18N re_charset_t *mbcset; - int alloc = 0; + Idx alloc = 0; #endif /* not RE_ENABLE_I18N */ reg_errcode_t ret; re_token_t br_token; bin_tree_t *tree; - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3535,10 +3627,6 @@ build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans, if (non_match) { #ifdef RE_ENABLE_I18N - /* - if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set(cset->sbcset, '\0'); - */ mbcset->non_match = 1; #endif /* not RE_ENABLE_I18N */ } @@ -3616,25 +3704,27 @@ build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans, /* This is intended for the expressions like "a{1,3}". Fetch a number from `input', and return the number. - Return -1, if the number field is empty like "{,1}". - Return -2, If an error is occured. */ + Return REG_MISSING if the number field is empty like "{,1}". + Return REG_ERROR if an error occurred. */ -static int +static Idx fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) { - int num = -1; + Idx num = REG_MISSING; unsigned char c; while (1) { fetch_token (token, input, syntax); c = token->opr.c; if (BE (token->type == END_OF_RE, 0)) - return -2; + return REG_ERROR; if (token->type == OP_CLOSE_DUP_NUM || c == ',') break; - num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) - ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); - num = (num > RE_DUP_MAX) ? -2 : num; + num = ((token->type != CHARACTER || c < '0' || '9' < c + || num == REG_ERROR) + ? REG_ERROR + : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); + num = (num > RE_DUP_MAX) ? REG_ERROR : num; } return num; } @@ -3693,7 +3783,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, tree->token.opt_subexp = 0; tree->first = NULL; tree->next = NULL; - tree->node_idx = -1; + tree->node_idx = REG_MISSING; if (left != NULL) left->parent = tree; @@ -3708,7 +3798,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node) { - int idx = (int) (long) extra; + Idx idx = (Idx) (long) extra; if (node->token.type == SUBEXP && node->token.opr.idx == idx) node->token.opt_subexp = 1; @@ -3777,7 +3867,7 @@ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) node = node->parent; dup_node = dup_node->parent; if (!node) - return dup_root; + return dup_root; } node = node->right; p_new = &dup_node->right;