From c4f640f1277934d02bad810d887dec6033f1ad4c Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 6 Sep 2005 07:36:48 +0000 Subject: [PATCH] Change bitset word type from unsigned int to unsigned long int, as this has better performance on typical 64-bit hosts. Port bitset code to hosts with unusual word sizes. * lib/regcomp.c (build_equiv_class, build_charclass): (build_range_exp, build_collating_symbol): Prefer bitset to re_bitset_ptr_t in prototypes, when the actual argument is a bitset. This is merely a style issue, but it makes it clearer that an entire array is expected. (re_compile_fastmap_iter, init_dfa, init_word_char, optimize_subexps): * lib/regcomp.c (lower_subexp, parse_bracket_exp): (built_charclass_op): Port to the case where bitset_word is not the same as unsigned int. * lib/regex_internal.h (bitset_set, bitset_clear, bitset_contain): (bitset_not, bitset_merge, bitset_set_all, bitset_mask): Likewise. * lib/regexec.c (check_dst_limits_calc_pos_1): (check_subexp_matching_top): (build_trtable, group_nodes_into_DFAstates): Likewise. * lib/regcomp.c (re_compile_fastmap_iter, utf8_sb_map): (optimize_utf8): Don't assume that SBC_MAX is a multiple of BITSET_WORD_BITS. * lib/regex_internal.h (bitset_set_all, bitset_not): Likewise. * lib/regexec.c (group_nodes_into_DFAstates): Likewise. * lib/regcomp.c (utf8_sb_map): Don't assume UINT_MAX == 0xffffffff. * lib/regcomp.c (optimize_subexps, lower_subexp): Work even if bitset_word has holes in its bitwise representation. * lib/regex_internal.h (BITSET_WORD_BITS): Likewise. * lib/regexec.c (check_dst_limits_calc_pos_1): (heck_subexp_matching_top): Likewise. * lib/regex_internal.c (re_string_reconstruct): Don't assume UCHAR_MAX == 255. * lib/regex_internal.h (bitset_set_all): Likewise. * lib/regex_internal.h (BITSET_WORD_BITS): Renamed from UINT_BITS. All uses changed. (BITSET_WORDS): Renamed from BITSET_UINTS. All uses changed. (bitset_word): New type, replacing 'unsigned int' for bitset uses. All uses changed. (BITSET_WORD_MAX): New macro. (bitset_set, bitset_clear, bitset_contain, bitset_empty): (bitset_set_all, bitset_copy): Now inline functions, not macros. (bitset_empty, bitset_copy): Prefer sizeof (bitset) to multiplying it out ourselves. (bitset_not_merge): Remove; unused. (bitset_contain): Return bool, not unsigned int with one bit on. All callers changed. * lib/regexec.c (build_trtable): Don't assume bitset has no stricter alignment than re_node_set; do this by defining a new internal type struct dests_alloc and using it to allocate memory. * config/srclist.txt: Add glibc bug 1302. --- config/ChangeLog | 6 ++- config/srclist.txt | 6 ++- lib/ChangeLog | 49 ++++++++++++++++++++ lib/regcomp.c | 103 +++++++++++++++++++++++------------------ lib/regex_internal.c | 2 +- lib/regex_internal.h | 126 ++++++++++++++++++++++++++++++++++++++------------- lib/regexec.c | 78 +++++++++++++++++-------------- 7 files changed, 256 insertions(+), 114 deletions(-) diff --git a/config/ChangeLog b/config/ChangeLog index 20ac6e3d1..29f6fd5eb 100644 --- a/config/ChangeLog +++ b/config/ChangeLog @@ -1,6 +1,10 @@ +2005-09-06 Paul Eggert + + * srclist.txt: Add glibc bug 1302. + 2005-09-03 Simon Josefsson - * srclist.txt: Add glibc bugs 1293. + * srclist.txt: Add glibc bug 1293. 2005-09-02 Bruno Haible diff --git a/config/srclist.txt b/config/srclist.txt index 271c6e285..e7eab099a 100644 --- a/config/srclist.txt +++ b/config/srclist.txt @@ -1,4 +1,4 @@ -# $Id: srclist.txt,v 1.102 2005-09-03 09:05:55 jas Exp $ +# $Id: srclist.txt,v 1.103 2005-09-06 07:36:48 eggert Exp $ # Files for which we are not the source. See ./srclistvars.sh for the # variable definitions. @@ -112,6 +112,7 @@ $LIBCSRC/stdlib/getsubopt.c lib gpl # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1282 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1285 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1291 +# http://sources.redhat.com/bugzilla/show_bug.cgi?id=1302 #$LIBCSRC/posix/regcomp.c lib gpl # # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1238 @@ -142,6 +143,7 @@ $LIBCSRC/stdlib/getsubopt.c lib gpl # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1286 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1287 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1291 +# http://sources.redhat.com/bugzilla/show_bug.cgi?id=1302 #$LIBCSRC/posix/regex_internal.c lib gpl # # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1054 @@ -155,6 +157,7 @@ $LIBCSRC/stdlib/getsubopt.c lib gpl # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1281 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1285 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1291 +# http://sources.redhat.com/bugzilla/show_bug.cgi?id=1302 #$LIBCSRC/posix/regex_internal.h lib gpl # # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1216 @@ -172,6 +175,7 @@ $LIBCSRC/stdlib/getsubopt.c lib gpl # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1282 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1284 # http://sources.redhat.com/bugzilla/show_bug.cgi?id=1285 +# http://sources.redhat.com/bugzilla/show_bug.cgi?id=1302 #$LIBCSRC/posix/regexec.c lib gpl # # c89 changes $LIBCSRC/string/strdup.c lib gpl diff --git a/lib/ChangeLog b/lib/ChangeLog index 7ebaab37f..ce23aaeaf 100644 --- a/lib/ChangeLog +++ b/lib/ChangeLog @@ -1,3 +1,52 @@ +2005-09-05 Paul Eggert + + Change bitset word type from unsigned int to unsigned long int, + as this has better performance on typical 64-bit hosts. + Port bitset code to hosts with unusual word sizes. + * regcomp.c (build_equiv_class, build_charclass, build_range_exp): + (build_collating_symbol): + Prefer bitset to re_bitset_ptr_t in prototypes, when the actual + argument is a bitset. This is merely a style issue, but it makes + it clearer that an entire array is expected. + (re_compile_fastmap_iter, init_dfa, init_word_char, optimize_subexps): + * regcomp.c (lower_subexp, parse_bracket_exp, built_charclass_op): + Port to the case where bitset_word is not the same as unsigned int. + * regex_internal.h (bitset_set, bitset_clear, bitset_contain): + (bitset_not, bitset_merge, bitset_set_all, bitset_mask): + Likewise. + * regexec.c (check_dst_limits_calc_pos_1, check_subexp_matching_top): + (build_trtable, group_nodes_into_DFAstates): + Likewise. + * regcomp.c (re_compile_fastmap_iter, utf8_sb_map, optimize_utf8): + Don't assume that SBC_MAX is a multiple of BITSET_WORD_BITS. + * regex_internal.h (bitset_set_all, bitset_not): Likewise. + * regexec.c (group_nodes_into_DFAstates): Likewise. + * regcomp.c (utf8_sb_map): Don't assume UINT_MAX == 0xffffffff. + * regcomp.c (optimize_subexps, lower_subexp): + Work even if bitset_word has holes in its bitwise representation. + * regex_internal.h (BITSET_WORD_BITS): Likewise. + * regexec.c (check_dst_limits_calc_pos_1, check_subexp_matching_top): + Likewise. + * regex_internal.c (re_string_reconstruct): + Don't assume UCHAR_MAX == 255. + * regex_internal.h (bitset_set_all): Likewise. + * regex_internal.h (BITSET_WORD_BITS): Renamed from UINT_BITS. + All uses changed. + (BITSET_WORDS): Renamed from BITSET_UINTS. All uses changed. + (bitset_word): New type, replacing 'unsigned int' for bitset uses. + All uses changed. + (BITSET_WORD_MAX): New macro. + (bitset_set, bitset_clear, bitset_contain, bitset_empty): + (bitset_set_all, bitset_copy): Now inline functions, not macros. + (bitset_empty, bitset_copy): + Prefer sizeof (bitset) to multiplying it out ourselves. + (bitset_not_merge): Remove; unused. + (bitset_contain): Return bool, not unsigned int with one bit on. + All callers changed. + * regexec.c (build_trtable): Don't assume bitset has no stricter + alignment than re_node_set; do this by defining a new internal + type struct dests_alloc and using it to allocate memory. + 2005-09-02 Paul Eggert Check for arithmetic overflow when calculating sizes, to prevent diff --git a/lib/regcomp.c b/lib/regcomp.c index 5dc348838..279b20c4c 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -86,21 +86,21 @@ static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); #ifdef RE_ENABLE_I18N -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset sbcset, re_charset_t *mbcset, Idx *equiv_class_alloc, const unsigned char *name); static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, + bitset sbcset, re_charset_t *mbcset, Idx *char_class_alloc, const unsigned char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset sbcset, const unsigned char *name); static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, + bitset sbcset, const unsigned char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ @@ -334,9 +334,9 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, else if (type == SIMPLE_BRACKET) { int i, j, ch; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1u << j)) + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j)) re_set_fastmap (fastmap, icase, ch); } #ifdef RE_ENABLE_I18N @@ -356,13 +356,11 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, is a valid collation element, and don't catch 'b' since 'b' is the only collation element which starts from 'b'. */ - int j, ch; const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (table[ch] < 0) - re_set_fastmap (fastmap, icase, ch); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); } # else if (dfa->mb_cur_max > 1) @@ -546,11 +544,22 @@ weak_alias (__regerror, regerror) static const bitset utf8_sb_map = { /* Set the first 128 bits. */ -# if UINT_MAX == 0xffffffff - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff -# else -# error "Add case for new unsigned int size" +# if 2 < BITSET_WORDS + BITSET_WORD_MAX, +# endif +# if 4 < BITSET_WORDS + BITSET_WORD_MAX, +# endif +# if 6 < BITSET_WORDS + BITSET_WORD_MAX, # endif +# if 8 < BITSET_WORDS +# error "Invalid BITSET_WORDS" +# endif + (BITSET_WORD_MAX + >> (SBC_MAX % BITSET_WORD_BITS == 0 + ? 0 + : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) }; #endif @@ -858,20 +867,17 @@ init_dfa (re_dfa_t *dfa, Idx pat_len) { int i, j, ch; - dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS); + dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS); if (BE (dfa->sb_char == NULL, 0)) return REG_ESPACE; - /* Clear all bits by, then set those corresponding to single - byte chars. */ - bitset_empty (dfa->sb_char); - - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) { wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1u << j; + dfa->sb_char[i] |= (bitset_word) 1 << j; # ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; @@ -895,10 +901,10 @@ init_word_char (re_dfa_t *dfa) { int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1u << j; + dfa->word_char[i] |= (bitset_word) 1 << j; } /* Free the work area which are only used while compiling. */ @@ -1046,9 +1052,18 @@ optimize_utf8 (re_dfa_t *dfa) return; case SIMPLE_BRACKET: /* Just double check. */ - for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) - if (dfa->nodes[node].opr.sbcset[i]) - return; + { + int rshift = + (SBC_MAX / 2 % BITSET_WORD_BITS == 0 + ? 0 + : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS); + for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) + { + if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) + return; + rshift = 0; + } + } break; default: abort (); @@ -1224,8 +1239,8 @@ optimize_subexps (void *extra, bin_tree_t *node) node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map) - dfa->used_bkref_map &= ~(1u << other_idx); + if (other_idx < BITSET_WORD_BITS) + dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx); } return REG_NOERROR; @@ -1268,8 +1283,8 @@ lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL - && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map - || !(dfa->used_bkref_map & (1u << node->token.opr.idx)))) + && ! (node->token.opr.idx < BITSET_WORD_BITS + && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -2550,7 +2565,7 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, update it. */ static reg_errcode_t -build_range_exp (re_bitset_ptr_t sbcset, +build_range_exp (bitset sbcset, # ifdef RE_ENABLE_I18N re_charset_t *mbcset, Idx *range_alloc, # endif @@ -2666,7 +2681,7 @@ build_range_exp (re_bitset_ptr_t sbcset, pointer argument since we may update it. */ static reg_errcode_t -build_collating_symbol (re_bitset_ptr_t sbcset, +build_collating_symbol (bitset sbcset, # ifdef RE_ENABLE_I18N re_charset_t *mbcset, Idx *coll_sym_alloc, # endif @@ -2802,7 +2817,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, auto inline reg_errcode_t __attribute ((always_inline)) - build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset, + build_range_exp (bitset sbcset, re_charset_t *mbcset, Idx *range_alloc, bracket_elem_t *start_elem, bracket_elem_t *end_elem) { @@ -2882,7 +2897,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, auto inline reg_errcode_t __attribute ((always_inline)) - build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset, + build_collating_symbol (bitset sbcset, re_charset_t *mbcset, Idx *coll_sym_alloc, const unsigned char *name) { int32_t elem, idx; @@ -2966,7 +2981,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, _NL_COLLATE_SYMB_EXTRAMB); } #endif - sbcset = re_calloc (unsigned int, BITSET_UINTS); + sbcset = re_calloc (bitset_word, BITSET_WORDS); #ifdef RE_ENABLE_I18N mbcset = re_calloc (re_charset_t, 1); #endif /* RE_ENABLE_I18N */ @@ -3176,12 +3191,12 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (mbc_tree == NULL, 0)) goto parse_bracket_exp_espace; - for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) + for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) break; /* If there are no bits set in sbcset, there is no point of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ - if (sbc_idx < BITSET_UINTS) + if (sbc_idx < BITSET_WORDS) { /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; @@ -3316,7 +3331,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, is a pointer argument sinse we may update it. */ static reg_errcode_t -build_equiv_class (re_bitset_ptr_t sbcset, +build_equiv_class (bitset sbcset, #ifdef RE_ENABLE_I18N re_charset_t *mbcset, Idx *equiv_class_alloc, #endif @@ -3406,7 +3421,7 @@ build_equiv_class (re_bitset_ptr_t sbcset, is a pointer argument sinse we may update it. */ static reg_errcode_t -build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset, +build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset, #ifdef RE_ENABLE_I18N re_charset_t *mbcset, Idx *char_class_alloc, #endif @@ -3493,7 +3508,7 @@ build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans, re_token_t br_token; bin_tree_t *tree; - sbcset = re_calloc (unsigned int, BITSET_UINTS); + sbcset = re_calloc (bitset_word, BITSET_WORDS); #ifdef RE_ENABLE_I18N mbcset = re_calloc (re_charset_t, 1); #endif /* RE_ENABLE_I18N */ diff --git a/lib/regex_internal.c b/lib/regex_internal.c index 864b1a737..ad618cf66 100644 --- a/lib/regex_internal.c +++ b/lib/regex_internal.c @@ -675,7 +675,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) pstr->wcs[wcs_idx] = WEOF; if (pstr->mbs_allocated) - memset (pstr->mbs, 255, pstr->valid_len); + memset (pstr->mbs, -1, pstr->valid_len); } pstr->valid_raw_len = pstr->valid_len; pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) diff --git a/lib/regex_internal.h b/lib/regex_internal.h index b1d79ef9d..fe37c0ba0 100644 --- a/lib/regex_internal.h +++ b/lib/regex_internal.h @@ -147,22 +147,48 @@ typedef __re_idx_t Idx; /* A hash value, suitable for computing hash tables. */ typedef __re_size_t re_hashval_t; -/* Number of bits in an unsinged int. */ -#define UINT_BITS (sizeof (unsigned int) * CHAR_BIT) -/* Number of unsigned int in an bit_set. */ -#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS) -typedef unsigned int bitset[BITSET_UINTS]; -typedef unsigned int *re_bitset_ptr_t; -typedef const unsigned int *re_const_bitset_ptr_t; - -#define bitset_set(set,i) (set[i / UINT_BITS] |= 1u << i % UINT_BITS) -#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1u << i % UINT_BITS)) -#define bitset_contain(set,i) (set[i / UINT_BITS] & (1u << i % UINT_BITS)) -#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS) -#define bitset_set_all(set) \ - memset (set, 255, sizeof (unsigned int) * BITSET_UINTS) -#define bitset_copy(dest,src) \ - memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS) +/* An integer used to represent a set of bits. It must be unsigned, + and must be at least as wide as unsigned int. */ +typedef unsigned long int bitset_word; + +/* Maximum value of a bitset word. It must be useful in preprocessor + contexts, and must be consistent with bitset_word. */ +#define BITSET_WORD_MAX ULONG_MAX + +/* Number of bits in a bitset word. Avoid greater-than-32-bit + integers and unconditional shifts by more than 31 bits, as they're + not portable. */ +#if BITSET_WORD_MAX == 0xffffffff +# define BITSET_WORD_BITS 32 +#elif BITSET_WORD_MAX >> 31 >> 5 == 1 +# define BITSET_WORD_BITS 36 +#elif BITSET_WORD_MAX >> 31 >> 16 == 1 +# define BITSET_WORD_BITS 48 +#elif BITSET_WORD_MAX >> 31 >> 28 == 1 +# define BITSET_WORD_BITS 60 +#elif BITSET_WORD_MAX >> 31 >> 31 >> 1 == 1 +# define BITSET_WORD_BITS 64 +#elif BITSET_WORD_MAX >> 31 >> 31 >> 9 == 1 +# define BITSET_WORD_BITS 72 +#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 3 == 1 +# define BITSET_WORD_BITS 128 +#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 == 1 +# define BITSET_WORD_BITS 256 +#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 > 1 +# define BITSET_WORD_BITS 257 /* any value > SBC_MAX will do here */ +# if BITSET_WORD_BITS <= SBC_MAX +# error "Invalid SBC_MAX" +# endif +#else +# error "Add case for new bitset_word size" +#endif + +/* Number of bitset words in a bitset. */ +#define BITSET_WORDS ((SBC_MAX + BITSET_WORD_BITS - 1) / BITSET_WORD_BITS) + +typedef bitset_word bitset[BITSET_WORDS]; +typedef bitset_word *re_bitset_ptr_t; +typedef const bitset_word *re_const_bitset_ptr_t; #define PREV_WORD_CONSTRAINT 0x0001 #define PREV_NOTWORD_CONSTRAINT 0x0002 @@ -707,8 +733,8 @@ struct re_dfa_t Idx nbackref; /* The number of backreference in this dfa. */ /* Bitmap expressing which backreference is used. */ - unsigned int used_bkref_map; - unsigned int completed_bkref_map; + bitset_word used_bkref_map; + bitset_word completed_bkref_map; unsigned int has_plural_match : 1; /* If this dfa has "multibyte node", which is a backreference or @@ -759,36 +785,72 @@ typedef struct /* Inline functions for bitset operation. */ + static inline void -bitset_not (bitset set) +bitset_set (bitset set, Idx i) { - int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) - set[bitset_i] = ~set[bitset_i]; + set[i / BITSET_WORD_BITS] |= (bitset_word) 1 << i % BITSET_WORD_BITS; } static inline void -bitset_merge (bitset dest, const bitset src) +bitset_clear (bitset set, Idx i) +{ + set[i / BITSET_WORD_BITS] &= ~ ((bitset_word) 1 << i % BITSET_WORD_BITS); +} + +static inline bool +bitset_contain (const bitset set, Idx i) +{ + return (set[i / BITSET_WORD_BITS] >> i % BITSET_WORD_BITS) & 1; +} + +static inline void +bitset_empty (bitset set) { - int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) - dest[bitset_i] |= src[bitset_i]; + memset (set, 0, sizeof (bitset)); } static inline void -bitset_not_merge (bitset dest, const bitset src) +bitset_set_all (bitset set) +{ + memset (set, -1, sizeof (bitset_word) * (SBC_MAX / BITSET_WORD_BITS)); + if (SBC_MAX % BITSET_WORD_BITS != 0) + set[BITSET_WORDS - 1] = + ((bitset_word) 1 << SBC_MAX % BITSET_WORD_BITS) - 1; +} + +static inline void +bitset_copy (bitset dest, const bitset src) +{ + memcpy (dest, src, sizeof (bitset)); +} + +static inline void +bitset_not (bitset set) +{ + int i; + for (i = 0; i < SBC_MAX / BITSET_WORD_BITS; ++i) + set[i] = ~set[i]; + if (SBC_MAX % BITSET_WORD_BITS != 0) + set[BITSET_WORDS - 1] = + (((bitset_word) 1 << SBC_MAX % BITSET_WORD_BITS) - 1 + & ~set[BITSET_WORDS - 1]); +} + +static inline void +bitset_merge (bitset dest, const bitset src) { int i; - for (i = 0; i < BITSET_UINTS; ++i) - dest[i] |= ~src[i]; + for (i = 0; i < BITSET_WORDS; ++i) + dest[i] |= src[i]; } static inline void bitset_mask (bitset dest, const bitset src) { - int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) - dest[bitset_i] &= src[bitset_i]; + int i; + for (i = 0; i < BITSET_WORDS; ++i) + dest[i] &= src[i]; } #if defined RE_ENABLE_I18N diff --git a/lib/regexec.c b/lib/regexec.c index 4de1f7b87..a85077c93 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -1936,9 +1936,9 @@ check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, if (ent->node != node) continue; - if (subexp_idx - < CHAR_BIT * sizeof ent->eps_reachable_subexps_map - && !(ent->eps_reachable_subexps_map & (1u << subexp_idx))) + if (subexp_idx < BITSET_WORD_BITS + && !(ent->eps_reachable_subexps_map + & ((bitset_word) 1 << subexp_idx))) continue; /* Recurse trying to reach the OP_OPEN_SUBEXP and @@ -1964,9 +1964,9 @@ check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, if (cpos == 0 && (boundaries & 2)) return 0; - if (subexp_idx - < CHAR_BIT * sizeof ent->eps_reachable_subexps_map) - ent->eps_reachable_subexps_map &= ~(1u << subexp_idx); + if (subexp_idx < BITSET_WORD_BITS) + ent->eps_reachable_subexps_map &= + ~ ((bitset_word) 1 << subexp_idx); } while (ent++->more); } @@ -2422,8 +2422,9 @@ check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, { Idx node = cur_nodes->elems[node_idx]; if (dfa->nodes[node].type == OP_OPEN_SUBEXP - && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map - && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx)) + && dfa->nodes[node].opr.idx < BITSET_WORD_BITS + && (dfa->used_bkref_map + & ((bitset_word) 1 << dfa->nodes[node].opr.idx))) { err = match_ctx_add_subtop (mctx, node, str_idx); if (BE (err != REG_NOERROR, 0)) @@ -3292,7 +3293,7 @@ build_trtable (re_dfa_t *dfa, re_dfastate_t *state) Idx i, j; int ch; bool need_word_trtable = false; - unsigned int elem, mask; + bitset_word elem, mask; bool dests_node_malloced = false, dest_states_malloced = false; Idx ndests; /* Number of the destination states from `state'. */ re_dfastate_t **trtable; @@ -3301,22 +3302,27 @@ build_trtable (re_dfa_t *dfa, re_dfastate_t *state) bitset *dests_ch; bitset acceptable; + struct dests_alloc + { + re_node_set dests_node[SBC_MAX]; + bitset dests_ch[SBC_MAX]; + } *dests_alloc; + /* We build DFA states which corresponds to the destination nodes from `state'. `dests_node[i]' represents the nodes which i-th destination state contains, and `dests_ch[i]' represents the characters which i-th destination state accepts. */ - if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) - dests_node = (re_node_set *) - alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); + if (__libc_use_alloca (sizeof (struct dests_alloc))) + dests_alloc = (struct dests_alloc *) alloca (sizeof dests_alloc[0]); else { - dests_node = (re_node_set *) - malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); - if (BE (dests_node == NULL, 0)) + dests_alloc = re_malloc (struct dests_alloc, 1); + if (BE (dests_alloc == NULL, 0)) return false; dests_node_malloced = true; } - dests_ch = (bitset *) (dests_node + SBC_MAX); + dests_node = dests_alloc->dests_node; + dests_ch = dests_alloc->dests_ch; /* Initialize transiton table. */ state->word_trtable = state->trtable = NULL; @@ -3327,7 +3333,7 @@ build_trtable (re_dfa_t *dfa, re_dfastate_t *state) if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0)) { if (dests_node_malloced) - free (dests_node); + free (dests_alloc); if (ndests == 0) { state->trtable = re_calloc (re_dfastate_t *, SBC_MAX); @@ -3363,7 +3369,7 @@ out_free: for (i = 0; i < ndests; ++i) re_node_set_free (dests_node + i); if (dests_node_malloced) - free (dests_node); + free (dests_alloc); return false; } dest_states_malloced = true; @@ -3427,8 +3433,8 @@ out_free: goto out_free; /* For all characters ch...: */ - for (i = 0; i < BITSET_UINTS; ++i) - for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; + for (i = 0; i < BITSET_WORDS; ++i) + for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; elem; mask <<= 1, elem >>= 1, ++ch) if (BE (elem & 1, 0)) @@ -3457,8 +3463,8 @@ out_free: goto out_free; /* For all characters ch...: */ - for (i = 0; i < BITSET_UINTS; ++i) - for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; + for (i = 0; i < BITSET_WORDS; ++i) + for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; elem; mask <<= 1, elem >>= 1, ++ch) if (BE (elem & 1, 0)) @@ -3499,7 +3505,7 @@ out_free: re_node_set_free (dests_node + i); if (dests_node_malloced) - free (dests_node); + free (dests_alloc); return true; } @@ -3553,7 +3559,10 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, #ifdef RE_ENABLE_I18N else if (type == OP_UTF8_PERIOD) { - memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2); + if (SBC_MAX / 2 % BITSET_WORD_BITS == 0) + memset (accepts, -1, sizeof accepts / 2); + else + bitset_merge (accepts, utf8_sb_map); if (!(dfa->syntax & REG_DOT_NEWLINE)) bitset_clear (accepts, '\n'); if (dfa->syntax & REG_DOT_NOT_NULL) @@ -3569,8 +3578,7 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, { if (constraint & NEXT_NEWLINE_CONSTRAINT) { - unsigned int accepts_newline = - bitset_contain (accepts, NEWLINE_CHAR); + bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); bitset_empty (accepts); if (accepts_newline) bitset_set (accepts, NEWLINE_CHAR); @@ -3585,7 +3593,7 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, if (constraint & NEXT_WORD_CONSTRAINT) { - unsigned int any_set = 0; + bitset_word any_set = 0; if (type == CHARACTER && !node->word_char) { bitset_empty (accepts); @@ -3593,18 +3601,18 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, } #ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); else #endif - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= dfa->word_char[j]); if (!any_set) continue; } if (constraint & NEXT_NOTWORD_CONSTRAINT) { - unsigned int any_set = 0; + bitset_word any_set = 0; if (type == CHARACTER && node->word_char) { bitset_empty (accepts); @@ -3612,11 +3620,11 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, } #ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); else #endif - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~dfa->word_char[j]); if (!any_set) continue; @@ -3630,7 +3638,7 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, bitset intersec; /* Intersection sets, see below. */ bitset remains; /* Flags, see below. */ - unsigned int has_intersec, not_subset, not_consumed; + bitset_word has_intersec, not_subset, not_consumed; /* Optimization, skip if this state doesn't accept the character. */ if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) @@ -3638,7 +3646,7 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, /* Enumerate the intersection set of this state and `accepts'. */ has_intersec = 0; - for (k = 0; k < BITSET_UINTS; ++k) + for (k = 0; k < BITSET_WORDS; ++k) has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; /* And skip if the intersection set is empty. */ if (!has_intersec) @@ -3646,7 +3654,7 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, /* Then check if this state is a subset of `accepts'. */ not_subset = not_consumed = 0; - for (k = 0; k < BITSET_UINTS; ++k) + for (k = 0; k < BITSET_WORDS; ++k) { not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; -- 2.11.0