From 5a460138e9cb595387de6db5a86a63c12d63d181 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 7 Feb 2012 22:47:01 -0800 Subject: [PATCH] regex: merge glibc changes * lib/regcomp.c (init_dfa): Tighten overflow checks to test for IDX_MAX too, since IDX_MAX can be much less than SIZE_MAX. (init_word_char): Work even if bitset words are not exactly 32 or 64 bits wide. Don't assume there are no padding bits. * lib/regex.c [_LIBC]: Do not include . [!_LIBC]: Add pragmas to ignore -Wsuggest-attributes=pure and -Wtype-limits. * lib/regex.h (__USE_GNU): Renamed from __USE_GNU_REGEX, to avoid needless disagreement with glibc. All uses changed. Define it to 1 only if _GNU_SOURCE, to match glibc. (_REG_RM_NAME): Remove; no longer needed, since the names in question are now all protected by __USE_GNU. (_REG_RE_NAME): Remove; replaced by glibc's __REPB_PREFIX. (REG_TRANSLATE_TYPE): Remove; replaced by glibc's __RE_TRANSLATE_TYPE. * lib/regex_internal.h (MIN): New macro. 2012-01-03 Ulrich Drepper * lib/regcomp.c (init_word_char): Optimize regex a bit. 2011-12-30 Jakub Jelinek * lib/regex_internal.c (re_string_fetch_byte_case): Fix up regcomp/regexec. The problem is that parse_bracket_symbol is miscompiled, and it turns out it is because of an incorrect attribute on re_string_fetch_byte_case. Unlike re_string_peek_byte_case, this one is really not pure, it modifies memory (increments pstr->cur_idx), and with the pure attribute GCC assumed it doesn't and it cached the presumed value of regexp->cur_idx in a variable across the for (;; ++i) { if (i >= BRACKET_NAME_BUF_SIZE) return REG_EBRACK; if (token->type == OP_OPEN_CHAR_CLASS) ch = re_string_fetch_byte_case (regexp); else ch = re_string_fetch_byte (regexp); if (re_string_eoi(regexp)) return REG_EBRACK; if (ch == delim && re_string_peek_byte (regexp, 0) == ']') break; elem->opr.name[i] = ch; } 2011-11-29 Andreas Schwab * lib/regcomp.c (build_equiv_class): Fix access after end of search string in regex matcher. 2011-11-12 Ulrich Drepper * lib/regex_internal.c, lib/regex_internal.h: Fix warnings in regex. 2011-10-12 Ulrich Drepper * lib/regcomp.c (parse_branch): One more regex memory leak fixed. 2011-10-11 Ulrich Drepper * lib/regcomp.c (parse_branch, parse_sub_exp): More regex memory leak fixes and tests. (parse_sub_exp, parse_bracket_exp): Fix memory leak for some invalid regular expressions. 2011-05-28 Ulrich Drepper * lib/regex_internal.c, lib/regexec.c: Fix unnecessary overallocation due to incomplete character. When incomplete characters are found at the end of a string the code ran amok and allocated lots of memory. Stricter limits are now in place. 2011-05-20 Reuben Thomas * lib/regex.h: Update documentation. 2011-05-16 Aharon Robbins * lib/regex.h: Update RE_SYNTAX*_AWK constants. 2010-05-05 Andreas Schwab * lib/regexec.c (find_collation_sequence_value): Fix lookup of collation sequence value during regexp matching. 2010-01-22 Ulrich Drepper * lib/regex_internal.c (re_dfa_add_node): Extend overflow detection. 2008-01-16 Ulrich Drepper * lib/regex.h: Cleanup namespace. 2007-11-26 Ulrich Drepper * lib/regex.h (REG_ENOSYS): Define REG_ENOSYS also for __USE_XOPEN2K. 2007-08-26 Ulrich Drepper * lib/regex_internal.h: Prevent some declarations and definitions to be seen when used in tests. 2005-05-06 Ulrich Drepper * lib/regex_internal.h: Include bits/libc-lock.h or define dummy __libc_lock_* macros if not _LIBC. (struct re_dfa_t): Add lock. --- ChangeLog | 98 ++++++++++++++++++++++++++++++ lib/regcomp.c | 87 ++++++++++++++++++++------- lib/regex.c | 10 +++- lib/regex.h | 164 ++++++++++++++++++++++++--------------------------- lib/regex_internal.c | 56 +++++++++--------- lib/regex_internal.h | 35 +++++++---- lib/regexec.c | 47 ++++++++------- 7 files changed, 326 insertions(+), 171 deletions(-) diff --git a/ChangeLog b/ChangeLog index acc2e4ff1..9171da19f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,101 @@ +2012-02-07 Paul Eggert + + regex: merge glibc changes + + * lib/regcomp.c (init_dfa): Tighten overflow checks to test + for IDX_MAX too, since IDX_MAX can be much less than SIZE_MAX. + (init_word_char): Work even if bitset words are not exactly 32 or + 64 bits wide. Don't assume there are no padding bits. + * lib/regex.c [_LIBC]: Do not include . + [!_LIBC]: Add pragmas to ignore -Wsuggest-attributes=pure + and -Wtype-limits. + * lib/regex.h (__USE_GNU): Renamed from __USE_GNU_REGEX, to avoid + needless disagreement with glibc. All uses changed. Define it to + 1 only if _GNU_SOURCE, to match glibc. + (_REG_RM_NAME): Remove; no longer needed, since the names in + question are now all protected by __USE_GNU. + (_REG_RE_NAME): Remove; replaced by glibc's __REPB_PREFIX. + (REG_TRANSLATE_TYPE): Remove; replaced by glibc's __RE_TRANSLATE_TYPE. + * lib/regex_internal.h (MIN): New macro. + + 2012-01-03 Ulrich Drepper + * lib/regcomp.c (init_word_char): Optimize regex a bit. + + 2011-12-30 Jakub Jelinek + * lib/regex_internal.c (re_string_fetch_byte_case): + Fix up regcomp/regexec. The problem is that parse_bracket_symbol + is miscompiled, and it turns out it is because of an incorrect + attribute on re_string_fetch_byte_case. Unlike + re_string_peek_byte_case, this one is really not pure, it modifies + memory (increments pstr->cur_idx), and with the pure attribute GCC + assumed it doesn't and it cached the presumed value of + regexp->cur_idx in a variable across the + for (;; ++i) + { + if (i >= BRACKET_NAME_BUF_SIZE) + return REG_EBRACK; + if (token->type == OP_OPEN_CHAR_CLASS) + ch = re_string_fetch_byte_case (regexp); + else + ch = re_string_fetch_byte (regexp); + if (re_string_eoi(regexp)) + return REG_EBRACK; + if (ch == delim && re_string_peek_byte (regexp, 0) == ']') + break; + elem->opr.name[i] = ch; + } + + 2011-11-29 Andreas Schwab + * lib/regcomp.c (build_equiv_class): + Fix access after end of search string in regex matcher. + + 2011-11-12 Ulrich Drepper + * lib/regex_internal.c, lib/regex_internal.h: Fix warnings in regex. + + 2011-10-12 Ulrich Drepper + * lib/regcomp.c (parse_branch): One more regex memory leak fixed. + + 2011-10-11 Ulrich Drepper + * lib/regcomp.c (parse_branch, parse_sub_exp): + More regex memory leak fixes and tests. + (parse_sub_exp, parse_bracket_exp): + Fix memory leak for some invalid regular expressions. + + 2011-05-28 Ulrich Drepper + * lib/regex_internal.c, lib/regexec.c: + Fix unnecessary overallocation due to incomplete character. When + incomplete characters are found at the end of a string the code + ran amok and allocated lots of memory. Stricter limits are now in + place. + + 2011-05-20 Reuben Thomas + * lib/regex.h: Update documentation. + + 2011-05-16 Aharon Robbins + * lib/regex.h: Update RE_SYNTAX*_AWK constants. + + 2010-05-05 Andreas Schwab + * lib/regexec.c (find_collation_sequence_value): + Fix lookup of collation sequence value during regexp matching. + + 2010-01-22 Ulrich Drepper + * lib/regex_internal.c (re_dfa_add_node): Extend overflow detection. + + 2008-01-16 Ulrich Drepper + * lib/regex.h: Cleanup namespace. + + 2007-11-26 Ulrich Drepper + * lib/regex.h (REG_ENOSYS): Define REG_ENOSYS also for __USE_XOPEN2K. + + 2007-08-26 Ulrich Drepper + * lib/regex_internal.h: Prevent some declarations and definitions + to be seen when used in tests. + + 2005-05-06 Ulrich Drepper + * lib/regex_internal.h: Include bits/libc-lock.h or define dummy + __libc_lock_* macros if not _LIBC. + (struct re_dfa_t): Add lock. + 2012-02-07 Eric Blake maint.mk: also prohibit lower-case @var@ diff --git a/lib/regcomp.c b/lib/regcomp.c index 89940ce11..2f36794ac 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -586,19 +586,23 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +# ifdef __GNUC__ + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX +# else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) +# endif }; #endif @@ -873,7 +877,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) calculation below, and for similar doubling calculations elsewhere. And it's <= rather than <, because some of the doubling calculations add 1 afterwards. */ - if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) return REG_ESPACE; dfa->nodes_alloc = pat_len + 1; @@ -947,9 +951,39 @@ static void internal_function init_word_char (re_dfa_t *dfa) { - int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + int i = 0; + int j; + int ch = 0; + if (BE (dfa->map_notascii == 0, 1)) + { + if (BITSET_WORD_BITS == 64) + { + dfa->word_char[0] = UINT64_C (0x03ff000000000000); + dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe); + i = 2; + } + else if (BITSET_WORD_BITS == 32) + { + dfa->word_char[0] = UINT32_C (0x00000000); + dfa->word_char[1] = UINT32_C (0x03ff0000); + dfa->word_char[2] = UINT32_C (0x87fffffe); + dfa->word_char[3] = UINT32_C (0x07fffffe); + i = 4; + } + else + goto general_case; + ch = 128; + + if (BE (dfa->is_utf8, 1)) + { + memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); + return; + } + } + + general_case: + for (; i < BITSET_WORDS; ++i) for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') dfa->word_char[i] |= (bitset_word_t) 1 << j; @@ -2193,16 +2227,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, expr = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && expr == NULL, 0)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); return NULL; } if (tree != NULL && expr != NULL) { - tree = create_tree (dfa, tree, expr, CONCAT); - if (tree == NULL) + bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); + if (newtree == NULL) { + postorder (expr, free_tree, NULL); + postorder (tree, free_tree, NULL); *err = REG_ESPACE; return NULL; } + tree = newtree; } else if (tree == NULL) tree = expr; @@ -2451,7 +2490,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + *err = REG_EPAREN; + } if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -2749,11 +2792,12 @@ build_range_exp (const reg_syntax_t syntax, static reg_errcode_t internal_function -build_collating_symbol (bitset_t sbcset, # ifdef RE_ENABLE_I18N - re_charset_t *mbcset, Idx *coll_sym_alloc, -# endif - const unsigned char *name) +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name) +# else /* not RE_ENABLE_I18N */ +build_collating_symbol (bitset_t sbcset, const unsigned char *name) +# endif /* not RE_ENABLE_I18N */ { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -3075,6 +3119,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (BE (sbcset == NULL, 0)) #endif /* RE_ENABLE_I18N */ { + re_free (sbcset); +#ifdef RE_ENABLE_I18N + re_free (mbcset); +#endif *err = REG_ESPACE; return NULL; } @@ -3444,19 +3492,18 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp); - if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) + idx1 = findidx (&cp, -1); + if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; /* Build single byte matching table for this equivalence class. */ - char_buf[1] = (unsigned char) '\0'; len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp); + idx2 = findidx (&cp, 1); /* idx2 = table[ch]; */ diff --git a/lib/regex.c b/lib/regex.c index edc85f34b..89ce73e1f 100644 --- a/lib/regex.c +++ b/lib/regex.c @@ -17,7 +17,14 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include +#ifndef _LIBC +# include + +# if (__GNUC__ == 4 && 3 <= __GNUC_MINOR__) || 4 < __GNUC__ +# pragma GCC diagnostic ignored "-Wsuggest-attribute=pure" +# pragma GCC diagnostic ignored "-Wtype-limits" +# endif +#endif /* Make sure no one compiles this code with a C++ compiler. */ #if defined __cplusplus && defined _LIBC @@ -53,7 +60,6 @@ GNU regex allows. Include it before , which correctly #undefs RE_DUP_MAX and sets it to the right value. */ #include -#include #include #include "regex_internal.h" diff --git a/lib/regex.h b/lib/regex.h index d8fbadc28..acfbb0b27 100644 --- a/lib/regex.h +++ b/lib/regex.h @@ -1,6 +1,6 @@ /* Definitions for data structures and routines for the regular expression library. - Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2006, 2009-2012 + Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2012 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -28,13 +28,10 @@ extern "C" { #endif -/* Define __USE_GNU_REGEX to declare GNU extensions that violate the +/* Define __USE_GNU to declare GNU extensions that violate the POSIX name space rules. */ -#undef __USE_GNU_REGEX -#if (defined _GNU_SOURCE \ - || (!defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE \ - && !defined _XOPEN_SOURCE)) -# define __USE_GNU_REGEX 1 +#ifdef _GNU_SOURCE +# define __USE_GNU 1 #endif #ifdef _REGEX_LARGE_OFFSETS @@ -45,16 +42,6 @@ extern "C" { supported within glibc itself, and glibc users should not define _REGEX_LARGE_OFFSETS. */ -/* The type of the offset of a byte within a string. - For historical reasons POSIX 1003.1-2004 requires that regoff_t be - at least as wide as off_t. However, many common POSIX platforms set - regoff_t to the more-sensible ssize_t and the Open Group has - signalled its intention to change the requirement to be that - regoff_t be at least as wide as ptrdiff_t and ssize_t; see XBD ERN - 60 (2005-08-25). We don't know of any hosts where ssize_t or - ptrdiff_t is wider than ssize_t, so ssize_t is safe. */ -typedef ssize_t regoff_t; - /* The type of nonnegative object indexes. Traditionally, GNU regex uses 'int' for these. Code that uses __re_idx_t should work regardless of whether the type is signed. */ @@ -69,10 +56,8 @@ typedef size_t __re_long_size_t; #else -/* Use types that are binary-compatible with the traditional GNU regex - implementation, which mishandles strings longer than INT_MAX. */ - -typedef int regoff_t; +/* The traditional GNU regex implementation mishandles strings longer + than INT_MAX. */ typedef int __re_idx_t; typedef unsigned int __re_size_t; typedef unsigned long int __re_long_size_t; @@ -93,8 +78,7 @@ typedef unsigned long int active_reg_t; add or remove a bit, only one other definition need change. */ typedef unsigned long int reg_syntax_t; -#ifdef __USE_GNU_REGEX - +#ifdef __USE_GNU /* If this bit is not set, then \ inside a bracket expression is literal. If set, then such a \ quotes the following character. */ # define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1) @@ -225,8 +209,7 @@ typedef unsigned long int reg_syntax_t; /* If this bit is set, then no_sub will be set to 1 during re_compile_pattern. */ # define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) - -#endif /* defined __USE_GNU_REGEX */ +#endif /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is @@ -234,7 +217,7 @@ typedef unsigned long int reg_syntax_t; already-compiled regexps. */ extern reg_syntax_t re_syntax_options; -#ifdef __USE_GNU_REGEX +#ifdef __USE_GNU /* Define combinations of the above bits for the standard possibilities. (The [[[ comments delimit what gets put into the Texinfo file, so don't delete them!) */ @@ -246,16 +229,19 @@ extern reg_syntax_t re_syntax_options; | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CHAR_CLASSES \ | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS) # define RE_SYNTAX_GNU_AWK \ - ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \ - & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS \ - | RE_CONTEXT_INVALID_OPS )) + ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ + | RE_INVALID_INTERVAL_ORD) \ + & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \ + | RE_CONTEXT_INVALID_OPS )) # define RE_SYNTAX_POSIX_AWK \ (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ - | RE_INTERVALS | RE_NO_GNU_OPS) + | RE_INTERVALS | RE_NO_GNU_OPS \ + | RE_INVALID_INTERVAL_ORD) # define RE_SYNTAX_GREP \ (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ @@ -306,10 +292,6 @@ extern reg_syntax_t re_syntax_options; | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) /* [[[end syntaxes]]] */ -#endif /* defined __USE_GNU_REGEX */ - -#ifdef __USE_GNU_REGEX - /* Maximum number of duplicates an interval can allow. POSIX-conforming systems might define this in , but we want our value, so remove any previous define. */ @@ -325,8 +307,7 @@ extern reg_syntax_t re_syntax_options; actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains its historical value. */ # define RE_DUP_MAX (0x7fff) - -#endif /* defined __USE_GNU_REGEX */ +#endif /* POSIX 'cflags' bits (i.e., information for 'regcomp'). */ @@ -396,7 +377,7 @@ typedef enum _REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ } reg_errcode_t; -#ifdef _XOPEN_SOURCE +#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K # define REG_ENOSYS _REG_ENOSYS #endif #define REG_NOERROR _REG_NOERROR @@ -417,62 +398,51 @@ typedef enum #define REG_ESIZE _REG_ESIZE #define REG_ERPAREN _REG_ERPAREN -/* struct re_pattern_buffer normally uses member names like 'buffer' - that POSIX does not allow. In POSIX mode these members have names - with leading 're_' (e.g., 're_buffer'). */ -#ifdef __USE_GNU_REGEX -# define _REG_RE_NAME(id) id -# define _REG_RM_NAME(id) id -#else -# define _REG_RE_NAME(id) re_##id -# define _REG_RM_NAME(id) rm_##id +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields 'buffer', 'allocated', 'fastmap', + and 'translate' can be set. After the pattern has been compiled, + the fields 're_nsub', 'not_bol' and 'not_eol' are available. All + other fields are private to the regex routines. */ + +#ifndef RE_TRANSLATE_TYPE +# define __RE_TRANSLATE_TYPE unsigned char * +# ifdef __USE_GNU +# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE +# endif #endif -/* The user can specify the type of the re_translate member by - defining the macro RE_TRANSLATE_TYPE, which defaults to unsigned - char *. This pollutes the POSIX name space, so in POSIX mode just - use unsigned char *. */ -#ifdef __USE_GNU_REGEX -# ifndef RE_TRANSLATE_TYPE -# define RE_TRANSLATE_TYPE unsigned char * -# endif -# define REG_TRANSLATE_TYPE RE_TRANSLATE_TYPE +#ifdef __USE_GNU +# define __REPB_PREFIX(name) name #else -# define REG_TRANSLATE_TYPE unsigned char * +# define __REPB_PREFIX(name) __##name #endif -/* This data structure represents a compiled pattern. Before calling - the pattern compiler, the fields 'buffer', 'allocated', 'fastmap', - 'translate', and 'no_sub' can be set. After the pattern has been - compiled, the 're_nsub' field is available. All other fields are - private to the regex routines. */ - struct re_pattern_buffer { /* Space that holds the compiled pattern. It is declared as 'unsigned char *' because its elements are sometimes used as array indexes. */ - unsigned char *_REG_RE_NAME (buffer); + unsigned char *__REPB_PREFIX(buffer); /* Number of bytes to which 'buffer' points. */ - __re_long_size_t _REG_RE_NAME (allocated); + __re_long_size_t __REPB_PREFIX(allocated); /* Number of bytes actually used in 'buffer'. */ - __re_long_size_t _REG_RE_NAME (used); + __re_long_size_t __REPB_PREFIX(used); /* Syntax setting with which the pattern was compiled. */ - reg_syntax_t _REG_RE_NAME (syntax); + reg_syntax_t __REPB_PREFIX(syntax); /* Pointer to a fastmap, if any, otherwise zero. re_search uses the fastmap, if there is one, to skip over impossible starting points for matches. */ - char *_REG_RE_NAME (fastmap); + char *__REPB_PREFIX(fastmap); /* Either a translate table to apply to all characters before comparing them, or zero for no translation. The translation is applied to a pattern when it is compiled and to a string when it is matched. */ - REG_TRANSLATE_TYPE _REG_RE_NAME (translate); + __RE_TRANSLATE_TYPE __REPB_PREFIX(translate); /* Number of subexpressions found by the compiler. */ size_t re_nsub; @@ -481,57 +451,70 @@ struct re_pattern_buffer Well, in truth it's used only in 're_search_2', to see whether or not we should use the fastmap, so we don't set this absolutely perfectly; see 're_compile_fastmap' (the "duplicate" case). */ - unsigned int _REG_RE_NAME (can_be_null) : 1; + unsigned __REPB_PREFIX(can_be_null) : 1; /* If REGS_UNALLOCATED, allocate space in the 'regs' structure for 'max (RE_NREGS, re_nsub + 1)' groups. If REGS_REALLOCATE, reallocate space if necessary. If REGS_FIXED, use what's there. */ -#ifdef __USE_GNU_REGEX +#ifdef __USE_GNU # define REGS_UNALLOCATED 0 # define REGS_REALLOCATE 1 # define REGS_FIXED 2 #endif - unsigned int _REG_RE_NAME (regs_allocated) : 2; + unsigned __REPB_PREFIX(regs_allocated) : 2; /* Set to zero when 're_compile_pattern' compiles a pattern; set to one by 're_compile_fastmap' if it updates the fastmap. */ - unsigned int _REG_RE_NAME (fastmap_accurate) : 1; + unsigned __REPB_PREFIX(fastmap_accurate) : 1; /* If set, 're_match_2' does not return information about subexpressions. */ - unsigned int _REG_RE_NAME (no_sub) : 1; + unsigned __REPB_PREFIX(no_sub) : 1; /* If set, a beginning-of-line anchor doesn't match at the beginning of the string. */ - unsigned int _REG_RE_NAME (not_bol) : 1; + unsigned __REPB_PREFIX(not_bol) : 1; /* Similarly for an end-of-line anchor. */ - unsigned int _REG_RE_NAME (not_eol) : 1; + unsigned __REPB_PREFIX(not_eol) : 1; /* If true, an anchor at a newline matches. */ - unsigned int _REG_RE_NAME (newline_anchor) : 1; - -/* [[[end pattern_buffer]]] */ + unsigned __REPB_PREFIX(newline_anchor) : 1; }; typedef struct re_pattern_buffer regex_t; +/* Type for byte offsets within the string. POSIX mandates this. */ +#ifdef _REGEX_LARGE_OFFSETS +/* POSIX 1003.1-2008 requires that regoff_t be at least as wide as + ptrdiff_t and ssize_t. We don't know of any hosts where ptrdiff_t + is wider than ssize_t, so ssize_t is safe. */ +typedef ssize_t regoff_t; +#else +/* The traditional GNU regex implementation mishandles strings longer + than INT_MAX. */ +typedef int regoff_t; +#endif + + +#ifdef __USE_GNU /* This is the structure we store register match data in. See regex.texinfo for a full description of what registers match. */ struct re_registers { - __re_size_t _REG_RM_NAME (num_regs); - regoff_t *_REG_RM_NAME (start); - regoff_t *_REG_RM_NAME (end); + __re_size_t num_regs; + regoff_t *start; + regoff_t *end; }; /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer, 're_match_2' returns information about at least this many registers the first time a 'regs' structure is passed. */ -#if !defined RE_NREGS && defined __USE_GNU_REGEX -# define RE_NREGS 30 +# ifndef RE_NREGS +# define RE_NREGS 30 +# endif #endif @@ -546,13 +529,19 @@ typedef struct /* Declarations for routines. */ +#ifdef __USE_GNU /* Sets the current default syntax to SYNTAX, and return the old syntax. You can also simply assign to the 're_syntax_options' variable. */ extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax); /* Compile the regular expression PATTERN, with length LENGTH and syntax given by the global 're_syntax_options', into the buffer - BUFFER. Return NULL if successful, and an error string if not. */ + BUFFER. Return NULL if successful, and an error string if not. + + To free the allocated storage, you must call 'regfree' on BUFFER. + Note that the translate table must either have been initialised by + 'regcomp', with a malloc'ed value, or set to NULL before calling + 'regfree'. */ extern const char *re_compile_pattern (const char *__pattern, size_t __length, struct re_pattern_buffer *__buffer); @@ -609,14 +598,15 @@ extern regoff_t re_match_2 (struct re_pattern_buffer *__buffer, register data. Unless this function is called, the first search or match using - BUFFER will allocate its own register data, without freeing the old - data. */ + BUFFER will allocate its own register data, without + freeing the old data. */ extern void re_set_registers (struct re_pattern_buffer *__buffer, struct re_registers *__regs, __re_size_t __num_regs, regoff_t *__starts, regoff_t *__ends); +#endif /* Use GNU */ -#if defined _REGEX_RE_COMP || defined _LIBC +#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_BSD) # ifndef _CRAY /* 4.2 bsd compatibility. */ extern char *re_comp (const char *); diff --git a/lib/regex_internal.c b/lib/regex_internal.c index 4fb98351b..7697723c4 100644 --- a/lib/regex_internal.c +++ b/lib/regex_internal.c @@ -134,9 +134,9 @@ re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) { wint_t *new_wcs; - /* Avoid overflow. */ - size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); - if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) + /* Avoid overflow in realloc. */ + const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0)) return REG_ESPACE; new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); @@ -236,13 +236,8 @@ build_wcs_buffer (re_string_t *pstr) else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); - if (BE (mbclen == (size_t) -2, 0)) - { - /* The buffer doesn't have enough space, finish to build. */ - pstr->cur_state = prev_st; - break; - } - else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) + if (BE (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) { /* We treat these cases as a singlebyte character. */ mbclen = 1; @@ -251,6 +246,12 @@ build_wcs_buffer (re_string_t *pstr) wc = pstr->trans[wc]; pstr->cur_state = prev_st; } + else if (BE (mbclen == (size_t) -2, 0)) + { + /* The buffer doesn't have enough space, finish to build. */ + pstr->cur_state = prev_st; + break; + } /* Write wide character and padding. */ pstr->wcs[byte_idx++] = wc; @@ -333,9 +334,11 @@ build_wcs_upper_buffer (re_string_t *pstr) for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } - else if (mbclen == (size_t) -1 || mbclen == 0) + else if (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) { - /* It is an invalid character or '\0'. Just use the byte. */ + /* It is an invalid character, an incomplete character + at the end of the string, or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; pstr->mbs[byte_idx] = ch; /* And also cast it to wide char. */ @@ -448,7 +451,8 @@ build_wcs_upper_buffer (re_string_t *pstr) for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } - else if (mbclen == (size_t) -1 || mbclen == 0) + else if (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) { /* It is an invalid character or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; @@ -495,8 +499,7 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) rawbuf_idx < new_raw_idx;) { wchar_t wc2; - Idx remain_len; - remain_len = pstr->len - rawbuf_idx; + Idx remain_len = pstr->len - rawbuf_idx; prev_st = pstr->cur_state; mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, remain_len, &pstr->cur_state); @@ -732,21 +735,21 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) mbstate_t cur_state; wchar_t wc2; Idx mlen = raw + pstr->len - p; + unsigned char buf[6]; size_t mbclen; -#if 0 /* dead code: buf is set but never used */ - unsigned char buf[6]; + const unsigned char *pp = p; if (BE (pstr->trans != NULL, 0)) { int i = mlen < 6 ? mlen : 6; while (--i >= 0) buf[i] = pstr->trans[p[i]]; + pp = buf; } -#endif /* XXX Don't use mbrtowc, we know which conversion to use (UTF-8 -> UCS4). */ memset (&cur_state, 0, sizeof (cur_state)); - mbclen = __mbrtowc (&wc2, (const char *) p, mlen, + mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, &cur_state); if (raw + offset - p <= mbclen && mbclen < (size_t) -2) @@ -868,7 +871,7 @@ re_string_peek_byte_case (const re_string_t *pstr, Idx idx) } static unsigned char -internal_function __attribute ((pure)) +internal_function re_string_fetch_byte_case (re_string_t *pstr) { if (BE (!pstr->mbs_allocated, 1)) @@ -1412,13 +1415,12 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token) Idx *new_nexts, *new_indices; re_node_set *new_edests, *new_eclosures; re_token_t *new_nodes; - size_t max_object_size = - MAX (sizeof (re_token_t), - MAX (sizeof (re_node_set), - sizeof (Idx))); - /* Avoid overflows. */ - if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) + /* Avoid overflows in realloc. */ + const size_t max_object_size = MAX (sizeof (re_token_t), + MAX (sizeof (re_node_set), + sizeof (Idx))); + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0)) return REG_MISSING; new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); @@ -1579,7 +1581,7 @@ register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, { Idx elem = newstate->nodes.elems[i]; if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) - if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) + if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem)) return REG_ESPACE; } diff --git a/lib/regex_internal.h b/lib/regex_internal.h index 5cc685e77..714b54cd6 100644 --- a/lib/regex_internal.h +++ b/lib/regex_internal.h @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -32,13 +31,14 @@ # include "localcharset.h" #endif #include - #include #include +#include #include #if defined _LIBC # include #else +# define __libc_lock_define(CLASS,NAME) # define __libc_lock_init(NAME) do { } while (0) # define __libc_lock_lock(NAME) do { } while (0) # define __libc_lock_unlock(NAME) do { } while (0) @@ -111,8 +111,8 @@ # define __wctype wctype # define __iswctype iswctype # define __btowc btowc -# define __wcrtomb wcrtomb # define __mbrtowc mbrtowc +# define __wcrtomb wcrtomb # define __regfree regfree # define attribute_hidden #endif /* not _LIBC */ @@ -124,6 +124,11 @@ #endif typedef __re_idx_t Idx; +#ifdef _REGEX_LARGE_OFFSETS +# define IDX_MAX (SIZE_MAX - 2) +#else +# define IDX_MAX INT_MAX +#endif /* Special return value for failure to match. */ #define REG_MISSING ((Idx) -1) @@ -418,19 +423,21 @@ typedef struct re_dfa_t re_dfa_t; # define internal_function #endif +#ifndef NOT_IN_libc static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) internal_function; -#ifdef RE_ENABLE_I18N +# ifdef RE_ENABLE_I18N static void build_wcs_buffer (re_string_t *pstr) internal_function; static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr) - internal_function; -#endif /* RE_ENABLE_I18N */ + internal_function; +# endif /* RE_ENABLE_I18N */ static void build_upper_buffer (re_string_t *pstr) internal_function; static void re_string_translate_buffer (re_string_t *pstr) internal_function; static unsigned int re_string_context_at (const re_string_t *input, Idx idx, int eflags) internal_function __attribute ((pure)); +#endif #define re_string_peek_byte(pstr, offset) \ ((pstr)->mbs[(pstr)->cur_idx + offset]) #define re_string_fetch_byte(pstr) \ @@ -468,6 +475,9 @@ static unsigned int re_string_context_at (const re_string_t *input, Idx idx, #ifndef MAX # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (b) : (a)) +#endif #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t))) #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t))) @@ -692,9 +702,7 @@ struct re_dfa_t #ifdef DEBUG char* re_str; #endif -#ifdef _LIBC __libc_lock_define (, lock) -#endif }; #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set)) @@ -818,15 +826,15 @@ re_string_wchar_at (const re_string_t *pstr, Idx idx) return (wint_t) pstr->wcs[idx]; } +# ifndef NOT_IN_libc static int internal_function __attribute ((pure)) re_string_elem_size_at (const re_string_t *pstr, Idx idx) { -# ifdef _LIBC +# ifdef _LIBC const unsigned char *p, *extra; const int32_t *table, *indirect; - int32_t tmp; -# include +# include uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) @@ -837,13 +845,14 @@ re_string_elem_size_at (const re_string_t *pstr, Idx idx) indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); p = pstr->mbs + idx; - tmp = findidx (&p); + findidx (&p, pstr->len - idx); return p - pstr->mbs - idx; } else -# endif /* _LIBC */ +# endif /* _LIBC */ return 1; } +# endif #endif /* RE_ENABLE_I18N */ #ifndef __GNUC_PREREQ diff --git a/lib/regexec.c b/lib/regexec.c index 5f12de91e..a0ea02401 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -51,9 +51,8 @@ static regoff_t re_search_stub (struct re_pattern_buffer *bufp, regoff_t range, Idx stop, struct re_registers *regs, bool ret_len) internal_function; -static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, - Idx nregs, int regs_allocated) - internal_function; +static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, + Idx nregs, int regs_allocated) internal_function; static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) internal_function; static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match, @@ -365,7 +364,6 @@ weak_alias (__re_search_2, re_search_2) #endif static regoff_t -internal_function re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1, Idx length1, const char *string2, Idx length2, @@ -413,7 +411,6 @@ re_search_2_stub (struct re_pattern_buffer *bufp, otherwise the position of the match is returned. */ static regoff_t -internal_function re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length, Idx start, regoff_t range, Idx stop, struct re_registers *regs, @@ -505,8 +502,7 @@ re_search_stub (struct re_pattern_buffer *bufp, return rval; } -static unsigned int -internal_function +static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs, int regs_allocated) { @@ -636,7 +632,7 @@ re_exec (s) (0 <= LAST_START && LAST_START <= LENGTH) */ static reg_errcode_t -internal_function __attribute_warn_unused_result__ +__attribute_warn_unused_result__ re_search_internal (const regex_t *preg, const char *string, Idx length, Idx start, Idx last_start, Idx stop, @@ -719,7 +715,8 @@ re_search_internal (const regex_t *preg, if (nmatch > 1 || dfa->has_mb_node) { /* Avoid overflow. */ - if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) + if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) + <= mctx.input.bufs_len), 0)) { err = REG_ESPACE; goto free_return; @@ -971,7 +968,7 @@ re_search_internal (const regex_t *preg, } static reg_errcode_t -internal_function __attribute_warn_unused_result__ +__attribute_warn_unused_result__ prune_impossible_nodes (re_match_context_t *mctx) { const re_dfa_t *const dfa = mctx->dfa; @@ -987,7 +984,7 @@ prune_impossible_nodes (re_match_context_t *mctx) halt_node = mctx->last_node; /* Avoid overflow. */ - if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0)) return REG_ESPACE; sifted_states = re_malloc (re_dfastate_t *, match_last + 1); @@ -1174,7 +1171,8 @@ check_matching (re_match_context_t *mctx, bool fl_longest_match, re_dfastate_t *old_state = cur_state; Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1; - if (BE (next_char_idx >= mctx->input.bufs_len, 0) + if ((BE (next_char_idx >= mctx->input.bufs_len, 0) + && mctx->input.bufs_len < mctx->input.len) || (BE (next_char_idx >= mctx->input.valid_len, 0) && mctx->input.valid_len < mctx->input.len)) { @@ -1752,7 +1750,8 @@ clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx) { Idx top = mctx->state_log_top; - if (next_state_log_idx >= mctx->input.bufs_len + if ((next_state_log_idx >= mctx->input.bufs_len + && mctx->input.bufs_len < mctx->input.len) || (next_state_log_idx >= mctx->input.valid_len && mctx->input.valid_len < mctx->input.len)) { @@ -2936,9 +2935,12 @@ check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node, { re_dfastate_t **new_array; Idx old_alloc = path->alloc; - Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1; - if (BE (new_alloc < old_alloc, 0) - || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0)) + Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1; + Idx new_alloc; + if (BE (IDX_MAX - old_alloc < incr_alloc, 0)) + return REG_ESPACE; + new_alloc = old_alloc + incr_alloc; + if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0)) return REG_ESPACE; new_array = re_realloc (path->array, re_dfastate_t *, new_alloc); if (BE (new_array == NULL, 0)) @@ -3397,6 +3399,7 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) { if (dests_node_malloced) free (dests_alloc); + /* Return false in case of an error, true otherwise. */ if (ndests == 0) { state->trtable = (re_dfastate_t **) @@ -3896,7 +3899,6 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, const int32_t *table, *indirect; const unsigned char *weights, *extra; const char *collseqwc; - int32_t idx; /* This #include defines a local function! */ # include @@ -3954,7 +3956,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - int32_t idx = findidx (&cp); + int32_t idx = findidx (&cp, elem_len); if (idx > 0) for (i = 0; i < cset->nequiv_classes; ++i) { @@ -4066,7 +4068,7 @@ find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) /* Skip the collation sequence value. */ idx += sizeof (uint32_t); /* Skip the wide char sequence of the collating element. */ - idx = idx + sizeof (uint32_t) * (extra[idx] + 1); + idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1); /* If we found the entry, return the sequence value. */ if (found) return *(uint32_t *) (extra + idx); @@ -4140,11 +4142,12 @@ extend_buffers (re_match_context_t *mctx) re_string_t *pstr = &mctx->input; /* Avoid overflow. */ - if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2 + <= pstr->bufs_len, 0)) return REG_ESPACE; /* Double the lengthes of the buffers. */ - ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); + ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2)); if (BE (ret != REG_NOERROR, 0)) return ret; @@ -4207,7 +4210,7 @@ match_ctx_init (re_match_context_t *mctx, int eflags, Idx n) size_t max_object_size = MAX (sizeof (struct re_backref_cache_entry), sizeof (re_sub_match_top_t *)); - if (BE (SIZE_MAX / max_object_size < n, 0)) + if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0)) return REG_ESPACE; mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); -- 2.11.0