/* Extended regular expression matching and search library.
- Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
- Software Foundation, Inc.
+ Copyright (C) 2002-2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation,
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+ with this program; if not, see <http://www.gnu.org/licenses/>. */
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
size_t length, reg_syntax_t syntax);
compiles PATTERN (of length LENGTH) and puts the result in BUFP.
Returns 0 if the pattern was valid, otherwise an error string.
- Assumes the `allocated' (and perhaps `buffer') and `translate' fields
+ Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
are set in BUFP on entry. */
#ifdef _LIBC
weak_alias (__re_compile_pattern, re_compile_pattern)
#endif
-/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
+/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
/* This has no initializer because initialized variables in Emacs
PREG is a regex_t *. We do not expect any fields to be initialized,
since POSIX says we shouldn't. Thus, we set
- `buffer' to the compiled pattern;
- `used' to the length of the compiled pattern;
- `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
+ 'buffer' to the compiled pattern;
+ 'used' to the length of the compiled pattern;
+ 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
REG_EXTENDED bit in CFLAGS is set; otherwise, to
RE_SYNTAX_POSIX_BASIC;
- `newline_anchor' to REG_NEWLINE being set in CFLAGS;
- `fastmap' to an allocated space for the fastmap;
- `fastmap_accurate' to zero;
- `re_nsub' to the number of subexpressions in PATTERN.
+ 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
+ 'fastmap' to an allocated space for the fastmap;
+ 'fastmap_accurate' to zero;
+ 're_nsub' to the number of subexpressions in PATTERN.
PATTERN is the address of the pattern string.
static const bitset_t utf8_sb_map =
{
/* Set the first 128 bits. */
-# if 4 * BITSET_WORD_BITS < ASCII_CHARS
-# error "bitset_word_t is narrower than 32 bits"
-# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
+# ifdef __GNUC__
+ [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
+# else
+# if 4 * BITSET_WORD_BITS < ASCII_CHARS
+# error "bitset_word_t is narrower than 32 bits"
+# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
-# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
+# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX, BITSET_WORD_MAX,
-# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
+# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX,
-# endif
+# endif
(BITSET_WORD_MAX
>> (SBC_MAX % BITSET_WORD_BITS == 0
? 0
: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
+# endif
};
#endif
+ __re_error_msgid_idx[(int) REG_ESPACE]);
}
- /* Since `re_exec' always passes NULL for the `regs' argument, we
+ /* Since 're_exec' always passes NULL for the 'regs' argument, we
don't need to initialize the pattern buffer fields which affect it. */
/* Match anchors at newlines. */
if (!ret)
return NULL;
- /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
+ /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}
calculation below, and for similar doubling calculations
elsewhere. And it's <= rather than <, because some of the
doubling calculations add 1 afterwards. */
- if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
+ if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
return REG_ESPACE;
dfa->nodes_alloc = pat_len + 1;
internal_function
init_word_char (re_dfa_t *dfa)
{
- int i, j, ch;
dfa->word_ops_used = 1;
- for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
+ int i = 0;
+ int j;
+ int ch = 0;
+ if (BE (dfa->map_notascii == 0, 1))
+ {
+ if (BITSET_WORD_BITS == 64)
+ {
+ dfa->word_char[0] = UINT64_C (0x03ff000000000000);
+ dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
+ i = 2;
+ }
+ else if (BITSET_WORD_BITS == 32)
+ {
+ dfa->word_char[0] = UINT32_C (0x00000000);
+ dfa->word_char[1] = UINT32_C (0x03ff0000);
+ dfa->word_char[2] = UINT32_C (0x87fffffe);
+ dfa->word_char[3] = UINT32_C (0x07fffffe);
+ i = 4;
+ }
+ else
+ goto general_case;
+ ch = 128;
+
+ if (BE (dfa->is_utf8, 1))
+ {
+ memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
+ return;
+ }
+ }
+
+ general_case:
+ for (; i < BITSET_WORDS; ++i)
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
if (isalnum (ch) || ch == '_')
dfa->word_char[i] |= (bitset_word_t) 1 << j;
Idx dest_idx = dfa->edests[node_idx].elems[0];
if (!re_node_set_contains (&init_nodes, dest_idx))
{
- reg_errcode_t err = re_node_set_merge (&init_nodes,
- dfa->eclosures
- + dest_idx);
- if (err != REG_NOERROR)
- return err;
+ reg_errcode_t merge_err
+ = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
+ if (merge_err != REG_NOERROR)
+ return merge_err;
i = 0;
}
}
/* If we have already calculated it, skip it. */
if (dfa->eclosures[node_idx].nelem != 0)
continue;
- /* Calculate epsilon closure of `node_idx'. */
+ /* Calculate epsilon closure of 'node_idx'. */
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
if (BE (err != REG_NOERROR, 0))
return err;
{
re_node_set eclosure_elem;
Idx edest = dfa->edests[node].elems[i];
- /* If calculating the epsilon closure of `edest' is in progress,
+ /* If calculating the epsilon closure of 'edest' is in progress,
return intermediate result. */
if (dfa->eclosures[edest].nelem == REG_MISSING)
{
incomplete = true;
continue;
}
- /* If we haven't calculated the epsilon closure of `edest' yet,
+ /* If we haven't calculated the epsilon closure of 'edest' yet,
calculate now. Otherwise use calculated epsilon closure. */
if (dfa->eclosures[edest].nelem == 0)
{
}
else
eclosure_elem = dfa->eclosures[edest];
- /* Merge the epsilon closure of `edest'. */
+ /* Merge the epsilon closure of 'edest'. */
err = re_node_set_merge (&eclosure, &eclosure_elem);
if (BE (err != REG_NOERROR, 0))
return err;
- /* If the epsilon closure of `edest' is incomplete,
+ /* If the epsilon closure of 'edest' is incomplete,
the epsilon closure of this node is also incomplete. */
if (dfa->eclosures[edest].nelem == 0)
{
/* Entry point of the parser.
Parse the regular expression REGEXP and return the structure tree.
- If an error is occured, ERR is set by error code, and return NULL.
+ If an error occurs, ERR is set to the error code and NULL is returned.
This function builds the following tree from regular expression <reg_exp>:
CAT
/ \
/ \
<branch1> <branch2>
- ALT means alternative, which represents the operator `|'. */
+ ALT means alternative, which represents the operator '|'. */
static bin_tree_t *
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
expr = parse_expression (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && expr == NULL, 0))
{
+ if (tree != NULL)
+ postorder (tree, free_tree, NULL);
return NULL;
}
if (tree != NULL && expr != NULL)
{
- tree = create_tree (dfa, tree, expr, CONCAT);
- if (tree == NULL)
+ bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
+ if (newtree == NULL)
{
+ postorder (expr, free_tree, NULL);
+ postorder (tree, free_tree, NULL);
*err = REG_ESPACE;
return NULL;
}
+ tree = newtree;
}
else if (tree == NULL)
tree = expr;
{
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
- *err = REG_EPAREN;
+ {
+ if (tree != NULL)
+ postorder (tree, free_tree, NULL);
+ *err = REG_EPAREN;
+ }
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
if (BE (tree == NULL, 0))
goto parse_dup_op_espace;
+/* From gnulib's "intprops.h":
+ True if the arithmetic type T is signed. */
+#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
+
/* This loop is actually executed only when end != REG_MISSING,
to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
already created the start+1-th copy. */
- if ((Idx) -1 < 0 || end != REG_MISSING)
+ if (TYPE_SIGNED (Idx) || end != REG_MISSING)
for (i = start + 2; i <= end; ++i)
{
elem = duplicate_tree (elem, dfa);
Build the range expression which starts from START_ELEM, and ends
at END_ELEM. The results are written to MBCSET and SBCSET.
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
- mbcset->range_ends, is a pointer argument sinse we may
+ mbcset->range_ends, is a pointer argument since we may
update it. */
static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
- bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+build_range_exp (const reg_syntax_t syntax,
+ bitset_t sbcset,
+ re_charset_t *mbcset,
+ Idx *range_alloc,
+ const bracket_elem_t *start_elem,
+ const bracket_elem_t *end_elem)
# else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
- bracket_elem_t *end_elem)
+build_range_exp (const reg_syntax_t syntax,
+ bitset_t sbcset,
+ const bracket_elem_t *start_elem,
+ const bracket_elem_t *end_elem)
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
return REG_ECOLLATE;
cmp_buf[0] = start_wc;
cmp_buf[4] = end_wc;
- if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+
+ if (BE ((syntax & RE_NO_EMPTY_RANGES)
+ && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
static reg_errcode_t
internal_function
-build_collating_symbol (bitset_t sbcset,
# ifdef RE_ENABLE_I18N
- re_charset_t *mbcset, Idx *coll_sym_alloc,
-# endif
- const unsigned char *name)
+build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
+ Idx *coll_sym_alloc, const unsigned char *name)
+# else /* not RE_ENABLE_I18N */
+build_collating_symbol (bitset_t sbcset, const unsigned char *name)
+# endif /* not RE_ENABLE_I18N */
{
size_t name_len = strlen ((const char *) name);
if (BE (name_len != 1, 0))
const int32_t *symb_table;
const unsigned char *extra;
- /* Local function for parse_bracket_exp used in _LIBC environement.
- Seek the collating symbol entry correspondings to NAME.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
+ Seek the collating symbol entry corresponding to NAME.
Return the index of the symbol in the SYMB_TABLE. */
auto inline int32_t
return UINT_MAX;
}
- /* Local function for parse_bracket_exp used in _LIBC environement.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Build the range expression which starts from START_ELEM, and ends
at END_ELEM. The results are written to MBCSET and SBCSET.
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
- mbcset->range_ends, is a pointer argument sinse we may
+ mbcset->range_ends, is a pointer argument since we may
update it. */
auto inline reg_errcode_t
return REG_NOERROR;
}
- /* Local function for parse_bracket_exp used in _LIBC environement.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Build the collating element which is represented by NAME.
The results are written to MBCSET and SBCSET.
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
- pointer argument sinse we may update it. */
+ pointer argument since we may update it. */
auto inline reg_errcode_t
__attribute ((always_inline))
if (BE (sbcset == NULL, 0))
#endif /* RE_ENABLE_I18N */
{
+ re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+ re_free (mbcset);
+#endif
*err = REG_ESPACE;
return NULL;
}
&start_elem, &end_elem);
#else
# ifdef RE_ENABLE_I18N
- *err = build_range_exp (sbcset,
+ *err = build_range_exp (syntax, sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
&range_alloc, &start_elem, &end_elem);
# else
- *err = build_range_exp (sbcset, &start_elem, &end_elem);
+ *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
# endif
#endif /* RE_ENABLE_I18N */
if (BE (*err != REG_NOERROR, 0))
Build the equivalence class which is represented by NAME.
The results are written to MBCSET and SBCSET.
EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
- is a pointer argument sinse we may update it. */
+ is a pointer argument since we may update it. */
static reg_errcode_t
#ifdef RE_ENABLE_I18N
_NL_COLLATE_EXTRAMB);
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
- idx1 = findidx (&cp);
- if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
+ idx1 = findidx (&cp, -1);
+ if (BE (idx1 == 0 || *cp != '\0', 0))
/* This isn't a valid character. */
return REG_ECOLLATE;
- /* Build single byte matcing table for this equivalence class. */
- char_buf[1] = (unsigned char) '\0';
+ /* Build single byte matching table for this equivalence class. */
len = weights[idx1 & 0xffffff];
for (ch = 0; ch < SBC_MAX; ++ch)
{
char_buf[0] = ch;
cp = char_buf;
- idx2 = findidx (&cp);
+ idx2 = findidx (&cp, 1);
/*
idx2 = table[ch];
*/
Build the character class which is represented by NAME.
The results are written to MBCSET and SBCSET.
CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
- is a pointer argument sinse we may update it. */
+ is a pointer argument since we may update it. */
static reg_errcode_t
#ifdef RE_ENABLE_I18N
}
/* This is intended for expressions like "a{1,3}".
- Fetch a number from `input', and return the number.
+ Fetch a number from 'input', and return the number.
Return REG_MISSING if the number field is empty like "{,1}".
Return REG_ERROR if an error occurred. */