1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
215 re_compile_pattern (pattern, length, bufp)
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 weak_alias (__re_compile_pattern, re_compile_pattern)
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
261 re_set_syntax (syntax)
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
270 weak_alias (__re_set_syntax, re_set_syntax)
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 re_set_fastmap (fastmap, false, buf[0]);
345 else if (type == SIMPLE_BRACKET)
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 /* See if we have to try all bytes which start multiple collation
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match || cset->nranges
388 || cset->nequiv_classes
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *_Restrict_ preg;
476 const char *_Restrict_ pattern;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
529 weak_alias (__regcomp, regcomp)
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode, preg, errbuf, errbuf_size)
539 const regex_t *_Restrict_ preg;
540 char *_Restrict_ errbuf;
542 #else /* size_t might promote */
544 regerror (int errcode, const regex_t *_Restrict_ preg,
545 char *_Restrict_ errbuf, size_t errbuf_size)
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 size_t cpy_size = msg_size;
567 if (BE (msg_size > errbuf_size, 0))
569 cpy_size = errbuf_size - 1;
570 errbuf[cpy_size] = '\0';
572 memcpy (errbuf, msg, cpy_size);
578 weak_alias (__regerror, regerror)
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map =
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608 free_dfa_content (re_dfa_t *dfa)
613 for (i = 0; i < dfa->nodes_len; ++i)
614 free_token (dfa->nodes + i);
615 re_free (dfa->nexts);
616 for (i = 0; i < dfa->nodes_len; ++i)
618 if (dfa->eclosures != NULL)
619 re_node_set_free (dfa->eclosures + i);
620 if (dfa->inveclosures != NULL)
621 re_node_set_free (dfa->inveclosures + i);
622 if (dfa->edests != NULL)
623 re_node_set_free (dfa->edests + i);
625 re_free (dfa->edests);
626 re_free (dfa->eclosures);
627 re_free (dfa->inveclosures);
628 re_free (dfa->nodes);
630 if (dfa->state_table)
631 for (i = 0; i <= dfa->state_hash_mask; ++i)
633 struct re_state_table_entry *entry = dfa->state_table + i;
634 for (j = 0; j < entry->num; ++j)
636 re_dfastate_t *state = entry->array[j];
639 re_free (entry->array);
641 re_free (dfa->state_table);
642 #ifdef RE_ENABLE_I18N
643 if (dfa->sb_char != utf8_sb_map)
644 re_free (dfa->sb_char);
646 re_free (dfa->subexp_map);
648 re_free (dfa->re_str);
655 /* Free dynamically allocated space used by PREG. */
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa);
667 re_free (preg->fastmap);
668 preg->fastmap = NULL;
670 re_free (preg->translate);
671 preg->translate = NULL;
674 weak_alias (__regfree, regfree)
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf;
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
700 if (!re_comp_buf.buffer)
701 return gettext ("No previous regular expression");
705 if (re_comp_buf.buffer)
707 fastmap = re_comp_buf.fastmap;
708 re_comp_buf.fastmap = NULL;
709 __regfree (&re_comp_buf);
710 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711 re_comp_buf.fastmap = fastmap;
714 if (re_comp_buf.fastmap == NULL)
716 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717 if (re_comp_buf.fastmap == NULL)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf.newline_anchor = 1;
728 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738 libc_freeres_fn (free_mem)
740 __regfree (&re_comp_buf);
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_errcode_t err = REG_NOERROR;
758 /* Initialize the pattern buffer. */
759 preg->fastmap_accurate = 0;
760 preg->syntax = syntax;
761 preg->not_bol = preg->not_eol = 0;
764 preg->can_be_null = 0;
765 preg->regs_allocated = REGS_UNALLOCATED;
767 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa;
781 preg->used = sizeof (re_dfa_t);
783 err = init_dfa (dfa, length);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
797 __libc_lock_init (dfa->lock);
799 err = re_string_construct (®exp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0))
803 re_compile_internal_free_return:
804 free_workarea_compile (preg);
805 re_string_destruct (®exp);
806 free_dfa_content (dfa);
812 /* Parse the regular expression, and build a structure tree. */
814 dfa->str_tree = parse (®exp, preg, syntax, &err);
815 if (BE (dfa->str_tree == NULL, 0))
816 goto re_compile_internal_free_return;
818 /* Analyze the tree and create the nfa. */
819 err = analyze (preg);
820 if (BE (err != REG_NOERROR, 0))
821 goto re_compile_internal_free_return;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829 /* Then create the initial state of the dfa. */
830 err = create_initial_state (dfa);
832 /* Release work areas. */
833 free_workarea_compile (preg);
834 re_string_destruct (®exp);
836 if (BE (err != REG_NOERROR, 0))
838 free_dfa_content (dfa);
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
850 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 __re_size_t table_size;
856 #ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
859 size_t max_i18n_object_size = 0;
861 size_t max_object_size =
862 MAX (sizeof (struct re_state_table_entry),
863 MAX (sizeof (re_token_t),
864 MAX (sizeof (re_node_set),
865 MAX (sizeof (regmatch_t),
866 max_i18n_object_size))));
868 memset (dfa, '\0', sizeof (re_dfa_t));
870 /* Force allocation of str_tree_storage the first time. */
871 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
874 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
880 dfa->nodes_alloc = pat_len + 1;
881 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883 /* table_size = 2 ^ ceil(log pat_len) */
884 for (table_size = 1; ; table_size <<= 1)
885 if (table_size > pat_len)
888 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
889 dfa->state_hash_mask = table_size - 1;
891 dfa->mb_cur_max = MB_CUR_MAX;
893 if (dfa->mb_cur_max == 6
894 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
896 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 codeset_name = nl_langinfo (CODESET);
900 if (strcasecmp (codeset_name, "UTF-8") == 0
901 || strcasecmp (codeset_name, "UTF8") == 0)
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
913 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
919 if (BE (dfa->sb_char == NULL, 0))
922 /* Set the bits corresponding to single byte chars. */
923 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
924 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 wint_t wch = __btowc (ch);
928 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930 if (isascii (ch) && wch != ch)
931 dfa->map_notascii = 1;
938 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943 /* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
949 init_word_char (re_dfa_t *dfa)
952 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
954 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= (bitset_word_t) 1 << j;
959 /* Free the work area which are only used while compiling. */
962 free_workarea_compile (regex_t *preg)
964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
965 bin_tree_storage_t *storage, *next;
966 for (storage = dfa->str_tree_storage; storage; storage = next)
968 next = storage->next;
971 dfa->str_tree_storage = NULL;
972 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
973 dfa->str_tree = NULL;
974 re_free (dfa->org_indices);
975 dfa->org_indices = NULL;
978 /* Create initial states for all contexts. */
981 create_initial_state (re_dfa_t *dfa)
985 re_node_set init_nodes;
987 /* Initial states have the epsilon closure of the node which is
988 the first node of the regular expression. */
989 first = dfa->str_tree->first->node_idx;
990 dfa->init_node = first;
991 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
992 if (BE (err != REG_NOERROR, 0))
995 /* The back-references which are in initial states can epsilon transit,
996 since in this case all of the subexpressions can be null.
997 Then we add epsilon closures of the nodes which are the next nodes of
998 the back-references. */
999 if (dfa->nbackref > 0)
1000 for (i = 0; i < init_nodes.nelem; ++i)
1002 Idx node_idx = init_nodes.elems[i];
1003 re_token_type_t type = dfa->nodes[node_idx].type;
1006 if (type != OP_BACK_REF)
1008 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 re_token_t *clexp_node;
1011 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012 if (clexp_node->type == OP_CLOSE_SUBEXP
1013 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 if (clexp_idx == init_nodes.nelem)
1019 if (type == OP_BACK_REF)
1021 Idx dest_idx = dfa->edests[node_idx].elems[0];
1022 if (!re_node_set_contains (&init_nodes, dest_idx))
1024 reg_errcode_t err = re_node_set_merge (&init_nodes,
1027 if (err != REG_NOERROR)
1034 /* It must be the first time to invoke acquire_state. */
1035 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036 /* We don't check ERR here, since the initial state must not be NULL. */
1037 if (BE (dfa->init_state == NULL, 0))
1039 if (dfa->init_state->has_constraint)
1041 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1049 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050 || dfa->init_state_begbuf == NULL, 0))
1054 dfa->init_state_word = dfa->init_state_nl
1055 = dfa->init_state_begbuf = dfa->init_state;
1057 re_node_set_free (&init_nodes);
1061 #ifdef RE_ENABLE_I18N
1062 /* If it is possible to do searching in single byte encoding instead of UTF-8
1063 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1064 DFA nodes where needed. */
1067 optimize_utf8 (re_dfa_t *dfa)
1071 bool mb_chars = false;
1072 bool has_period = false;
1074 for (node = 0; node < dfa->nodes_len; ++node)
1075 switch (dfa->nodes[node].type)
1078 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1082 switch (dfa->nodes[node].opr.ctx_type)
1090 /* Word anchors etc. cannot be handled. It's okay to test
1091 opr.ctx_type since constraints (for all DFA nodes) are
1092 created by ORing one or more opr.ctx_type values. */
1102 case OP_DUP_ASTERISK:
1103 case OP_OPEN_SUBEXP:
1104 case OP_CLOSE_SUBEXP:
1106 case COMPLEX_BRACKET:
1108 case SIMPLE_BRACKET:
1109 /* Just double check. */
1111 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1114 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1126 if (mb_chars || has_period)
1127 for (node = 0; node < dfa->nodes_len; ++node)
1129 if (dfa->nodes[node].type == CHARACTER
1130 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1131 dfa->nodes[node].mb_partial = 0;
1132 else if (dfa->nodes[node].type == OP_PERIOD)
1133 dfa->nodes[node].type = OP_UTF8_PERIOD;
1136 /* The search can be in single byte locale. */
1137 dfa->mb_cur_max = 1;
1139 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144 "eclosure", and "inveclosure". */
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1152 /* Allocate arrays. */
1153 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158 || dfa->eclosures == NULL, 0))
1161 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162 if (dfa->subexp_map != NULL)
1165 for (i = 0; i < preg->re_nsub; i++)
1166 dfa->subexp_map[i] = i;
1167 preorder (dfa->str_tree, optimize_subexps, dfa);
1168 for (i = 0; i < preg->re_nsub; i++)
1169 if (dfa->subexp_map[i] != i)
1171 if (i == preg->re_nsub)
1173 free (dfa->subexp_map);
1174 dfa->subexp_map = NULL;
1178 ret = postorder (dfa->str_tree, lower_subexps, preg);
1179 if (BE (ret != REG_NOERROR, 0))
1181 ret = postorder (dfa->str_tree, calc_first, dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1184 preorder (dfa->str_tree, calc_next, dfa);
1185 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1188 ret = calc_eclosure (dfa);
1189 if (BE (ret != REG_NOERROR, 0))
1192 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1194 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1197 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198 if (BE (dfa->inveclosures == NULL, 0))
1200 ret = calc_inveclosure (dfa);
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207 implement parse tree visits. Instead, we use parent pointers and
1208 some hairy code in these two functions. */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1213 bin_tree_t *node, *prev;
1215 for (node = root; ; )
1217 /* Descend down the tree, preferably to the left (or to the right
1218 if that's the only child). */
1219 while (node->left || node->right)
1227 reg_errcode_t err = fn (extra, node);
1228 if (BE (err != REG_NOERROR, 0))
1230 if (node->parent == NULL)
1233 node = node->parent;
1235 /* Go up while we have a node that is reached from the right. */
1236 while (node->right == prev || node->right == NULL);
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247 for (node = root; ; )
1249 reg_errcode_t err = fn (extra, node);
1250 if (BE (err != REG_NOERROR, 0))
1253 /* Go to the left node, or up and to the right. */
1258 bin_tree_t *prev = NULL;
1259 while (node->right == prev || node->right == NULL)
1262 node = node->parent;
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1273 backreferences as well. Requires a preorder visit. */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1277 re_dfa_t *dfa = (re_dfa_t *) extra;
1279 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281 int idx = node->token.opr.idx;
1282 node->token.opr.idx = dfa->subexp_map[idx];
1283 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1286 else if (node->token.type == SUBEXP
1287 && node->left && node->left->token.type == SUBEXP)
1289 Idx other_idx = node->left->token.opr.idx;
1291 node->left = node->left->left;
1293 node->left->parent = node;
1295 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296 if (other_idx < BITSET_WORD_BITS)
1297 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1308 regex_t *preg = (regex_t *) extra;
1309 reg_errcode_t err = REG_NOERROR;
1311 if (node->left && node->left->token.type == SUBEXP)
1313 node->left = lower_subexp (&err, preg, node->left);
1315 node->left->parent = node;
1317 if (node->right && node->right->token.type == SUBEXP)
1319 node->right = lower_subexp (&err, preg, node->right);
1321 node->right->parent = node;
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331 bin_tree_t *body = node->left;
1332 bin_tree_t *op, *cls, *tree1, *tree;
1335 /* We do not optimize empty subexpressions, because otherwise we may
1336 have bad CONCAT nodes with NULL children. This is obviously not
1337 very common, so we do not lose much. An example that triggers
1338 this case is the sed "script" /\(\)/x. */
1339 && node->left != NULL
1340 && (node->token.opr.idx >= BITSET_WORD_BITS
1341 || !(dfa->used_bkref_map
1342 & ((bitset_word_t) 1 << node->token.opr.idx))))
1345 /* Convert the SUBEXP node to the concatenation of an
1346 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1347 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350 tree = create_tree (dfa, op, tree1, CONCAT);
1351 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1357 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363 nodes. Requires a postorder visit. */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1367 re_dfa_t *dfa = (re_dfa_t *) extra;
1368 if (node->token.type == CONCAT)
1370 node->first = node->left->first;
1371 node->node_idx = node->left->node_idx;
1376 node->node_idx = re_dfa_add_node (dfa, node->token);
1377 if (BE (node->node_idx == REG_MISSING, 0))
1379 if (node->token.type == ANCHOR)
1380 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1385 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1386 static reg_errcode_t
1387 calc_next (void *extra, bin_tree_t *node)
1389 switch (node->token.type)
1391 case OP_DUP_ASTERISK:
1392 node->left->next = node;
1395 node->left->next = node->right->first;
1396 node->right->next = node->next;
1400 node->left->next = node->next;
1402 node->right->next = node->next;
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 re_dfa_t *dfa = (re_dfa_t *) extra;
1413 Idx idx = node->node_idx;
1414 reg_errcode_t err = REG_NOERROR;
1416 switch (node->token.type)
1422 assert (node->next == NULL);
1425 case OP_DUP_ASTERISK:
1429 dfa->has_plural_match = 1;
1430 if (node->left != NULL)
1431 left = node->left->first->node_idx;
1433 left = node->next->node_idx;
1434 if (node->right != NULL)
1435 right = node->right->first->node_idx;
1437 right = node->next->node_idx;
1438 assert (REG_VALID_INDEX (left));
1439 assert (REG_VALID_INDEX (right));
1440 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1445 case OP_OPEN_SUBEXP:
1446 case OP_CLOSE_SUBEXP:
1447 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451 dfa->nexts[idx] = node->next->node_idx;
1452 if (node->token.type == OP_BACK_REF)
1453 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457 assert (!IS_EPSILON_NODE (node->token.type));
1458 dfa->nexts[idx] = node->next->node_idx;
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467 to their own constraint. */
1469 static reg_errcode_t
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472 Idx root_node, unsigned int init_constraint)
1474 Idx org_node, clone_node;
1476 unsigned int constraint = init_constraint;
1477 for (org_node = top_org_node, clone_node = top_clone_node;;)
1479 Idx org_dest, clone_dest;
1480 if (dfa->nodes[org_node].type == OP_BACK_REF)
1482 /* If the back reference epsilon-transit, its destination must
1483 also have the constraint. Then duplicate the epsilon closure
1484 of the destination of the back reference, and store it in
1485 edests of the back reference. */
1486 org_dest = dfa->nexts[org_node];
1487 re_node_set_empty (dfa->edests + clone_node);
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == REG_MISSING, 0))
1491 dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 else if (dfa->edests[org_node].nelem == 0)
1498 /* In case of the node can't epsilon-transit, don't duplicate the
1499 destination and store the original destination as the
1500 destination of the node. */
1501 dfa->nexts[clone_node] = dfa->nexts[org_node];
1504 else if (dfa->edests[org_node].nelem == 1)
1506 /* In case of the node can epsilon-transit, and it has only one
1508 org_dest = dfa->edests[org_node].elems[0];
1509 re_node_set_empty (dfa->edests + clone_node);
1510 /* If the node is root_node itself, it means the epsilon closure
1511 has a loop. Then tie it to the destination of the root_node. */
1512 if (org_node == root_node && clone_node != org_node)
1514 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519 /* In case the node has another constraint, append it. */
1520 constraint |= dfa->nodes[org_node].constraint;
1521 clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 if (BE (clone_dest == REG_MISSING, 0))
1524 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 else /* dfa->edests[org_node].nelem == 2 */
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest = dfa->edests[org_node].elems[0];
1533 re_node_set_empty (dfa->edests + clone_node);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536 if (clone_dest == REG_MISSING)
1538 /* There is no such duplicated node, create a new one. */
1540 clone_dest = duplicate_node (dfa, org_dest, constraint);
1541 if (BE (clone_dest == REG_MISSING, 0))
1543 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547 root_node, constraint);
1548 if (BE (err != REG_NOERROR, 0))
1553 /* There is a duplicated node which satisfies the constraint,
1554 use it to avoid infinite loop. */
1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 org_dest = dfa->edests[org_node].elems[1];
1561 clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 if (BE (clone_dest == REG_MISSING, 0))
1564 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 org_node = org_dest;
1569 clone_node = clone_dest;
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579 unsigned int constraint)
1582 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584 if (org_node == dfa->org_indices[idx]
1585 && constraint == dfa->nodes[idx].constraint)
1586 return idx; /* Found. */
1588 return REG_MISSING; /* Not found. */
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 Return the index of the new node, or REG_MISSING if insufficient storage is
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599 if (BE (dup_idx != REG_MISSING, 1))
1601 dfa->nodes[dup_idx].constraint = constraint;
1602 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603 dfa->nodes[dup_idx].duplicated = 1;
1605 /* Store the index of the original node. */
1606 dfa->org_indices[dup_idx] = org_idx;
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1616 for (idx = 0; idx < dfa->nodes_len; ++idx)
1617 re_node_set_init_empty (dfa->inveclosures + idx);
1619 for (src = 0; src < dfa->nodes_len; ++src)
1621 Idx *elems = dfa->eclosures[src].elems;
1622 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1633 /* Calculate "eclosure" for all the node in DFA. */
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1641 assert (dfa->nodes_len > 0);
1644 /* For each nodes, calculate epsilon closure. */
1645 for (node_idx = 0; ; ++node_idx)
1648 re_node_set eclosure_elem;
1649 if (node_idx == dfa->nodes_len)
1658 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1661 /* If we have already calculated, skip it. */
1662 if (dfa->eclosures[node_idx].nelem != 0)
1664 /* Calculate epsilon closure of `node_idx'. */
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666 if (BE (err != REG_NOERROR, 0))
1669 if (dfa->eclosures[node_idx].nelem == 0)
1672 re_node_set_free (&eclosure_elem);
1678 /* Calculate epsilon closure of NODE. */
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1687 re_node_set eclosure;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = REG_MISSING;
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 re_node_set eclosure_elem;
1714 Idx edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == REG_MISSING)
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727 if (BE (err != REG_NOERROR, 0))
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1741 re_node_set_free (&eclosure_elem);
1745 /* Epsilon closures include itself. */
1746 ok = re_node_set_insert (&eclosure, node);
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1778 if (re_string_eoi (input))
1780 token->type = END_OF_RE;
1784 c = re_string_peek_byte (input, 0);
1787 token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 token->type = BACK_SLASH;
1807 c2 = re_string_peek_byte_case (input, 1);
1809 token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1923 token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1945 token->type = OP_DUP_ASTERISK;
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1972 token->type = OP_OPEN_BRACKET;
1975 token->type = OP_PERIOD;
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2008 /* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 if (re_string_eoi (input))
2018 token->type = END_OF_RE;
2021 c = re_string_peek_byte (input, 0);
2024 #ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 token->type = CHARACTER;
2031 #endif /* RE_ENABLE_I18N */
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 /* In this case, '\' escape a character. */
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2041 token->type = CHARACTER;
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2057 token->type = OP_OPEN_COLL_ELEM;
2060 token->type = OP_OPEN_EQUIV_CLASS;
2063 if (syntax & RE_CHAR_CLASSES)
2065 token->type = OP_OPEN_CHAR_CLASS;
2068 /* else fall through. */
2070 token->type = CHARACTER;
2080 token->type = OP_CHARSET_RANGE;
2083 token->type = OP_CLOSE_BRACKET;
2086 token->type = OP_NON_MATCH_LIST;
2089 token->type = CHARACTER;
2094 /* Functions for parser. */
2096 /* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error is occured, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 root = create_tree (dfa, tree, eor, CONCAT);
2125 if (BE (eor == NULL || root == NULL, 0))
2133 /* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2140 ALT means alternative, which represents the operator `|'. */
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 while (token->type == OP_ALT)
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2174 /* This function build the following tree, from regular expression
2181 CAT means concatenation. */
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 bin_tree_t *tree, *expr;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2201 if (tree != NULL && expr != NULL)
2203 tree = create_tree (dfa, tree, expr, CONCAT);
2210 else if (tree == NULL)
2212 /* Otherwise expr == NULL, we don't need to create new tree. */
2217 /* This function build the following tree, from regular expression a*:
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2229 switch (token->type)
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2281 dfa->has_mb_node = 1;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2290 case OP_DUP_ASTERISK:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 (const unsigned char *) "alnum",
2384 (const unsigned char *) "_",
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391 tree = build_charclass_op (dfa, regexp->trans,
2392 (const unsigned char *) "space",
2393 (const unsigned char *) "",
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2405 /* Must not happen? */
2411 fetch_token (token, regexp, syntax);
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2432 /* This function build the following tree, from regular expression
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 if (BE (*err != REG_NOERROR, 0))
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2471 tree->token.opr.idx = cur_nsub;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483 re_token_t start_token = *token;
2485 if (token->type == OP_OPEN_DUP_NUM)
2488 start = fetch_number (regexp, token, syntax);
2489 if (start == REG_MISSING)
2491 if (token->type == CHARACTER && token->opr.c == ',')
2492 start = 0; /* We treat "{,m}" as "{0,m}". */
2495 *err = REG_BADBR; /* <re>{} is invalid. */
2499 if (BE (start != REG_ERROR, 1))
2501 /* We treat "{n}" as "{n,n}". */
2502 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503 : ((token->type == CHARACTER && token->opr.c == ',')
2504 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2506 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2508 /* Invalid sequence. */
2509 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2511 if (token->type == END_OF_RE)
2519 /* If the syntax bit is set, rollback. */
2520 re_string_set_index (regexp, start_idx);
2521 *token = start_token;
2522 token->type = CHARACTER;
2523 /* mb_partial and word_char bits should be already initialized by
2528 if (BE ((end != REG_MISSING && start > end)
2529 || token->type != OP_CLOSE_DUP_NUM, 0))
2531 /* First number greater than second. */
2538 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2542 fetch_token (token, regexp, syntax);
2544 if (BE (elem == NULL, 0))
2546 if (BE (start == 0 && end == 0, 0))
2548 postorder (elem, free_tree, NULL);
2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2553 if (BE (start > 0, 0))
2556 for (i = 2; i <= start; ++i)
2558 elem = duplicate_tree (elem, dfa);
2559 tree = create_tree (dfa, tree, elem, CONCAT);
2560 if (BE (elem == NULL || tree == NULL, 0))
2561 goto parse_dup_op_espace;
2567 /* Duplicate ELEM before it is marked optional. */
2568 elem = duplicate_tree (elem, dfa);
2574 if (elem->token.type == SUBEXP)
2575 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2577 tree = create_tree (dfa, elem, NULL,
2578 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579 if (BE (tree == NULL, 0))
2580 goto parse_dup_op_espace;
2582 /* This loop is actually executed only when end != REG_MISSING,
2583 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2584 already created the start+1-th copy. */
2585 if ((Idx) -1 < 0 || end != REG_MISSING)
2586 for (i = start + 2; i <= end; ++i)
2588 elem = duplicate_tree (elem, dfa);
2589 tree = create_tree (dfa, tree, elem, CONCAT);
2590 if (BE (elem == NULL || tree == NULL, 0))
2591 goto parse_dup_op_espace;
2593 tree = create_tree (dfa, tree, NULL, OP_ALT);
2594 if (BE (tree == NULL, 0))
2595 goto parse_dup_op_espace;
2599 tree = create_tree (dfa, old_tree, tree, CONCAT);
2603 parse_dup_op_espace:
2608 /* Size of the names for collating symbol/equivalence_class/character_class.
2609 I'm not sure, but maybe enough. */
2610 #define BRACKET_NAME_BUF_SIZE 32
2613 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2614 Build the range expression which starts from START_ELEM, and ends
2615 at END_ELEM. The result are written to MBCSET and SBCSET.
2616 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2617 mbcset->range_ends, is a pointer argument sinse we may
2620 static reg_errcode_t
2622 # ifdef RE_ENABLE_I18N
2623 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2624 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2625 # else /* not RE_ENABLE_I18N */
2626 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2627 bracket_elem_t *end_elem)
2628 # endif /* not RE_ENABLE_I18N */
2630 unsigned int start_ch, end_ch;
2631 /* Equivalence Classes and Character Classes can't be a range start/end. */
2632 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2633 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2637 /* We can handle no multi character collating elements without libc
2639 if (BE ((start_elem->type == COLL_SYM
2640 && strlen ((char *) start_elem->opr.name) > 1)
2641 || (end_elem->type == COLL_SYM
2642 && strlen ((char *) end_elem->opr.name) > 1), 0))
2643 return REG_ECOLLATE;
2645 # ifdef RE_ENABLE_I18N
2650 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2653 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2656 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2659 ? __btowc (start_ch) : start_elem->opr.wch);
2660 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2661 ? __btowc (end_ch) : end_elem->opr.wch);
2662 if (start_wc == WEOF || end_wc == WEOF)
2663 return REG_ECOLLATE;
2664 cmp_buf[0] = start_wc;
2665 cmp_buf[4] = end_wc;
2666 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2669 /* Got valid collation sequence values, add them as a new entry.
2670 However, for !_LIBC we have no collation elements: if the
2671 character set is single byte, the single byte character set
2672 that we build below suffices. parse_bracket_exp passes
2673 no MBCSET if dfa->mb_cur_max == 1. */
2676 /* Check the space of the arrays. */
2677 if (BE (*range_alloc == mbcset->nranges, 0))
2679 /* There is not enough space, need realloc. */
2680 wchar_t *new_array_start, *new_array_end;
2683 /* +1 in case of mbcset->nranges is 0. */
2684 new_nranges = 2 * mbcset->nranges + 1;
2685 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2686 are NULL if *range_alloc == 0. */
2687 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2692 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2695 mbcset->range_starts = new_array_start;
2696 mbcset->range_ends = new_array_end;
2697 *range_alloc = new_nranges;
2700 mbcset->range_starts[mbcset->nranges] = start_wc;
2701 mbcset->range_ends[mbcset->nranges++] = end_wc;
2704 /* Build the table for single byte characters. */
2705 for (wc = 0; wc < SBC_MAX; ++wc)
2708 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2709 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2710 bitset_set (sbcset, wc);
2713 # else /* not RE_ENABLE_I18N */
2716 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722 if (start_ch > end_ch)
2724 /* Build the table for single byte characters. */
2725 for (ch = 0; ch < SBC_MAX; ++ch)
2726 if (start_ch <= ch && ch <= end_ch)
2727 bitset_set (sbcset, ch);
2729 # endif /* not RE_ENABLE_I18N */
2732 #endif /* not _LIBC */
2735 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2736 Build the collating element which is represented by NAME.
2737 The result are written to MBCSET and SBCSET.
2738 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2739 pointer argument since we may update it. */
2741 static reg_errcode_t
2743 build_collating_symbol (bitset_t sbcset,
2744 # ifdef RE_ENABLE_I18N
2745 re_charset_t *mbcset, Idx *coll_sym_alloc,
2747 const unsigned char *name)
2749 size_t name_len = strlen ((const char *) name);
2750 if (BE (name_len != 1, 0))
2751 return REG_ECOLLATE;
2754 bitset_set (sbcset, name[0]);
2758 #endif /* not _LIBC */
2760 /* This function parse bracket expression like "[abc]", "[a-c]",
2764 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2765 reg_syntax_t syntax, reg_errcode_t *err)
2768 const unsigned char *collseqmb;
2769 const char *collseqwc;
2772 const int32_t *symb_table;
2773 const unsigned char *extra;
2775 /* Local function for parse_bracket_exp used in _LIBC environement.
2776 Seek the collating symbol entry correspondings to NAME.
2777 Return the index of the symbol in the SYMB_TABLE. */
2780 __attribute ((always_inline))
2781 seek_collating_symbol_entry (name, name_len)
2782 const unsigned char *name;
2785 int32_t hash = elem_hash ((const char *) name, name_len);
2786 int32_t elem = hash % table_size;
2787 if (symb_table[2 * elem] != 0)
2789 int32_t second = hash % (table_size - 2) + 1;
2793 /* First compare the hashing value. */
2794 if (symb_table[2 * elem] == hash
2795 /* Compare the length of the name. */
2796 && name_len == extra[symb_table[2 * elem + 1]]
2797 /* Compare the name. */
2798 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2801 /* Yep, this is the entry. */
2808 while (symb_table[2 * elem] != 0);
2813 /* Local function for parse_bracket_exp used in _LIBC environment.
2814 Look up the collation sequence value of BR_ELEM.
2815 Return the value if succeeded, UINT_MAX otherwise. */
2817 auto inline unsigned int
2818 __attribute ((always_inline))
2819 lookup_collation_sequence_value (br_elem)
2820 bracket_elem_t *br_elem;
2822 if (br_elem->type == SB_CHAR)
2825 if (MB_CUR_MAX == 1)
2828 return collseqmb[br_elem->opr.ch];
2831 wint_t wc = __btowc (br_elem->opr.ch);
2832 return __collseq_table_lookup (collseqwc, wc);
2835 else if (br_elem->type == MB_CHAR)
2838 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2840 else if (br_elem->type == COLL_SYM)
2842 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2846 elem = seek_collating_symbol_entry (br_elem->opr.name,
2848 if (symb_table[2 * elem] != 0)
2850 /* We found the entry. */
2851 idx = symb_table[2 * elem + 1];
2852 /* Skip the name of collating element name. */
2853 idx += 1 + extra[idx];
2854 /* Skip the byte sequence of the collating element. */
2855 idx += 1 + extra[idx];
2856 /* Adjust for the alignment. */
2857 idx = (idx + 3) & ~3;
2858 /* Skip the multibyte collation sequence value. */
2859 idx += sizeof (unsigned int);
2860 /* Skip the wide char sequence of the collating element. */
2861 idx += sizeof (unsigned int) *
2862 (1 + *(unsigned int *) (extra + idx));
2863 /* Return the collation sequence value. */
2864 return *(unsigned int *) (extra + idx);
2866 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2868 /* No valid character. Match it as a single byte
2870 return collseqmb[br_elem->opr.name[0]];
2873 else if (sym_name_len == 1)
2874 return collseqmb[br_elem->opr.name[0]];
2879 /* Local function for parse_bracket_exp used in _LIBC environement.
2880 Build the range expression which starts from START_ELEM, and ends
2881 at END_ELEM. The result are written to MBCSET and SBCSET.
2882 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2883 mbcset->range_ends, is a pointer argument sinse we may
2886 auto inline reg_errcode_t
2887 __attribute ((always_inline))
2888 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2889 re_charset_t *mbcset;
2892 bracket_elem_t *start_elem, *end_elem;
2895 uint32_t start_collseq;
2896 uint32_t end_collseq;
2898 /* Equivalence Classes and Character Classes can't be a range
2900 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2901 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2905 start_collseq = lookup_collation_sequence_value (start_elem);
2906 end_collseq = lookup_collation_sequence_value (end_elem);
2907 /* Check start/end collation sequence values. */
2908 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2909 return REG_ECOLLATE;
2910 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2913 /* Got valid collation sequence values, add them as a new entry.
2914 However, if we have no collation elements, and the character set
2915 is single byte, the single byte character set that we
2916 build below suffices. */
2917 if (nrules > 0 || dfa->mb_cur_max > 1)
2919 /* Check the space of the arrays. */
2920 if (BE (*range_alloc == mbcset->nranges, 0))
2922 /* There is not enough space, need realloc. */
2923 uint32_t *new_array_start;
2924 uint32_t *new_array_end;
2927 /* +1 in case of mbcset->nranges is 0. */
2928 new_nranges = 2 * mbcset->nranges + 1;
2929 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2931 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2934 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2937 mbcset->range_starts = new_array_start;
2938 mbcset->range_ends = new_array_end;
2939 *range_alloc = new_nranges;
2942 mbcset->range_starts[mbcset->nranges] = start_collseq;
2943 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2946 /* Build the table for single byte characters. */
2947 for (ch = 0; ch < SBC_MAX; ch++)
2949 uint32_t ch_collseq;
2951 if (MB_CUR_MAX == 1)
2954 ch_collseq = collseqmb[ch];
2956 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2957 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2958 bitset_set (sbcset, ch);
2963 /* Local function for parse_bracket_exp used in _LIBC environement.
2964 Build the collating element which is represented by NAME.
2965 The result are written to MBCSET and SBCSET.
2966 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2967 pointer argument sinse we may update it. */
2969 auto inline reg_errcode_t
2970 __attribute ((always_inline))
2971 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2972 re_charset_t *mbcset;
2973 Idx *coll_sym_alloc;
2975 const unsigned char *name;
2978 size_t name_len = strlen ((const char *) name);
2981 elem = seek_collating_symbol_entry (name, name_len);
2982 if (symb_table[2 * elem] != 0)
2984 /* We found the entry. */
2985 idx = symb_table[2 * elem + 1];
2986 /* Skip the name of collating element name. */
2987 idx += 1 + extra[idx];
2989 else if (symb_table[2 * elem] == 0 && name_len == 1)
2991 /* No valid character, treat it as a normal
2993 bitset_set (sbcset, name[0]);
2997 return REG_ECOLLATE;
2999 /* Got valid collation sequence, add it as a new entry. */
3000 /* Check the space of the arrays. */
3001 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3003 /* Not enough, realloc it. */
3004 /* +1 in case of mbcset->ncoll_syms is 0. */
3005 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3006 /* Use realloc since mbcset->coll_syms is NULL
3008 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3009 new_coll_sym_alloc);
3010 if (BE (new_coll_syms == NULL, 0))
3012 mbcset->coll_syms = new_coll_syms;
3013 *coll_sym_alloc = new_coll_sym_alloc;
3015 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3020 if (BE (name_len != 1, 0))
3021 return REG_ECOLLATE;
3024 bitset_set (sbcset, name[0]);
3031 re_token_t br_token;
3032 re_bitset_ptr_t sbcset;
3033 #ifdef RE_ENABLE_I18N
3034 re_charset_t *mbcset;
3035 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3036 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3037 #endif /* not RE_ENABLE_I18N */
3038 bool non_match = false;
3039 bin_tree_t *work_tree;
3041 bool first_round = true;
3043 collseqmb = (const unsigned char *)
3044 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3051 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3054 _NL_COLLATE_SYMB_TABLEMB);
3055 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3056 _NL_COLLATE_SYMB_EXTRAMB);
3059 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3060 #ifdef RE_ENABLE_I18N
3061 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3062 #endif /* RE_ENABLE_I18N */
3063 #ifdef RE_ENABLE_I18N
3064 if (BE (sbcset == NULL || mbcset == NULL, 0))
3066 if (BE (sbcset == NULL, 0))
3067 #endif /* RE_ENABLE_I18N */
3073 token_len = peek_token_bracket (token, regexp, syntax);
3074 if (BE (token->type == END_OF_RE, 0))
3077 goto parse_bracket_exp_free_return;
3079 if (token->type == OP_NON_MATCH_LIST)
3081 #ifdef RE_ENABLE_I18N
3082 mbcset->non_match = 1;
3083 #endif /* not RE_ENABLE_I18N */
3085 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3086 bitset_set (sbcset, '\n');
3087 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3088 token_len = peek_token_bracket (token, regexp, syntax);
3089 if (BE (token->type == END_OF_RE, 0))
3092 goto parse_bracket_exp_free_return;
3096 /* We treat the first ']' as a normal character. */
3097 if (token->type == OP_CLOSE_BRACKET)
3098 token->type = CHARACTER;
3102 bracket_elem_t start_elem, end_elem;
3103 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3104 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3107 bool is_range_exp = false;
3110 start_elem.opr.name = start_name_buf;
3111 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3112 syntax, first_round);
3113 if (BE (ret != REG_NOERROR, 0))
3116 goto parse_bracket_exp_free_return;
3118 first_round = false;
3120 /* Get information about the next token. We need it in any case. */
3121 token_len = peek_token_bracket (token, regexp, syntax);
3123 /* Do not check for ranges if we know they are not allowed. */
3124 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3126 if (BE (token->type == END_OF_RE, 0))
3129 goto parse_bracket_exp_free_return;
3131 if (token->type == OP_CHARSET_RANGE)
3133 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3134 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3135 if (BE (token2.type == END_OF_RE, 0))
3138 goto parse_bracket_exp_free_return;
3140 if (token2.type == OP_CLOSE_BRACKET)
3142 /* We treat the last '-' as a normal character. */
3143 re_string_skip_bytes (regexp, -token_len);
3144 token->type = CHARACTER;
3147 is_range_exp = true;
3151 if (is_range_exp == true)
3153 end_elem.opr.name = end_name_buf;
3154 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3156 if (BE (ret != REG_NOERROR, 0))
3159 goto parse_bracket_exp_free_return;
3162 token_len = peek_token_bracket (token, regexp, syntax);
3165 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3166 &start_elem, &end_elem);
3168 # ifdef RE_ENABLE_I18N
3169 *err = build_range_exp (sbcset,
3170 dfa->mb_cur_max > 1 ? mbcset : NULL,
3171 &range_alloc, &start_elem, &end_elem);
3173 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3175 #endif /* RE_ENABLE_I18N */
3176 if (BE (*err != REG_NOERROR, 0))
3177 goto parse_bracket_exp_free_return;
3181 switch (start_elem.type)
3184 bitset_set (sbcset, start_elem.opr.ch);
3186 #ifdef RE_ENABLE_I18N
3188 /* Check whether the array has enough space. */
3189 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3191 wchar_t *new_mbchars;
3192 /* Not enough, realloc it. */
3193 /* +1 in case of mbcset->nmbchars is 0. */
3194 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3195 /* Use realloc since array is NULL if *alloc == 0. */
3196 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3198 if (BE (new_mbchars == NULL, 0))
3199 goto parse_bracket_exp_espace;
3200 mbcset->mbchars = new_mbchars;
3202 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3204 #endif /* RE_ENABLE_I18N */
3206 *err = build_equiv_class (sbcset,
3207 #ifdef RE_ENABLE_I18N
3208 mbcset, &equiv_class_alloc,
3209 #endif /* RE_ENABLE_I18N */
3210 start_elem.opr.name);
3211 if (BE (*err != REG_NOERROR, 0))
3212 goto parse_bracket_exp_free_return;
3215 *err = build_collating_symbol (sbcset,
3216 #ifdef RE_ENABLE_I18N
3217 mbcset, &coll_sym_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3224 *err = build_charclass (regexp->trans, sbcset,
3225 #ifdef RE_ENABLE_I18N
3226 mbcset, &char_class_alloc,
3227 #endif /* RE_ENABLE_I18N */
3228 start_elem.opr.name, syntax);
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3237 if (BE (token->type == END_OF_RE, 0))
3240 goto parse_bracket_exp_free_return;
3242 if (token->type == OP_CLOSE_BRACKET)
3246 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3248 /* If it is non-matching list. */
3250 bitset_not (sbcset);
3252 #ifdef RE_ENABLE_I18N
3253 /* Ensure only single byte characters are set. */
3254 if (dfa->mb_cur_max > 1)
3255 bitset_mask (sbcset, dfa->sb_char);
3257 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3258 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3259 || mbcset->non_match)))
3261 bin_tree_t *mbc_tree;
3263 /* Build a tree for complex bracket. */
3264 dfa->has_mb_node = 1;
3265 br_token.type = COMPLEX_BRACKET;
3266 br_token.opr.mbcset = mbcset;
3267 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268 if (BE (mbc_tree == NULL, 0))
3269 goto parse_bracket_exp_espace;
3270 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3271 if (sbcset[sbc_idx])
3273 /* If there are no bits set in sbcset, there is no point
3274 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3275 if (sbc_idx < BITSET_WORDS)
3277 /* Build a tree for simple bracket. */
3278 br_token.type = SIMPLE_BRACKET;
3279 br_token.opr.sbcset = sbcset;
3280 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3281 if (BE (work_tree == NULL, 0))
3282 goto parse_bracket_exp_espace;
3284 /* Then join them by ALT node. */
3285 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3286 if (BE (work_tree == NULL, 0))
3287 goto parse_bracket_exp_espace;
3292 work_tree = mbc_tree;
3296 #endif /* not RE_ENABLE_I18N */
3298 #ifdef RE_ENABLE_I18N
3299 free_charset (mbcset);
3301 /* Build a tree for simple bracket. */
3302 br_token.type = SIMPLE_BRACKET;
3303 br_token.opr.sbcset = sbcset;
3304 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305 if (BE (work_tree == NULL, 0))
3306 goto parse_bracket_exp_espace;
3310 parse_bracket_exp_espace:
3312 parse_bracket_exp_free_return:
3314 #ifdef RE_ENABLE_I18N
3315 free_charset (mbcset);
3316 #endif /* RE_ENABLE_I18N */
3320 /* Parse an element in the bracket expression. */
3322 static reg_errcode_t
3323 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3324 re_token_t *token, int token_len, re_dfa_t *dfa,
3325 reg_syntax_t syntax, bool accept_hyphen)
3327 #ifdef RE_ENABLE_I18N
3329 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3330 if (cur_char_size > 1)
3332 elem->type = MB_CHAR;
3333 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3334 re_string_skip_bytes (regexp, cur_char_size);
3337 #endif /* RE_ENABLE_I18N */
3338 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3339 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3340 || token->type == OP_OPEN_EQUIV_CLASS)
3341 return parse_bracket_symbol (elem, regexp, token);
3342 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3344 /* A '-' must only appear as anything but a range indicator before
3345 the closing bracket. Everything else is an error. */
3347 (void) peek_token_bracket (&token2, regexp, syntax);
3348 if (token2.type != OP_CLOSE_BRACKET)
3349 /* The actual error value is not standardized since this whole
3350 case is undefined. But ERANGE makes good sense. */
3353 elem->type = SB_CHAR;
3354 elem->opr.ch = token->opr.c;
3358 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3359 such as [:<character_class>:], [.<collating_element>.], and
3360 [=<equivalent_class>=]. */
3362 static reg_errcode_t
3363 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3366 unsigned char ch, delim = token->opr.c;
3368 if (re_string_eoi(regexp))
3372 if (i >= BRACKET_NAME_BUF_SIZE)
3374 if (token->type == OP_OPEN_CHAR_CLASS)
3375 ch = re_string_fetch_byte_case (regexp);
3377 ch = re_string_fetch_byte (regexp);
3378 if (re_string_eoi(regexp))
3380 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3382 elem->opr.name[i] = ch;
3384 re_string_skip_bytes (regexp, 1);
3385 elem->opr.name[i] = '\0';
3386 switch (token->type)
3388 case OP_OPEN_COLL_ELEM:
3389 elem->type = COLL_SYM;
3391 case OP_OPEN_EQUIV_CLASS:
3392 elem->type = EQUIV_CLASS;
3394 case OP_OPEN_CHAR_CLASS:
3395 elem->type = CHAR_CLASS;
3403 /* Helper function for parse_bracket_exp.
3404 Build the equivalence class which is represented by NAME.
3405 The result are written to MBCSET and SBCSET.
3406 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3407 is a pointer argument sinse we may update it. */
3409 static reg_errcode_t
3410 #ifdef RE_ENABLE_I18N
3411 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3412 Idx *equiv_class_alloc, const unsigned char *name)
3413 #else /* not RE_ENABLE_I18N */
3414 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3415 #endif /* not RE_ENABLE_I18N */
3418 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3421 const int32_t *table, *indirect;
3422 const unsigned char *weights, *extra, *cp;
3423 unsigned char char_buf[2];
3427 /* This #include defines a local function! */
3428 # include <locale/weight.h>
3429 /* Calculate the index for equivalence class. */
3431 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3432 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3433 _NL_COLLATE_WEIGHTMB);
3434 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435 _NL_COLLATE_EXTRAMB);
3436 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_INDIRECTMB);
3438 idx1 = findidx (&cp);
3439 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3440 /* This isn't a valid character. */
3441 return REG_ECOLLATE;
3443 /* Build single byte matcing table for this equivalence class. */
3444 char_buf[1] = (unsigned char) '\0';
3445 len = weights[idx1 & 0xffffff];
3446 for (ch = 0; ch < SBC_MAX; ++ch)
3450 idx2 = findidx (&cp);
3455 /* This isn't a valid character. */
3457 /* Compare only if the length matches and the collation rule
3458 index is the same. */
3459 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3463 while (cnt <= len &&
3464 weights[(idx1 & 0xffffff) + 1 + cnt]
3465 == weights[(idx2 & 0xffffff) + 1 + cnt])
3469 bitset_set (sbcset, ch);
3472 /* Check whether the array has enough space. */
3473 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3475 /* Not enough, realloc it. */
3476 /* +1 in case of mbcset->nequiv_classes is 0. */
3477 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3478 /* Use realloc since the array is NULL if *alloc == 0. */
3479 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3481 new_equiv_class_alloc);
3482 if (BE (new_equiv_classes == NULL, 0))
3484 mbcset->equiv_classes = new_equiv_classes;
3485 *equiv_class_alloc = new_equiv_class_alloc;
3487 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3492 if (BE (strlen ((const char *) name) != 1, 0))
3493 return REG_ECOLLATE;
3494 bitset_set (sbcset, *name);
3499 /* Helper function for parse_bracket_exp.
3500 Build the character class which is represented by NAME.
3501 The result are written to MBCSET and SBCSET.
3502 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3503 is a pointer argument sinse we may update it. */
3505 static reg_errcode_t
3506 #ifdef RE_ENABLE_I18N
3507 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508 re_charset_t *mbcset, Idx *char_class_alloc,
3509 const unsigned char *class_name, reg_syntax_t syntax)
3510 #else /* not RE_ENABLE_I18N */
3511 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512 const unsigned char *class_name, reg_syntax_t syntax)
3513 #endif /* not RE_ENABLE_I18N */
3516 const char *name = (const char *) class_name;
3518 /* In case of REG_ICASE "upper" and "lower" match the both of
3519 upper and lower cases. */
3520 if ((syntax & RE_ICASE)
3521 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3524 #ifdef RE_ENABLE_I18N
3525 /* Check the space of the arrays. */
3526 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nchar_classes is 0. */
3530 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3531 /* Use realloc since array is NULL if *alloc == 0. */
3532 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3533 new_char_class_alloc);
3534 if (BE (new_char_classes == NULL, 0))
3536 mbcset->char_classes = new_char_classes;
3537 *char_class_alloc = new_char_class_alloc;
3539 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3540 #endif /* RE_ENABLE_I18N */
3542 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3544 if (BE (trans != NULL, 0)) \
3546 for (i = 0; i < SBC_MAX; ++i) \
3547 if (ctype_func (i)) \
3548 bitset_set (sbcset, trans[i]); \
3552 for (i = 0; i < SBC_MAX; ++i) \
3553 if (ctype_func (i)) \
3554 bitset_set (sbcset, i); \
3558 if (strcmp (name, "alnum") == 0)
3559 BUILD_CHARCLASS_LOOP (isalnum);
3560 else if (strcmp (name, "cntrl") == 0)
3561 BUILD_CHARCLASS_LOOP (iscntrl);
3562 else if (strcmp (name, "lower") == 0)
3563 BUILD_CHARCLASS_LOOP (islower);
3564 else if (strcmp (name, "space") == 0)
3565 BUILD_CHARCLASS_LOOP (isspace);
3566 else if (strcmp (name, "alpha") == 0)
3567 BUILD_CHARCLASS_LOOP (isalpha);
3568 else if (strcmp (name, "digit") == 0)
3569 BUILD_CHARCLASS_LOOP (isdigit);
3570 else if (strcmp (name, "print") == 0)
3571 BUILD_CHARCLASS_LOOP (isprint);
3572 else if (strcmp (name, "upper") == 0)
3573 BUILD_CHARCLASS_LOOP (isupper);
3574 else if (strcmp (name, "blank") == 0)
3575 BUILD_CHARCLASS_LOOP (isblank);
3576 else if (strcmp (name, "graph") == 0)
3577 BUILD_CHARCLASS_LOOP (isgraph);
3578 else if (strcmp (name, "punct") == 0)
3579 BUILD_CHARCLASS_LOOP (ispunct);
3580 else if (strcmp (name, "xdigit") == 0)
3581 BUILD_CHARCLASS_LOOP (isxdigit);
3589 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3590 const unsigned char *class_name,
3591 const unsigned char *extra, bool non_match,
3594 re_bitset_ptr_t sbcset;
3595 #ifdef RE_ENABLE_I18N
3596 re_charset_t *mbcset;
3598 #endif /* not RE_ENABLE_I18N */
3600 re_token_t br_token;
3603 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3604 #ifdef RE_ENABLE_I18N
3605 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3606 #endif /* RE_ENABLE_I18N */
3608 #ifdef RE_ENABLE_I18N
3609 if (BE (sbcset == NULL || mbcset == NULL, 0))
3610 #else /* not RE_ENABLE_I18N */
3611 if (BE (sbcset == NULL, 0))
3612 #endif /* not RE_ENABLE_I18N */
3620 #ifdef RE_ENABLE_I18N
3621 mbcset->non_match = 1;
3622 #endif /* not RE_ENABLE_I18N */
3625 /* We don't care the syntax in this case. */
3626 ret = build_charclass (trans, sbcset,
3627 #ifdef RE_ENABLE_I18N
3629 #endif /* RE_ENABLE_I18N */
3632 if (BE (ret != REG_NOERROR, 0))
3635 #ifdef RE_ENABLE_I18N
3636 free_charset (mbcset);
3637 #endif /* RE_ENABLE_I18N */
3641 /* \w match '_' also. */
3642 for (; *extra; extra++)
3643 bitset_set (sbcset, *extra);
3645 /* If it is non-matching list. */
3647 bitset_not (sbcset);
3649 #ifdef RE_ENABLE_I18N
3650 /* Ensure only single byte characters are set. */
3651 if (dfa->mb_cur_max > 1)
3652 bitset_mask (sbcset, dfa->sb_char);
3655 /* Build a tree for simple bracket. */
3656 br_token.type = SIMPLE_BRACKET;
3657 br_token.opr.sbcset = sbcset;
3658 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659 if (BE (tree == NULL, 0))
3660 goto build_word_op_espace;
3662 #ifdef RE_ENABLE_I18N
3663 if (dfa->mb_cur_max > 1)
3665 bin_tree_t *mbc_tree;
3666 /* Build a tree for complex bracket. */
3667 br_token.type = COMPLEX_BRACKET;
3668 br_token.opr.mbcset = mbcset;
3669 dfa->has_mb_node = 1;
3670 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671 if (BE (mbc_tree == NULL, 0))
3672 goto build_word_op_espace;
3673 /* Then join them by ALT node. */
3674 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3675 if (BE (mbc_tree != NULL, 1))
3680 free_charset (mbcset);
3683 #else /* not RE_ENABLE_I18N */
3685 #endif /* not RE_ENABLE_I18N */
3687 build_word_op_espace:
3689 #ifdef RE_ENABLE_I18N
3690 free_charset (mbcset);
3691 #endif /* RE_ENABLE_I18N */
3696 /* This is intended for the expressions like "a{1,3}".
3697 Fetch a number from `input', and return the number.
3698 Return REG_MISSING if the number field is empty like "{,1}".
3699 Return REG_ERROR if an error occurred. */
3702 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3704 Idx num = REG_MISSING;
3708 fetch_token (token, input, syntax);
3710 if (BE (token->type == END_OF_RE, 0))
3712 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3714 num = ((token->type != CHARACTER || c < '0' || '9' < c
3715 || num == REG_ERROR)
3717 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3718 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3723 #ifdef RE_ENABLE_I18N
3725 free_charset (re_charset_t *cset)
3727 re_free (cset->mbchars);
3729 re_free (cset->coll_syms);
3730 re_free (cset->equiv_classes);
3731 re_free (cset->range_starts);
3732 re_free (cset->range_ends);
3734 re_free (cset->char_classes);
3737 #endif /* RE_ENABLE_I18N */
3739 /* Functions for binary tree operation. */
3741 /* Create a tree node. */
3744 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3745 re_token_type_t type)
3749 return create_token_tree (dfa, left, right, &t);
3753 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3754 const re_token_t *token)
3757 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3759 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3761 if (storage == NULL)
3763 storage->next = dfa->str_tree_storage;
3764 dfa->str_tree_storage = storage;
3765 dfa->str_tree_storage_idx = 0;
3767 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3769 tree->parent = NULL;
3771 tree->right = right;
3772 tree->token = *token;
3773 tree->token.duplicated = 0;
3774 tree->token.opt_subexp = 0;
3777 tree->node_idx = REG_MISSING;
3780 left->parent = tree;
3782 right->parent = tree;
3786 /* Mark the tree SRC as an optional subexpression.
3787 To be called from preorder or postorder. */
3789 static reg_errcode_t
3790 mark_opt_subexp (void *extra, bin_tree_t *node)
3792 Idx idx = (Idx) (long) extra;
3793 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3794 node->token.opt_subexp = 1;
3799 /* Free the allocated memory inside NODE. */
3802 free_token (re_token_t *node)
3804 #ifdef RE_ENABLE_I18N
3805 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3806 free_charset (node->opr.mbcset);
3808 #endif /* RE_ENABLE_I18N */
3809 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3810 re_free (node->opr.sbcset);
3813 /* Worker function for tree walking. Free the allocated memory inside NODE
3814 and its children. */
3816 static reg_errcode_t
3817 free_tree (void *extra, bin_tree_t *node)
3819 free_token (&node->token);
3824 /* Duplicate the node SRC, and return new node. This is a preorder
3825 visit similar to the one implemented by the generic visitor, but
3826 we need more infrastructure to maintain two parallel trees --- so,
3827 it's easier to duplicate. */
3830 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3832 const bin_tree_t *node;
3833 bin_tree_t *dup_root;
3834 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3836 for (node = root; ; )
3838 /* Create a new tree and link it back to the current parent. */
3839 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3842 (*p_new)->parent = dup_node;
3843 (*p_new)->token.duplicated = 1;
3846 /* Go to the left node, or up and to the right. */
3850 p_new = &dup_node->left;
3854 const bin_tree_t *prev = NULL;
3855 while (node->right == prev || node->right == NULL)
3858 node = node->parent;
3859 dup_node = dup_node->parent;
3864 p_new = &dup_node->right;