1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 /* See if we have to try all bytes which start multiple collation
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
376 re_set_fastmap (fastmap, icase, i);
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
387 || cset->nequiv_classes
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
528 weak_alias (__regcomp, regcomp)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
536 regerror (errcode, preg, errbuf, errbuf_size)
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
541 #else /* size_t might promote */
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
561 msg_size = strlen (msg) + 1; /* Includes the null. */
563 if (BE (errbuf_size != 0, 1))
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
571 memcpy (errbuf, msg, cpy_size);
577 weak_alias (__regerror, regerror)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
588 /* Set the first 128 bits. */
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
611 free_dfa_content (re_dfa_t *dfa)
616 for (i = 0; i < dfa->nodes_len; ++i)
617 free_token (dfa->nodes + i);
618 re_free (dfa->nexts);
619 for (i = 0; i < dfa->nodes_len; ++i)
621 if (dfa->eclosures != NULL)
622 re_node_set_free (dfa->eclosures + i);
623 if (dfa->inveclosures != NULL)
624 re_node_set_free (dfa->inveclosures + i);
625 if (dfa->edests != NULL)
626 re_node_set_free (dfa->edests + i);
628 re_free (dfa->edests);
629 re_free (dfa->eclosures);
630 re_free (dfa->inveclosures);
631 re_free (dfa->nodes);
633 if (dfa->state_table)
634 for (i = 0; i <= dfa->state_hash_mask; ++i)
636 struct re_state_table_entry *entry = dfa->state_table + i;
637 for (j = 0; j < entry->num; ++j)
639 re_dfastate_t *state = entry->array[j];
642 re_free (entry->array);
644 re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646 if (dfa->sb_char != utf8_sb_map)
647 re_free (dfa->sb_char);
649 re_free (dfa->subexp_map);
651 re_free (dfa->re_str);
658 /* Free dynamically allocated space used by PREG. */
664 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
665 if (BE (dfa != NULL, 1))
666 free_dfa_content (dfa);
670 re_free (preg->fastmap);
671 preg->fastmap = NULL;
673 re_free (preg->translate);
674 preg->translate = NULL;
677 weak_alias (__regfree, regfree)
680 /* Entry points compatible with 4.2 BSD regex library. We don't define
681 them unless specifically requested. */
683 #if defined _REGEX_RE_COMP || defined _LIBC
685 /* BSD has one and only one pattern buffer. */
686 static struct re_pattern_buffer re_comp_buf;
690 /* Make these definitions weak in libc, so POSIX programs can redefine
691 these names if they don't use our functions, and still use
692 regcomp/regexec above without link errors. */
703 if (!re_comp_buf.buffer)
704 return gettext ("No previous regular expression");
708 if (re_comp_buf.buffer)
710 fastmap = re_comp_buf.fastmap;
711 re_comp_buf.fastmap = NULL;
712 __regfree (&re_comp_buf);
713 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
714 re_comp_buf.fastmap = fastmap;
717 if (re_comp_buf.fastmap == NULL)
719 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
720 if (re_comp_buf.fastmap == NULL)
721 return (char *) gettext (__re_error_msgid
722 + __re_error_msgid_idx[(int) REG_ESPACE]);
725 /* Since 're_exec' always passes NULL for the 'regs' argument, we
726 don't need to initialize the pattern buffer fields which affect it. */
728 /* Match anchors at newlines. */
729 re_comp_buf.newline_anchor = 1;
731 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
736 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
737 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741 libc_freeres_fn (free_mem)
743 __regfree (&re_comp_buf);
747 #endif /* _REGEX_RE_COMP */
749 /* Internal entry point.
750 Compile the regular expression PATTERN, whose length is LENGTH.
751 SYNTAX indicate regular expression's syntax. */
754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
757 reg_errcode_t err = REG_NOERROR;
761 /* Initialize the pattern buffer. */
762 preg->fastmap_accurate = 0;
763 preg->syntax = syntax;
764 preg->not_bol = preg->not_eol = 0;
767 preg->can_be_null = 0;
768 preg->regs_allocated = REGS_UNALLOCATED;
770 /* Initialize the dfa. */
771 dfa = (re_dfa_t *) preg->buffer;
772 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
774 /* If zero allocated, but buffer is non-null, try to realloc
775 enough space. This loses if buffer's address is bogus, but
776 that is the user's responsibility. If ->buffer is NULL this
777 is a simple allocation. */
778 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
781 preg->allocated = sizeof (re_dfa_t);
782 preg->buffer = (unsigned char *) dfa;
784 preg->used = sizeof (re_dfa_t);
786 err = init_dfa (dfa, length);
787 if (BE (err != REG_NOERROR, 0))
789 free_dfa_content (dfa);
795 /* Note: length+1 will not overflow since it is checked in init_dfa. */
796 dfa->re_str = re_malloc (char, length + 1);
797 strncpy (dfa->re_str, pattern, length + 1);
800 __libc_lock_init (dfa->lock);
802 err = re_string_construct (®exp, pattern, length, preg->translate,
803 (syntax & RE_ICASE) != 0, dfa);
804 if (BE (err != REG_NOERROR, 0))
806 re_compile_internal_free_return:
807 free_workarea_compile (preg);
808 re_string_destruct (®exp);
809 free_dfa_content (dfa);
815 /* Parse the regular expression, and build a structure tree. */
817 dfa->str_tree = parse (®exp, preg, syntax, &err);
818 if (BE (dfa->str_tree == NULL, 0))
819 goto re_compile_internal_free_return;
821 /* Analyze the tree and create the nfa. */
822 err = analyze (preg);
823 if (BE (err != REG_NOERROR, 0))
824 goto re_compile_internal_free_return;
826 #ifdef RE_ENABLE_I18N
827 /* If possible, do searching in single byte encoding to speed things up. */
828 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
832 /* Then create the initial state of the dfa. */
833 err = create_initial_state (dfa);
835 /* Release work areas. */
836 free_workarea_compile (preg);
837 re_string_destruct (®exp);
839 if (BE (err != REG_NOERROR, 0))
841 free_dfa_content (dfa);
849 /* Initialize DFA. We use the length of the regular expression PAT_LEN
850 as the initial length of some arrays. */
853 init_dfa (re_dfa_t *dfa, size_t pat_len)
855 __re_size_t table_size;
859 #ifdef RE_ENABLE_I18N
860 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
862 size_t max_i18n_object_size = 0;
864 size_t max_object_size =
865 MAX (sizeof (struct re_state_table_entry),
866 MAX (sizeof (re_token_t),
867 MAX (sizeof (re_node_set),
868 MAX (sizeof (regmatch_t),
869 max_i18n_object_size))));
871 memset (dfa, '\0', sizeof (re_dfa_t));
873 /* Force allocation of str_tree_storage the first time. */
874 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
876 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
877 calculation below, and for similar doubling calculations
878 elsewhere. And it's <= rather than <, because some of the
879 doubling calculations add 1 afterwards. */
880 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
883 dfa->nodes_alloc = pat_len + 1;
884 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
886 /* table_size = 2 ^ ceil(log pat_len) */
887 for (table_size = 1; ; table_size <<= 1)
888 if (table_size > pat_len)
891 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
892 dfa->state_hash_mask = table_size - 1;
894 dfa->mb_cur_max = MB_CUR_MAX;
896 if (dfa->mb_cur_max == 6
897 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
899 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
902 codeset_name = nl_langinfo (CODESET);
903 if (strcasecmp (codeset_name, "UTF-8") == 0
904 || strcasecmp (codeset_name, "UTF8") == 0)
907 /* We check exhaustively in the loop below if this charset is a
908 superset of ASCII. */
909 dfa->map_notascii = 0;
912 #ifdef RE_ENABLE_I18N
913 if (dfa->mb_cur_max > 1)
916 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
921 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
922 if (BE (dfa->sb_char == NULL, 0))
925 /* Set the bits corresponding to single byte chars. */
926 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
927 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929 wint_t wch = __btowc (ch);
931 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
933 if (isascii (ch) && wch != ch)
934 dfa->map_notascii = 1;
941 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
946 /* Initialize WORD_CHAR table, which indicate which character is
947 "word". In this case "word" means that it is the word construction
948 character used by some operators like "\<", "\>", etc. */
952 init_word_char (re_dfa_t *dfa)
954 dfa->word_ops_used = 1;
958 if (BE (dfa->map_notascii == 0, 1))
960 if (BITSET_WORD_BITS == 64)
962 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
963 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
966 else if (BITSET_WORD_BITS == 32)
968 dfa->word_char[0] = UINT32_C (0x00000000);
969 dfa->word_char[1] = UINT32_C (0x03ff0000);
970 dfa->word_char[2] = UINT32_C (0x87fffffe);
971 dfa->word_char[3] = UINT32_C (0x07fffffe);
978 if (BE (dfa->is_utf8, 1))
980 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
986 for (; i < BITSET_WORDS; ++i)
987 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
988 if (isalnum (ch) || ch == '_')
989 dfa->word_char[i] |= (bitset_word_t) 1 << j;
992 /* Free the work area which are only used while compiling. */
995 free_workarea_compile (regex_t *preg)
997 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
998 bin_tree_storage_t *storage, *next;
999 for (storage = dfa->str_tree_storage; storage; storage = next)
1001 next = storage->next;
1004 dfa->str_tree_storage = NULL;
1005 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1006 dfa->str_tree = NULL;
1007 re_free (dfa->org_indices);
1008 dfa->org_indices = NULL;
1011 /* Create initial states for all contexts. */
1013 static reg_errcode_t
1014 create_initial_state (re_dfa_t *dfa)
1018 re_node_set init_nodes;
1020 /* Initial states have the epsilon closure of the node which is
1021 the first node of the regular expression. */
1022 first = dfa->str_tree->first->node_idx;
1023 dfa->init_node = first;
1024 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1025 if (BE (err != REG_NOERROR, 0))
1028 /* The back-references which are in initial states can epsilon transit,
1029 since in this case all of the subexpressions can be null.
1030 Then we add epsilon closures of the nodes which are the next nodes of
1031 the back-references. */
1032 if (dfa->nbackref > 0)
1033 for (i = 0; i < init_nodes.nelem; ++i)
1035 Idx node_idx = init_nodes.elems[i];
1036 re_token_type_t type = dfa->nodes[node_idx].type;
1039 if (type != OP_BACK_REF)
1041 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1043 re_token_t *clexp_node;
1044 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1045 if (clexp_node->type == OP_CLOSE_SUBEXP
1046 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1049 if (clexp_idx == init_nodes.nelem)
1052 if (type == OP_BACK_REF)
1054 Idx dest_idx = dfa->edests[node_idx].elems[0];
1055 if (!re_node_set_contains (&init_nodes, dest_idx))
1057 reg_errcode_t merge_err
1058 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1059 if (merge_err != REG_NOERROR)
1066 /* It must be the first time to invoke acquire_state. */
1067 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1068 /* We don't check ERR here, since the initial state must not be NULL. */
1069 if (BE (dfa->init_state == NULL, 0))
1071 if (dfa->init_state->has_constraint)
1073 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1075 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1077 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1081 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1082 || dfa->init_state_begbuf == NULL, 0))
1086 dfa->init_state_word = dfa->init_state_nl
1087 = dfa->init_state_begbuf = dfa->init_state;
1089 re_node_set_free (&init_nodes);
1093 #ifdef RE_ENABLE_I18N
1094 /* If it is possible to do searching in single byte encoding instead of UTF-8
1095 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1096 DFA nodes where needed. */
1099 optimize_utf8 (re_dfa_t *dfa)
1103 bool mb_chars = false;
1104 bool has_period = false;
1106 for (node = 0; node < dfa->nodes_len; ++node)
1107 switch (dfa->nodes[node].type)
1110 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1114 switch (dfa->nodes[node].opr.ctx_type)
1122 /* Word anchors etc. cannot be handled. It's okay to test
1123 opr.ctx_type since constraints (for all DFA nodes) are
1124 created by ORing one or more opr.ctx_type values. */
1134 case OP_DUP_ASTERISK:
1135 case OP_OPEN_SUBEXP:
1136 case OP_CLOSE_SUBEXP:
1138 case COMPLEX_BRACKET:
1140 case SIMPLE_BRACKET:
1141 /* Just double check. */
1143 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1145 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1146 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1148 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1158 if (mb_chars || has_period)
1159 for (node = 0; node < dfa->nodes_len; ++node)
1161 if (dfa->nodes[node].type == CHARACTER
1162 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1163 dfa->nodes[node].mb_partial = 0;
1164 else if (dfa->nodes[node].type == OP_PERIOD)
1165 dfa->nodes[node].type = OP_UTF8_PERIOD;
1168 /* The search can be in single byte locale. */
1169 dfa->mb_cur_max = 1;
1171 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1175 /* Analyze the structure tree, and calculate "first", "next", "edest",
1176 "eclosure", and "inveclosure". */
1178 static reg_errcode_t
1179 analyze (regex_t *preg)
1181 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1184 /* Allocate arrays. */
1185 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1186 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1187 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1188 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1189 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1190 || dfa->eclosures == NULL, 0))
1193 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1194 if (dfa->subexp_map != NULL)
1197 for (i = 0; i < preg->re_nsub; i++)
1198 dfa->subexp_map[i] = i;
1199 preorder (dfa->str_tree, optimize_subexps, dfa);
1200 for (i = 0; i < preg->re_nsub; i++)
1201 if (dfa->subexp_map[i] != i)
1203 if (i == preg->re_nsub)
1205 free (dfa->subexp_map);
1206 dfa->subexp_map = NULL;
1210 ret = postorder (dfa->str_tree, lower_subexps, preg);
1211 if (BE (ret != REG_NOERROR, 0))
1213 ret = postorder (dfa->str_tree, calc_first, dfa);
1214 if (BE (ret != REG_NOERROR, 0))
1216 preorder (dfa->str_tree, calc_next, dfa);
1217 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1218 if (BE (ret != REG_NOERROR, 0))
1220 ret = calc_eclosure (dfa);
1221 if (BE (ret != REG_NOERROR, 0))
1224 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1225 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1226 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1229 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1230 if (BE (dfa->inveclosures == NULL, 0))
1232 ret = calc_inveclosure (dfa);
1238 /* Our parse trees are very unbalanced, so we cannot use a stack to
1239 implement parse tree visits. Instead, we use parent pointers and
1240 some hairy code in these two functions. */
1241 static reg_errcode_t
1242 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1245 bin_tree_t *node, *prev;
1247 for (node = root; ; )
1249 /* Descend down the tree, preferably to the left (or to the right
1250 if that's the only child). */
1251 while (node->left || node->right)
1259 reg_errcode_t err = fn (extra, node);
1260 if (BE (err != REG_NOERROR, 0))
1262 if (node->parent == NULL)
1265 node = node->parent;
1267 /* Go up while we have a node that is reached from the right. */
1268 while (node->right == prev || node->right == NULL);
1273 static reg_errcode_t
1274 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1279 for (node = root; ; )
1281 reg_errcode_t err = fn (extra, node);
1282 if (BE (err != REG_NOERROR, 0))
1285 /* Go to the left node, or up and to the right. */
1290 bin_tree_t *prev = NULL;
1291 while (node->right == prev || node->right == NULL)
1294 node = node->parent;
1303 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1304 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1305 backreferences as well. Requires a preorder visit. */
1306 static reg_errcode_t
1307 optimize_subexps (void *extra, bin_tree_t *node)
1309 re_dfa_t *dfa = (re_dfa_t *) extra;
1311 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1313 int idx = node->token.opr.idx;
1314 node->token.opr.idx = dfa->subexp_map[idx];
1315 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1318 else if (node->token.type == SUBEXP
1319 && node->left && node->left->token.type == SUBEXP)
1321 Idx other_idx = node->left->token.opr.idx;
1323 node->left = node->left->left;
1325 node->left->parent = node;
1327 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1328 if (other_idx < BITSET_WORD_BITS)
1329 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1335 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1336 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1337 static reg_errcode_t
1338 lower_subexps (void *extra, bin_tree_t *node)
1340 regex_t *preg = (regex_t *) extra;
1341 reg_errcode_t err = REG_NOERROR;
1343 if (node->left && node->left->token.type == SUBEXP)
1345 node->left = lower_subexp (&err, preg, node->left);
1347 node->left->parent = node;
1349 if (node->right && node->right->token.type == SUBEXP)
1351 node->right = lower_subexp (&err, preg, node->right);
1353 node->right->parent = node;
1360 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1362 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1363 bin_tree_t *body = node->left;
1364 bin_tree_t *op, *cls, *tree1, *tree;
1367 /* We do not optimize empty subexpressions, because otherwise we may
1368 have bad CONCAT nodes with NULL children. This is obviously not
1369 very common, so we do not lose much. An example that triggers
1370 this case is the sed "script" /\(\)/x. */
1371 && node->left != NULL
1372 && (node->token.opr.idx >= BITSET_WORD_BITS
1373 || !(dfa->used_bkref_map
1374 & ((bitset_word_t) 1 << node->token.opr.idx))))
1377 /* Convert the SUBEXP node to the concatenation of an
1378 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1379 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1380 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1381 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1382 tree = create_tree (dfa, op, tree1, CONCAT);
1383 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1389 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1390 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1394 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1395 nodes. Requires a postorder visit. */
1396 static reg_errcode_t
1397 calc_first (void *extra, bin_tree_t *node)
1399 re_dfa_t *dfa = (re_dfa_t *) extra;
1400 if (node->token.type == CONCAT)
1402 node->first = node->left->first;
1403 node->node_idx = node->left->node_idx;
1408 node->node_idx = re_dfa_add_node (dfa, node->token);
1409 if (BE (node->node_idx == REG_MISSING, 0))
1411 if (node->token.type == ANCHOR)
1412 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1417 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1418 static reg_errcode_t
1419 calc_next (void *extra, bin_tree_t *node)
1421 switch (node->token.type)
1423 case OP_DUP_ASTERISK:
1424 node->left->next = node;
1427 node->left->next = node->right->first;
1428 node->right->next = node->next;
1432 node->left->next = node->next;
1434 node->right->next = node->next;
1440 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1441 static reg_errcode_t
1442 link_nfa_nodes (void *extra, bin_tree_t *node)
1444 re_dfa_t *dfa = (re_dfa_t *) extra;
1445 Idx idx = node->node_idx;
1446 reg_errcode_t err = REG_NOERROR;
1448 switch (node->token.type)
1454 assert (node->next == NULL);
1457 case OP_DUP_ASTERISK:
1461 dfa->has_plural_match = 1;
1462 if (node->left != NULL)
1463 left = node->left->first->node_idx;
1465 left = node->next->node_idx;
1466 if (node->right != NULL)
1467 right = node->right->first->node_idx;
1469 right = node->next->node_idx;
1470 assert (REG_VALID_INDEX (left));
1471 assert (REG_VALID_INDEX (right));
1472 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1477 case OP_OPEN_SUBEXP:
1478 case OP_CLOSE_SUBEXP:
1479 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1483 dfa->nexts[idx] = node->next->node_idx;
1484 if (node->token.type == OP_BACK_REF)
1485 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1489 assert (!IS_EPSILON_NODE (node->token.type));
1490 dfa->nexts[idx] = node->next->node_idx;
1497 /* Duplicate the epsilon closure of the node ROOT_NODE.
1498 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1499 to their own constraint. */
1501 static reg_errcode_t
1503 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1504 Idx root_node, unsigned int init_constraint)
1506 Idx org_node, clone_node;
1508 unsigned int constraint = init_constraint;
1509 for (org_node = top_org_node, clone_node = top_clone_node;;)
1511 Idx org_dest, clone_dest;
1512 if (dfa->nodes[org_node].type == OP_BACK_REF)
1514 /* If the back reference epsilon-transit, its destination must
1515 also have the constraint. Then duplicate the epsilon closure
1516 of the destination of the back reference, and store it in
1517 edests of the back reference. */
1518 org_dest = dfa->nexts[org_node];
1519 re_node_set_empty (dfa->edests + clone_node);
1520 clone_dest = duplicate_node (dfa, org_dest, constraint);
1521 if (BE (clone_dest == REG_MISSING, 0))
1523 dfa->nexts[clone_node] = dfa->nexts[org_node];
1524 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 else if (dfa->edests[org_node].nelem == 0)
1530 /* In case of the node can't epsilon-transit, don't duplicate the
1531 destination and store the original destination as the
1532 destination of the node. */
1533 dfa->nexts[clone_node] = dfa->nexts[org_node];
1536 else if (dfa->edests[org_node].nelem == 1)
1538 /* In case of the node can epsilon-transit, and it has only one
1540 org_dest = dfa->edests[org_node].elems[0];
1541 re_node_set_empty (dfa->edests + clone_node);
1542 /* If the node is root_node itself, it means the epsilon closure
1543 has a loop. Then tie it to the destination of the root_node. */
1544 if (org_node == root_node && clone_node != org_node)
1546 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1551 /* In case the node has another constraint, append it. */
1552 constraint |= dfa->nodes[org_node].constraint;
1553 clone_dest = duplicate_node (dfa, org_dest, constraint);
1554 if (BE (clone_dest == REG_MISSING, 0))
1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 else /* dfa->edests[org_node].nelem == 2 */
1562 /* In case of the node can epsilon-transit, and it has two
1563 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1564 org_dest = dfa->edests[org_node].elems[0];
1565 re_node_set_empty (dfa->edests + clone_node);
1566 /* Search for a duplicated node which satisfies the constraint. */
1567 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1568 if (clone_dest == REG_MISSING)
1570 /* There is no such duplicated node, create a new one. */
1572 clone_dest = duplicate_node (dfa, org_dest, constraint);
1573 if (BE (clone_dest == REG_MISSING, 0))
1575 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1578 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1579 root_node, constraint);
1580 if (BE (err != REG_NOERROR, 0))
1585 /* There is a duplicated node which satisfies the constraint,
1586 use it to avoid infinite loop. */
1587 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1592 org_dest = dfa->edests[org_node].elems[1];
1593 clone_dest = duplicate_node (dfa, org_dest, constraint);
1594 if (BE (clone_dest == REG_MISSING, 0))
1596 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1600 org_node = org_dest;
1601 clone_node = clone_dest;
1606 /* Search for a node which is duplicated from the node ORG_NODE, and
1607 satisfies the constraint CONSTRAINT. */
1610 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1611 unsigned int constraint)
1614 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1616 if (org_node == dfa->org_indices[idx]
1617 && constraint == dfa->nodes[idx].constraint)
1618 return idx; /* Found. */
1620 return REG_MISSING; /* Not found. */
1623 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1624 Return the index of the new node, or REG_MISSING if insufficient storage is
1628 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1630 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1631 if (BE (dup_idx != REG_MISSING, 1))
1633 dfa->nodes[dup_idx].constraint = constraint;
1634 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1635 dfa->nodes[dup_idx].duplicated = 1;
1637 /* Store the index of the original node. */
1638 dfa->org_indices[dup_idx] = org_idx;
1643 static reg_errcode_t
1644 calc_inveclosure (re_dfa_t *dfa)
1648 for (idx = 0; idx < dfa->nodes_len; ++idx)
1649 re_node_set_init_empty (dfa->inveclosures + idx);
1651 for (src = 0; src < dfa->nodes_len; ++src)
1653 Idx *elems = dfa->eclosures[src].elems;
1654 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1656 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1665 /* Calculate "eclosure" for all the node in DFA. */
1667 static reg_errcode_t
1668 calc_eclosure (re_dfa_t *dfa)
1673 assert (dfa->nodes_len > 0);
1676 /* For each nodes, calculate epsilon closure. */
1677 for (node_idx = 0; ; ++node_idx)
1680 re_node_set eclosure_elem;
1681 if (node_idx == dfa->nodes_len)
1690 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1693 /* If we have already calculated, skip it. */
1694 if (dfa->eclosures[node_idx].nelem != 0)
1696 /* Calculate epsilon closure of 'node_idx'. */
1697 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1698 if (BE (err != REG_NOERROR, 0))
1701 if (dfa->eclosures[node_idx].nelem == 0)
1704 re_node_set_free (&eclosure_elem);
1710 /* Calculate epsilon closure of NODE. */
1712 static reg_errcode_t
1713 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1717 re_node_set eclosure;
1719 bool incomplete = false;
1720 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1721 if (BE (err != REG_NOERROR, 0))
1724 /* This indicates that we are calculating this node now.
1725 We reference this value to avoid infinite loop. */
1726 dfa->eclosures[node].nelem = REG_MISSING;
1728 /* If the current node has constraints, duplicate all nodes
1729 since they must inherit the constraints. */
1730 if (dfa->nodes[node].constraint
1731 && dfa->edests[node].nelem
1732 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1734 err = duplicate_node_closure (dfa, node, node, node,
1735 dfa->nodes[node].constraint);
1736 if (BE (err != REG_NOERROR, 0))
1740 /* Expand each epsilon destination nodes. */
1741 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1742 for (i = 0; i < dfa->edests[node].nelem; ++i)
1744 re_node_set eclosure_elem;
1745 Idx edest = dfa->edests[node].elems[i];
1746 /* If calculating the epsilon closure of 'edest' is in progress,
1747 return intermediate result. */
1748 if (dfa->eclosures[edest].nelem == REG_MISSING)
1753 /* If we haven't calculated the epsilon closure of 'edest' yet,
1754 calculate now. Otherwise use calculated epsilon closure. */
1755 if (dfa->eclosures[edest].nelem == 0)
1757 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1758 if (BE (err != REG_NOERROR, 0))
1762 eclosure_elem = dfa->eclosures[edest];
1763 /* Merge the epsilon closure of 'edest'. */
1764 err = re_node_set_merge (&eclosure, &eclosure_elem);
1765 if (BE (err != REG_NOERROR, 0))
1767 /* If the epsilon closure of 'edest' is incomplete,
1768 the epsilon closure of this node is also incomplete. */
1769 if (dfa->eclosures[edest].nelem == 0)
1772 re_node_set_free (&eclosure_elem);
1776 /* An epsilon closure includes itself. */
1777 ok = re_node_set_insert (&eclosure, node);
1780 if (incomplete && !root)
1781 dfa->eclosures[node].nelem = 0;
1783 dfa->eclosures[node] = eclosure;
1784 *new_set = eclosure;
1788 /* Functions for token which are used in the parser. */
1790 /* Fetch a token from INPUT.
1791 We must not use this function inside bracket expressions. */
1795 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1797 re_string_skip_bytes (input, peek_token (result, input, syntax));
1800 /* Peek a token from INPUT, and return the length of the token.
1801 We must not use this function inside bracket expressions. */
1805 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1809 if (re_string_eoi (input))
1811 token->type = END_OF_RE;
1815 c = re_string_peek_byte (input, 0);
1818 token->word_char = 0;
1819 #ifdef RE_ENABLE_I18N
1820 token->mb_partial = 0;
1821 if (input->mb_cur_max > 1 &&
1822 !re_string_first_byte (input, re_string_cur_idx (input)))
1824 token->type = CHARACTER;
1825 token->mb_partial = 1;
1832 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1834 token->type = BACK_SLASH;
1838 c2 = re_string_peek_byte_case (input, 1);
1840 token->type = CHARACTER;
1841 #ifdef RE_ENABLE_I18N
1842 if (input->mb_cur_max > 1)
1844 wint_t wc = re_string_wchar_at (input,
1845 re_string_cur_idx (input) + 1);
1846 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1850 token->word_char = IS_WORD_CHAR (c2) != 0;
1855 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1856 token->type = OP_ALT;
1858 case '1': case '2': case '3': case '4': case '5':
1859 case '6': case '7': case '8': case '9':
1860 if (!(syntax & RE_NO_BK_REFS))
1862 token->type = OP_BACK_REF;
1863 token->opr.idx = c2 - '1';
1867 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = ANCHOR;
1870 token->opr.ctx_type = WORD_FIRST;
1874 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = WORD_LAST;
1881 if (!(syntax & RE_NO_GNU_OPS))
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = WORD_DELIM;
1888 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = NOT_WORD_DELIM;
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = OP_WORD;
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = OP_NOTWORD;
1903 if (!(syntax & RE_NO_GNU_OPS))
1904 token->type = OP_SPACE;
1907 if (!(syntax & RE_NO_GNU_OPS))
1908 token->type = OP_NOTSPACE;
1911 if (!(syntax & RE_NO_GNU_OPS))
1913 token->type = ANCHOR;
1914 token->opr.ctx_type = BUF_FIRST;
1918 if (!(syntax & RE_NO_GNU_OPS))
1920 token->type = ANCHOR;
1921 token->opr.ctx_type = BUF_LAST;
1925 if (!(syntax & RE_NO_BK_PARENS))
1926 token->type = OP_OPEN_SUBEXP;
1929 if (!(syntax & RE_NO_BK_PARENS))
1930 token->type = OP_CLOSE_SUBEXP;
1933 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1934 token->type = OP_DUP_PLUS;
1937 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1938 token->type = OP_DUP_QUESTION;
1941 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1942 token->type = OP_OPEN_DUP_NUM;
1945 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1946 token->type = OP_CLOSE_DUP_NUM;
1954 token->type = CHARACTER;
1955 #ifdef RE_ENABLE_I18N
1956 if (input->mb_cur_max > 1)
1958 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1959 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1963 token->word_char = IS_WORD_CHAR (token->opr.c);
1968 if (syntax & RE_NEWLINE_ALT)
1969 token->type = OP_ALT;
1972 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1973 token->type = OP_ALT;
1976 token->type = OP_DUP_ASTERISK;
1979 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1980 token->type = OP_DUP_PLUS;
1983 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1984 token->type = OP_DUP_QUESTION;
1987 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1988 token->type = OP_OPEN_DUP_NUM;
1991 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1992 token->type = OP_CLOSE_DUP_NUM;
1995 if (syntax & RE_NO_BK_PARENS)
1996 token->type = OP_OPEN_SUBEXP;
1999 if (syntax & RE_NO_BK_PARENS)
2000 token->type = OP_CLOSE_SUBEXP;
2003 token->type = OP_OPEN_BRACKET;
2006 token->type = OP_PERIOD;
2009 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2010 re_string_cur_idx (input) != 0)
2012 char prev = re_string_peek_byte (input, -1);
2013 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2016 token->type = ANCHOR;
2017 token->opr.ctx_type = LINE_FIRST;
2020 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2021 re_string_cur_idx (input) + 1 != re_string_length (input))
2024 re_string_skip_bytes (input, 1);
2025 peek_token (&next, input, syntax);
2026 re_string_skip_bytes (input, -1);
2027 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2030 token->type = ANCHOR;
2031 token->opr.ctx_type = LINE_LAST;
2039 /* Peek a token from INPUT, and return the length of the token.
2040 We must not use this function out of bracket expressions. */
2044 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2047 if (re_string_eoi (input))
2049 token->type = END_OF_RE;
2052 c = re_string_peek_byte (input, 0);
2055 #ifdef RE_ENABLE_I18N
2056 if (input->mb_cur_max > 1 &&
2057 !re_string_first_byte (input, re_string_cur_idx (input)))
2059 token->type = CHARACTER;
2062 #endif /* RE_ENABLE_I18N */
2064 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2065 && re_string_cur_idx (input) + 1 < re_string_length (input))
2067 /* In this case, '\' escape a character. */
2069 re_string_skip_bytes (input, 1);
2070 c2 = re_string_peek_byte (input, 0);
2072 token->type = CHARACTER;
2075 if (c == '[') /* '[' is a special char in a bracket exps. */
2079 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2080 c2 = re_string_peek_byte (input, 1);
2088 token->type = OP_OPEN_COLL_ELEM;
2091 token->type = OP_OPEN_EQUIV_CLASS;
2094 if (syntax & RE_CHAR_CLASSES)
2096 token->type = OP_OPEN_CHAR_CLASS;
2099 /* else fall through. */
2101 token->type = CHARACTER;
2111 token->type = OP_CHARSET_RANGE;
2114 token->type = OP_CLOSE_BRACKET;
2117 token->type = OP_NON_MATCH_LIST;
2120 token->type = CHARACTER;
2125 /* Functions for parser. */
2127 /* Entry point of the parser.
2128 Parse the regular expression REGEXP and return the structure tree.
2129 If an error occurs, ERR is set by error code, and return NULL.
2130 This function build the following tree, from regular expression <reg_exp>:
2136 CAT means concatenation.
2137 EOR means end of regular expression. */
2140 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144 bin_tree_t *tree, *eor, *root;
2145 re_token_t current_token;
2146 dfa->syntax = syntax;
2147 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2148 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2151 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2153 root = create_tree (dfa, tree, eor, CONCAT);
2156 if (BE (eor == NULL || root == NULL, 0))
2164 /* This function build the following tree, from regular expression
2165 <branch1>|<branch2>:
2171 ALT means alternative, which represents the operator '|'. */
2174 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2178 bin_tree_t *tree, *branch = NULL;
2179 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2180 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183 while (token->type == OP_ALT)
2185 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2186 if (token->type != OP_ALT && token->type != END_OF_RE
2187 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2189 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2195 tree = create_tree (dfa, tree, branch, OP_ALT);
2196 if (BE (tree == NULL, 0))
2205 /* This function build the following tree, from regular expression
2212 CAT means concatenation. */
2215 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2216 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 bin_tree_t *tree, *expr;
2219 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2220 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2221 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2224 while (token->type != OP_ALT && token->type != END_OF_RE
2225 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2228 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2231 postorder (tree, free_tree, NULL);
2234 if (tree != NULL && expr != NULL)
2236 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2237 if (newtree == NULL)
2239 postorder (expr, free_tree, NULL);
2240 postorder (tree, free_tree, NULL);
2246 else if (tree == NULL)
2248 /* Otherwise expr == NULL, we don't need to create new tree. */
2253 /* This function build the following tree, from regular expression a*:
2260 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2261 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2265 switch (token->type)
2268 tree = create_token_tree (dfa, NULL, NULL, token);
2269 if (BE (tree == NULL, 0))
2274 #ifdef RE_ENABLE_I18N
2275 if (dfa->mb_cur_max > 1)
2277 while (!re_string_eoi (regexp)
2278 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280 bin_tree_t *mbc_remain;
2281 fetch_token (token, regexp, syntax);
2282 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2283 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2284 if (BE (mbc_remain == NULL || tree == NULL, 0))
2293 case OP_OPEN_SUBEXP:
2294 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2295 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298 case OP_OPEN_BRACKET:
2299 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2300 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309 dfa->used_bkref_map |= 1 << token->opr.idx;
2310 tree = create_token_tree (dfa, NULL, NULL, token);
2311 if (BE (tree == NULL, 0))
2317 dfa->has_mb_node = 1;
2319 case OP_OPEN_DUP_NUM:
2320 if (syntax & RE_CONTEXT_INVALID_DUP)
2326 case OP_DUP_ASTERISK:
2328 case OP_DUP_QUESTION:
2329 if (syntax & RE_CONTEXT_INVALID_OPS)
2334 else if (syntax & RE_CONTEXT_INDEP_OPS)
2336 fetch_token (token, regexp, syntax);
2337 return parse_expression (regexp, preg, token, syntax, nest, err);
2339 /* else fall through */
2340 case OP_CLOSE_SUBEXP:
2341 if ((token->type == OP_CLOSE_SUBEXP) &&
2342 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2347 /* else fall through */
2348 case OP_CLOSE_DUP_NUM:
2349 /* We treat it as a normal character. */
2351 /* Then we can these characters as normal characters. */
2352 token->type = CHARACTER;
2353 /* mb_partial and word_char bits should be initialized already
2355 tree = create_token_tree (dfa, NULL, NULL, token);
2356 if (BE (tree == NULL, 0))
2363 if ((token->opr.ctx_type
2364 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2365 && dfa->word_ops_used == 0)
2366 init_word_char (dfa);
2367 if (token->opr.ctx_type == WORD_DELIM
2368 || token->opr.ctx_type == NOT_WORD_DELIM)
2370 bin_tree_t *tree_first, *tree_last;
2371 if (token->opr.ctx_type == WORD_DELIM)
2373 token->opr.ctx_type = WORD_FIRST;
2374 tree_first = create_token_tree (dfa, NULL, NULL, token);
2375 token->opr.ctx_type = WORD_LAST;
2379 token->opr.ctx_type = INSIDE_WORD;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = INSIDE_NOTWORD;
2383 tree_last = create_token_tree (dfa, NULL, NULL, token);
2384 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2385 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2393 tree = create_token_tree (dfa, NULL, NULL, token);
2394 if (BE (tree == NULL, 0))
2400 /* We must return here, since ANCHORs can't be followed
2401 by repetition operators.
2402 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2403 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2404 fetch_token (token, regexp, syntax);
2407 tree = create_token_tree (dfa, NULL, NULL, token);
2408 if (BE (tree == NULL, 0))
2413 if (dfa->mb_cur_max > 1)
2414 dfa->has_mb_node = 1;
2418 tree = build_charclass_op (dfa, regexp->trans,
2419 (const unsigned char *) "alnum",
2420 (const unsigned char *) "_",
2421 token->type == OP_NOTWORD, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427 tree = build_charclass_op (dfa, regexp->trans,
2428 (const unsigned char *) "space",
2429 (const unsigned char *) "",
2430 token->type == OP_NOTSPACE, err);
2431 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441 /* Must not happen? */
2447 fetch_token (token, regexp, syntax);
2449 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2450 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2452 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2453 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2455 /* In BRE consecutive duplications are not allowed. */
2456 if ((syntax & RE_CONTEXT_INVALID_DUP)
2457 && (token->type == OP_DUP_ASTERISK
2458 || token->type == OP_OPEN_DUP_NUM))
2468 /* This function build the following tree, from regular expression
2476 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2477 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2479 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2482 cur_nsub = preg->re_nsub++;
2484 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2486 /* The subexpression may be a null string. */
2487 if (token->type == OP_CLOSE_SUBEXP)
2491 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2495 postorder (tree, free_tree, NULL);
2498 if (BE (*err != REG_NOERROR, 0))
2502 if (cur_nsub <= '9' - '1')
2503 dfa->completed_bkref_map |= 1 << cur_nsub;
2505 tree = create_tree (dfa, tree, NULL, SUBEXP);
2506 if (BE (tree == NULL, 0))
2511 tree->token.opr.idx = cur_nsub;
2515 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2518 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2519 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2521 bin_tree_t *tree = NULL, *old_tree = NULL;
2522 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2523 re_token_t start_token = *token;
2525 if (token->type == OP_OPEN_DUP_NUM)
2528 start = fetch_number (regexp, token, syntax);
2529 if (start == REG_MISSING)
2531 if (token->type == CHARACTER && token->opr.c == ',')
2532 start = 0; /* We treat "{,m}" as "{0,m}". */
2535 *err = REG_BADBR; /* <re>{} is invalid. */
2539 if (BE (start != REG_ERROR, 1))
2541 /* We treat "{n}" as "{n,n}". */
2542 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2543 : ((token->type == CHARACTER && token->opr.c == ',')
2544 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2546 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2548 /* Invalid sequence. */
2549 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2551 if (token->type == END_OF_RE)
2559 /* If the syntax bit is set, rollback. */
2560 re_string_set_index (regexp, start_idx);
2561 *token = start_token;
2562 token->type = CHARACTER;
2563 /* mb_partial and word_char bits should be already initialized by
2568 if (BE ((end != REG_MISSING && start > end)
2569 || token->type != OP_CLOSE_DUP_NUM, 0))
2571 /* First number greater than second. */
2578 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2579 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2582 fetch_token (token, regexp, syntax);
2584 if (BE (elem == NULL, 0))
2586 if (BE (start == 0 && end == 0, 0))
2588 postorder (elem, free_tree, NULL);
2592 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2593 if (BE (start > 0, 0))
2596 for (i = 2; i <= start; ++i)
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2607 /* Duplicate ELEM before it is marked optional. */
2608 elem = duplicate_tree (elem, dfa);
2614 if (elem->token.type == SUBEXP)
2615 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2617 tree = create_tree (dfa, elem, NULL,
2618 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2619 if (BE (tree == NULL, 0))
2620 goto parse_dup_op_espace;
2622 /* From gnulib's "intprops.h":
2623 True if the arithmetic type T is signed. */
2624 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2626 /* This loop is actually executed only when end != REG_MISSING,
2627 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2628 already created the start+1-th copy. */
2629 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2630 for (i = start + 2; i <= end; ++i)
2632 elem = duplicate_tree (elem, dfa);
2633 tree = create_tree (dfa, tree, elem, CONCAT);
2634 if (BE (elem == NULL || tree == NULL, 0))
2635 goto parse_dup_op_espace;
2637 tree = create_tree (dfa, tree, NULL, OP_ALT);
2638 if (BE (tree == NULL, 0))
2639 goto parse_dup_op_espace;
2643 tree = create_tree (dfa, old_tree, tree, CONCAT);
2647 parse_dup_op_espace:
2652 /* Size of the names for collating symbol/equivalence_class/character_class.
2653 I'm not sure, but maybe enough. */
2654 #define BRACKET_NAME_BUF_SIZE 32
2657 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2658 Build the range expression which starts from START_ELEM, and ends
2659 at END_ELEM. The result are written to MBCSET and SBCSET.
2660 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2661 mbcset->range_ends, is a pointer argument since we may
2664 static reg_errcode_t
2666 # ifdef RE_ENABLE_I18N
2667 build_range_exp (const reg_syntax_t syntax,
2669 re_charset_t *mbcset,
2671 const bracket_elem_t *start_elem,
2672 const bracket_elem_t *end_elem)
2673 # else /* not RE_ENABLE_I18N */
2674 build_range_exp (const reg_syntax_t syntax,
2676 const bracket_elem_t *start_elem,
2677 const bracket_elem_t *end_elem)
2678 # endif /* not RE_ENABLE_I18N */
2680 unsigned int start_ch, end_ch;
2681 /* Equivalence Classes and Character Classes can't be a range start/end. */
2682 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2683 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2687 /* We can handle no multi character collating elements without libc
2689 if (BE ((start_elem->type == COLL_SYM
2690 && strlen ((char *) start_elem->opr.name) > 1)
2691 || (end_elem->type == COLL_SYM
2692 && strlen ((char *) end_elem->opr.name) > 1), 0))
2693 return REG_ECOLLATE;
2695 # ifdef RE_ENABLE_I18N
2700 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2702 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2703 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2705 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2706 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2708 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2709 ? __btowc (start_ch) : start_elem->opr.wch);
2710 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2711 ? __btowc (end_ch) : end_elem->opr.wch);
2712 if (start_wc == WEOF || end_wc == WEOF)
2713 return REG_ECOLLATE;
2714 cmp_buf[0] = start_wc;
2715 cmp_buf[4] = end_wc;
2717 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2718 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2721 /* Got valid collation sequence values, add them as a new entry.
2722 However, for !_LIBC we have no collation elements: if the
2723 character set is single byte, the single byte character set
2724 that we build below suffices. parse_bracket_exp passes
2725 no MBCSET if dfa->mb_cur_max == 1. */
2728 /* Check the space of the arrays. */
2729 if (BE (*range_alloc == mbcset->nranges, 0))
2731 /* There is not enough space, need realloc. */
2732 wchar_t *new_array_start, *new_array_end;
2735 /* +1 in case of mbcset->nranges is 0. */
2736 new_nranges = 2 * mbcset->nranges + 1;
2737 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2738 are NULL if *range_alloc == 0. */
2739 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2741 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2744 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2747 mbcset->range_starts = new_array_start;
2748 mbcset->range_ends = new_array_end;
2749 *range_alloc = new_nranges;
2752 mbcset->range_starts[mbcset->nranges] = start_wc;
2753 mbcset->range_ends[mbcset->nranges++] = end_wc;
2756 /* Build the table for single byte characters. */
2757 for (wc = 0; wc < SBC_MAX; ++wc)
2760 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2761 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2762 bitset_set (sbcset, wc);
2765 # else /* not RE_ENABLE_I18N */
2768 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2769 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2771 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2772 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2774 if (start_ch > end_ch)
2776 /* Build the table for single byte characters. */
2777 for (ch = 0; ch < SBC_MAX; ++ch)
2778 if (start_ch <= ch && ch <= end_ch)
2779 bitset_set (sbcset, ch);
2781 # endif /* not RE_ENABLE_I18N */
2784 #endif /* not _LIBC */
2787 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2788 Build the collating element which is represented by NAME.
2789 The result are written to MBCSET and SBCSET.
2790 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2791 pointer argument since we may update it. */
2793 static reg_errcode_t
2795 # ifdef RE_ENABLE_I18N
2796 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2797 Idx *coll_sym_alloc, const unsigned char *name)
2798 # else /* not RE_ENABLE_I18N */
2799 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2800 # endif /* not RE_ENABLE_I18N */
2802 size_t name_len = strlen ((const char *) name);
2803 if (BE (name_len != 1, 0))
2804 return REG_ECOLLATE;
2807 bitset_set (sbcset, name[0]);
2811 #endif /* not _LIBC */
2813 /* This function parse bracket expression like "[abc]", "[a-c]",
2817 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2818 reg_syntax_t syntax, reg_errcode_t *err)
2821 const unsigned char *collseqmb;
2822 const char *collseqwc;
2825 const int32_t *symb_table;
2826 const unsigned char *extra;
2828 /* Local function for parse_bracket_exp used in _LIBC environment.
2829 Seek the collating symbol entry corresponding to NAME.
2830 Return the index of the symbol in the SYMB_TABLE. */
2833 __attribute ((always_inline))
2834 seek_collating_symbol_entry (name, name_len)
2835 const unsigned char *name;
2838 int32_t hash = elem_hash ((const char *) name, name_len);
2839 int32_t elem = hash % table_size;
2840 if (symb_table[2 * elem] != 0)
2842 int32_t second = hash % (table_size - 2) + 1;
2846 /* First compare the hashing value. */
2847 if (symb_table[2 * elem] == hash
2848 /* Compare the length of the name. */
2849 && name_len == extra[symb_table[2 * elem + 1]]
2850 /* Compare the name. */
2851 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2854 /* Yep, this is the entry. */
2861 while (symb_table[2 * elem] != 0);
2866 /* Local function for parse_bracket_exp used in _LIBC environment.
2867 Look up the collation sequence value of BR_ELEM.
2868 Return the value if succeeded, UINT_MAX otherwise. */
2870 auto inline unsigned int
2871 __attribute ((always_inline))
2872 lookup_collation_sequence_value (br_elem)
2873 bracket_elem_t *br_elem;
2875 if (br_elem->type == SB_CHAR)
2878 if (MB_CUR_MAX == 1)
2881 return collseqmb[br_elem->opr.ch];
2884 wint_t wc = __btowc (br_elem->opr.ch);
2885 return __collseq_table_lookup (collseqwc, wc);
2888 else if (br_elem->type == MB_CHAR)
2891 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893 else if (br_elem->type == COLL_SYM)
2895 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2899 elem = seek_collating_symbol_entry (br_elem->opr.name,
2901 if (symb_table[2 * elem] != 0)
2903 /* We found the entry. */
2904 idx = symb_table[2 * elem + 1];
2905 /* Skip the name of collating element name. */
2906 idx += 1 + extra[idx];
2907 /* Skip the byte sequence of the collating element. */
2908 idx += 1 + extra[idx];
2909 /* Adjust for the alignment. */
2910 idx = (idx + 3) & ~3;
2911 /* Skip the multibyte collation sequence value. */
2912 idx += sizeof (unsigned int);
2913 /* Skip the wide char sequence of the collating element. */
2914 idx += sizeof (unsigned int) *
2915 (1 + *(unsigned int *) (extra + idx));
2916 /* Return the collation sequence value. */
2917 return *(unsigned int *) (extra + idx);
2919 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2921 /* No valid character. Match it as a single byte
2923 return collseqmb[br_elem->opr.name[0]];
2926 else if (sym_name_len == 1)
2927 return collseqmb[br_elem->opr.name[0]];
2932 /* Local function for parse_bracket_exp used in _LIBC environment.
2933 Build the range expression which starts from START_ELEM, and ends
2934 at END_ELEM. The result are written to MBCSET and SBCSET.
2935 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2936 mbcset->range_ends, is a pointer argument since we may
2939 auto inline reg_errcode_t
2940 __attribute ((always_inline))
2941 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2942 re_charset_t *mbcset;
2945 bracket_elem_t *start_elem, *end_elem;
2948 uint32_t start_collseq;
2949 uint32_t end_collseq;
2951 /* Equivalence Classes and Character Classes can't be a range
2953 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2954 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2958 start_collseq = lookup_collation_sequence_value (start_elem);
2959 end_collseq = lookup_collation_sequence_value (end_elem);
2960 /* Check start/end collation sequence values. */
2961 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2962 return REG_ECOLLATE;
2963 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2966 /* Got valid collation sequence values, add them as a new entry.
2967 However, if we have no collation elements, and the character set
2968 is single byte, the single byte character set that we
2969 build below suffices. */
2970 if (nrules > 0 || dfa->mb_cur_max > 1)
2972 /* Check the space of the arrays. */
2973 if (BE (*range_alloc == mbcset->nranges, 0))
2975 /* There is not enough space, need realloc. */
2976 uint32_t *new_array_start;
2977 uint32_t *new_array_end;
2980 /* +1 in case of mbcset->nranges is 0. */
2981 new_nranges = 2 * mbcset->nranges + 1;
2982 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2984 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2987 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2990 mbcset->range_starts = new_array_start;
2991 mbcset->range_ends = new_array_end;
2992 *range_alloc = new_nranges;
2995 mbcset->range_starts[mbcset->nranges] = start_collseq;
2996 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2999 /* Build the table for single byte characters. */
3000 for (ch = 0; ch < SBC_MAX; ch++)
3002 uint32_t ch_collseq;
3004 if (MB_CUR_MAX == 1)
3007 ch_collseq = collseqmb[ch];
3009 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3010 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3011 bitset_set (sbcset, ch);
3016 /* Local function for parse_bracket_exp used in _LIBC environment.
3017 Build the collating element which is represented by NAME.
3018 The result are written to MBCSET and SBCSET.
3019 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3020 pointer argument since we may update it. */
3022 auto inline reg_errcode_t
3023 __attribute ((always_inline))
3024 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3025 re_charset_t *mbcset;
3026 Idx *coll_sym_alloc;
3028 const unsigned char *name;
3031 size_t name_len = strlen ((const char *) name);
3034 elem = seek_collating_symbol_entry (name, name_len);
3035 if (symb_table[2 * elem] != 0)
3037 /* We found the entry. */
3038 idx = symb_table[2 * elem + 1];
3039 /* Skip the name of collating element name. */
3040 idx += 1 + extra[idx];
3042 else if (symb_table[2 * elem] == 0 && name_len == 1)
3044 /* No valid character, treat it as a normal
3046 bitset_set (sbcset, name[0]);
3050 return REG_ECOLLATE;
3052 /* Got valid collation sequence, add it as a new entry. */
3053 /* Check the space of the arrays. */
3054 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3056 /* Not enough, realloc it. */
3057 /* +1 in case of mbcset->ncoll_syms is 0. */
3058 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3059 /* Use realloc since mbcset->coll_syms is NULL
3061 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3062 new_coll_sym_alloc);
3063 if (BE (new_coll_syms == NULL, 0))
3065 mbcset->coll_syms = new_coll_syms;
3066 *coll_sym_alloc = new_coll_sym_alloc;
3068 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3073 if (BE (name_len != 1, 0))
3074 return REG_ECOLLATE;
3077 bitset_set (sbcset, name[0]);
3084 re_token_t br_token;
3085 re_bitset_ptr_t sbcset;
3086 #ifdef RE_ENABLE_I18N
3087 re_charset_t *mbcset;
3088 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3089 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3090 #endif /* not RE_ENABLE_I18N */
3091 bool non_match = false;
3092 bin_tree_t *work_tree;
3094 bool first_round = true;
3096 collseqmb = (const unsigned char *)
3097 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3098 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3104 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3105 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3106 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3107 _NL_COLLATE_SYMB_TABLEMB);
3108 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3109 _NL_COLLATE_SYMB_EXTRAMB);
3112 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3113 #ifdef RE_ENABLE_I18N
3114 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3115 #endif /* RE_ENABLE_I18N */
3116 #ifdef RE_ENABLE_I18N
3117 if (BE (sbcset == NULL || mbcset == NULL, 0))
3119 if (BE (sbcset == NULL, 0))
3120 #endif /* RE_ENABLE_I18N */
3123 #ifdef RE_ENABLE_I18N
3130 token_len = peek_token_bracket (token, regexp, syntax);
3131 if (BE (token->type == END_OF_RE, 0))
3134 goto parse_bracket_exp_free_return;
3136 if (token->type == OP_NON_MATCH_LIST)
3138 #ifdef RE_ENABLE_I18N
3139 mbcset->non_match = 1;
3140 #endif /* not RE_ENABLE_I18N */
3142 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3143 bitset_set (sbcset, '\n');
3144 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3145 token_len = peek_token_bracket (token, regexp, syntax);
3146 if (BE (token->type == END_OF_RE, 0))
3149 goto parse_bracket_exp_free_return;
3153 /* We treat the first ']' as a normal character. */
3154 if (token->type == OP_CLOSE_BRACKET)
3155 token->type = CHARACTER;
3159 bracket_elem_t start_elem, end_elem;
3160 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3161 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3164 bool is_range_exp = false;
3167 start_elem.opr.name = start_name_buf;
3168 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3169 syntax, first_round);
3170 if (BE (ret != REG_NOERROR, 0))
3173 goto parse_bracket_exp_free_return;
3175 first_round = false;
3177 /* Get information about the next token. We need it in any case. */
3178 token_len = peek_token_bracket (token, regexp, syntax);
3180 /* Do not check for ranges if we know they are not allowed. */
3181 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3183 if (BE (token->type == END_OF_RE, 0))
3186 goto parse_bracket_exp_free_return;
3188 if (token->type == OP_CHARSET_RANGE)
3190 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3191 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3192 if (BE (token2.type == END_OF_RE, 0))
3195 goto parse_bracket_exp_free_return;
3197 if (token2.type == OP_CLOSE_BRACKET)
3199 /* We treat the last '-' as a normal character. */
3200 re_string_skip_bytes (regexp, -token_len);
3201 token->type = CHARACTER;
3204 is_range_exp = true;
3208 if (is_range_exp == true)
3210 end_elem.opr.name = end_name_buf;
3211 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3213 if (BE (ret != REG_NOERROR, 0))
3216 goto parse_bracket_exp_free_return;
3219 token_len = peek_token_bracket (token, regexp, syntax);
3222 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3223 &start_elem, &end_elem);
3225 # ifdef RE_ENABLE_I18N
3226 *err = build_range_exp (syntax, sbcset,
3227 dfa->mb_cur_max > 1 ? mbcset : NULL,
3228 &range_alloc, &start_elem, &end_elem);
3230 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3232 #endif /* RE_ENABLE_I18N */
3233 if (BE (*err != REG_NOERROR, 0))
3234 goto parse_bracket_exp_free_return;
3238 switch (start_elem.type)
3241 bitset_set (sbcset, start_elem.opr.ch);
3243 #ifdef RE_ENABLE_I18N
3245 /* Check whether the array has enough space. */
3246 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3248 wchar_t *new_mbchars;
3249 /* Not enough, realloc it. */
3250 /* +1 in case of mbcset->nmbchars is 0. */
3251 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3252 /* Use realloc since array is NULL if *alloc == 0. */
3253 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3255 if (BE (new_mbchars == NULL, 0))
3256 goto parse_bracket_exp_espace;
3257 mbcset->mbchars = new_mbchars;
3259 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3261 #endif /* RE_ENABLE_I18N */
3263 *err = build_equiv_class (sbcset,
3264 #ifdef RE_ENABLE_I18N
3265 mbcset, &equiv_class_alloc,
3266 #endif /* RE_ENABLE_I18N */
3267 start_elem.opr.name);
3268 if (BE (*err != REG_NOERROR, 0))
3269 goto parse_bracket_exp_free_return;
3272 *err = build_collating_symbol (sbcset,
3273 #ifdef RE_ENABLE_I18N
3274 mbcset, &coll_sym_alloc,
3275 #endif /* RE_ENABLE_I18N */
3276 start_elem.opr.name);
3277 if (BE (*err != REG_NOERROR, 0))
3278 goto parse_bracket_exp_free_return;
3281 *err = build_charclass (regexp->trans, sbcset,
3282 #ifdef RE_ENABLE_I18N
3283 mbcset, &char_class_alloc,
3284 #endif /* RE_ENABLE_I18N */
3285 start_elem.opr.name, syntax);
3286 if (BE (*err != REG_NOERROR, 0))
3287 goto parse_bracket_exp_free_return;
3294 if (BE (token->type == END_OF_RE, 0))
3297 goto parse_bracket_exp_free_return;
3299 if (token->type == OP_CLOSE_BRACKET)
3303 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3305 /* If it is non-matching list. */
3307 bitset_not (sbcset);
3309 #ifdef RE_ENABLE_I18N
3310 /* Ensure only single byte characters are set. */
3311 if (dfa->mb_cur_max > 1)
3312 bitset_mask (sbcset, dfa->sb_char);
3314 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3315 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3316 || mbcset->non_match)))
3318 bin_tree_t *mbc_tree;
3320 /* Build a tree for complex bracket. */
3321 dfa->has_mb_node = 1;
3322 br_token.type = COMPLEX_BRACKET;
3323 br_token.opr.mbcset = mbcset;
3324 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325 if (BE (mbc_tree == NULL, 0))
3326 goto parse_bracket_exp_espace;
3327 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3328 if (sbcset[sbc_idx])
3330 /* If there are no bits set in sbcset, there is no point
3331 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3332 if (sbc_idx < BITSET_WORDS)
3334 /* Build a tree for simple bracket. */
3335 br_token.type = SIMPLE_BRACKET;
3336 br_token.opr.sbcset = sbcset;
3337 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338 if (BE (work_tree == NULL, 0))
3339 goto parse_bracket_exp_espace;
3341 /* Then join them by ALT node. */
3342 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3343 if (BE (work_tree == NULL, 0))
3344 goto parse_bracket_exp_espace;
3349 work_tree = mbc_tree;
3353 #endif /* not RE_ENABLE_I18N */
3355 #ifdef RE_ENABLE_I18N
3356 free_charset (mbcset);
3358 /* Build a tree for simple bracket. */
3359 br_token.type = SIMPLE_BRACKET;
3360 br_token.opr.sbcset = sbcset;
3361 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3362 if (BE (work_tree == NULL, 0))
3363 goto parse_bracket_exp_espace;
3367 parse_bracket_exp_espace:
3369 parse_bracket_exp_free_return:
3371 #ifdef RE_ENABLE_I18N
3372 free_charset (mbcset);
3373 #endif /* RE_ENABLE_I18N */
3377 /* Parse an element in the bracket expression. */
3379 static reg_errcode_t
3380 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3381 re_token_t *token, int token_len, re_dfa_t *dfa,
3382 reg_syntax_t syntax, bool accept_hyphen)
3384 #ifdef RE_ENABLE_I18N
3386 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3387 if (cur_char_size > 1)
3389 elem->type = MB_CHAR;
3390 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3391 re_string_skip_bytes (regexp, cur_char_size);
3394 #endif /* RE_ENABLE_I18N */
3395 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3396 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3397 || token->type == OP_OPEN_EQUIV_CLASS)
3398 return parse_bracket_symbol (elem, regexp, token);
3399 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3401 /* A '-' must only appear as anything but a range indicator before
3402 the closing bracket. Everything else is an error. */
3404 (void) peek_token_bracket (&token2, regexp, syntax);
3405 if (token2.type != OP_CLOSE_BRACKET)
3406 /* The actual error value is not standardized since this whole
3407 case is undefined. But ERANGE makes good sense. */
3410 elem->type = SB_CHAR;
3411 elem->opr.ch = token->opr.c;
3415 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3416 such as [:<character_class>:], [.<collating_element>.], and
3417 [=<equivalent_class>=]. */
3419 static reg_errcode_t
3420 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3423 unsigned char ch, delim = token->opr.c;
3425 if (re_string_eoi(regexp))
3429 if (i >= BRACKET_NAME_BUF_SIZE)
3431 if (token->type == OP_OPEN_CHAR_CLASS)
3432 ch = re_string_fetch_byte_case (regexp);
3434 ch = re_string_fetch_byte (regexp);
3435 if (re_string_eoi(regexp))
3437 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3439 elem->opr.name[i] = ch;
3441 re_string_skip_bytes (regexp, 1);
3442 elem->opr.name[i] = '\0';
3443 switch (token->type)
3445 case OP_OPEN_COLL_ELEM:
3446 elem->type = COLL_SYM;
3448 case OP_OPEN_EQUIV_CLASS:
3449 elem->type = EQUIV_CLASS;
3451 case OP_OPEN_CHAR_CLASS:
3452 elem->type = CHAR_CLASS;
3460 /* Helper function for parse_bracket_exp.
3461 Build the equivalence class which is represented by NAME.
3462 The result are written to MBCSET and SBCSET.
3463 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3464 is a pointer argument since we may update it. */
3466 static reg_errcode_t
3467 #ifdef RE_ENABLE_I18N
3468 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3469 Idx *equiv_class_alloc, const unsigned char *name)
3470 #else /* not RE_ENABLE_I18N */
3471 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3472 #endif /* not RE_ENABLE_I18N */
3475 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3478 const int32_t *table, *indirect;
3479 const unsigned char *weights, *extra, *cp;
3480 unsigned char char_buf[2];
3484 /* This #include defines a local function! */
3485 # include <locale/weight.h>
3486 /* Calculate the index for equivalence class. */
3488 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3489 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3490 _NL_COLLATE_WEIGHTMB);
3491 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3492 _NL_COLLATE_EXTRAMB);
3493 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3494 _NL_COLLATE_INDIRECTMB);
3495 idx1 = findidx (&cp, -1);
3496 if (BE (idx1 == 0 || *cp != '\0', 0))
3497 /* This isn't a valid character. */
3498 return REG_ECOLLATE;
3500 /* Build single byte matching table for this equivalence class. */
3501 len = weights[idx1 & 0xffffff];
3502 for (ch = 0; ch < SBC_MAX; ++ch)
3506 idx2 = findidx (&cp, 1);
3511 /* This isn't a valid character. */
3513 /* Compare only if the length matches and the collation rule
3514 index is the same. */
3515 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3519 while (cnt <= len &&
3520 weights[(idx1 & 0xffffff) + 1 + cnt]
3521 == weights[(idx2 & 0xffffff) + 1 + cnt])
3525 bitset_set (sbcset, ch);
3528 /* Check whether the array has enough space. */
3529 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3531 /* Not enough, realloc it. */
3532 /* +1 in case of mbcset->nequiv_classes is 0. */
3533 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3534 /* Use realloc since the array is NULL if *alloc == 0. */
3535 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3537 new_equiv_class_alloc);
3538 if (BE (new_equiv_classes == NULL, 0))
3540 mbcset->equiv_classes = new_equiv_classes;
3541 *equiv_class_alloc = new_equiv_class_alloc;
3543 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3548 if (BE (strlen ((const char *) name) != 1, 0))
3549 return REG_ECOLLATE;
3550 bitset_set (sbcset, *name);
3555 /* Helper function for parse_bracket_exp.
3556 Build the character class which is represented by NAME.
3557 The result are written to MBCSET and SBCSET.
3558 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3559 is a pointer argument since we may update it. */
3561 static reg_errcode_t
3562 #ifdef RE_ENABLE_I18N
3563 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3564 re_charset_t *mbcset, Idx *char_class_alloc,
3565 const unsigned char *class_name, reg_syntax_t syntax)
3566 #else /* not RE_ENABLE_I18N */
3567 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3568 const unsigned char *class_name, reg_syntax_t syntax)
3569 #endif /* not RE_ENABLE_I18N */
3572 const char *name = (const char *) class_name;
3574 /* In case of REG_ICASE "upper" and "lower" match the both of
3575 upper and lower cases. */
3576 if ((syntax & RE_ICASE)
3577 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3580 #ifdef RE_ENABLE_I18N
3581 /* Check the space of the arrays. */
3582 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3584 /* Not enough, realloc it. */
3585 /* +1 in case of mbcset->nchar_classes is 0. */
3586 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3587 /* Use realloc since array is NULL if *alloc == 0. */
3588 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3589 new_char_class_alloc);
3590 if (BE (new_char_classes == NULL, 0))
3592 mbcset->char_classes = new_char_classes;
3593 *char_class_alloc = new_char_class_alloc;
3595 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3596 #endif /* RE_ENABLE_I18N */
3598 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3600 if (BE (trans != NULL, 0)) \
3602 for (i = 0; i < SBC_MAX; ++i) \
3603 if (ctype_func (i)) \
3604 bitset_set (sbcset, trans[i]); \
3608 for (i = 0; i < SBC_MAX; ++i) \
3609 if (ctype_func (i)) \
3610 bitset_set (sbcset, i); \
3614 if (strcmp (name, "alnum") == 0)
3615 BUILD_CHARCLASS_LOOP (isalnum);
3616 else if (strcmp (name, "cntrl") == 0)
3617 BUILD_CHARCLASS_LOOP (iscntrl);
3618 else if (strcmp (name, "lower") == 0)
3619 BUILD_CHARCLASS_LOOP (islower);
3620 else if (strcmp (name, "space") == 0)
3621 BUILD_CHARCLASS_LOOP (isspace);
3622 else if (strcmp (name, "alpha") == 0)
3623 BUILD_CHARCLASS_LOOP (isalpha);
3624 else if (strcmp (name, "digit") == 0)
3625 BUILD_CHARCLASS_LOOP (isdigit);
3626 else if (strcmp (name, "print") == 0)
3627 BUILD_CHARCLASS_LOOP (isprint);
3628 else if (strcmp (name, "upper") == 0)
3629 BUILD_CHARCLASS_LOOP (isupper);
3630 else if (strcmp (name, "blank") == 0)
3631 BUILD_CHARCLASS_LOOP (isblank);
3632 else if (strcmp (name, "graph") == 0)
3633 BUILD_CHARCLASS_LOOP (isgraph);
3634 else if (strcmp (name, "punct") == 0)
3635 BUILD_CHARCLASS_LOOP (ispunct);
3636 else if (strcmp (name, "xdigit") == 0)
3637 BUILD_CHARCLASS_LOOP (isxdigit);
3645 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3646 const unsigned char *class_name,
3647 const unsigned char *extra, bool non_match,
3650 re_bitset_ptr_t sbcset;
3651 #ifdef RE_ENABLE_I18N
3652 re_charset_t *mbcset;
3654 #endif /* not RE_ENABLE_I18N */
3656 re_token_t br_token;
3659 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3660 #ifdef RE_ENABLE_I18N
3661 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3662 #endif /* RE_ENABLE_I18N */
3664 #ifdef RE_ENABLE_I18N
3665 if (BE (sbcset == NULL || mbcset == NULL, 0))
3666 #else /* not RE_ENABLE_I18N */
3667 if (BE (sbcset == NULL, 0))
3668 #endif /* not RE_ENABLE_I18N */
3676 #ifdef RE_ENABLE_I18N
3677 mbcset->non_match = 1;
3678 #endif /* not RE_ENABLE_I18N */
3681 /* We don't care the syntax in this case. */
3682 ret = build_charclass (trans, sbcset,
3683 #ifdef RE_ENABLE_I18N
3685 #endif /* RE_ENABLE_I18N */
3688 if (BE (ret != REG_NOERROR, 0))
3691 #ifdef RE_ENABLE_I18N
3692 free_charset (mbcset);
3693 #endif /* RE_ENABLE_I18N */
3697 /* \w match '_' also. */
3698 for (; *extra; extra++)
3699 bitset_set (sbcset, *extra);
3701 /* If it is non-matching list. */
3703 bitset_not (sbcset);
3705 #ifdef RE_ENABLE_I18N
3706 /* Ensure only single byte characters are set. */
3707 if (dfa->mb_cur_max > 1)
3708 bitset_mask (sbcset, dfa->sb_char);
3711 /* Build a tree for simple bracket. */
3712 br_token.type = SIMPLE_BRACKET;
3713 br_token.opr.sbcset = sbcset;
3714 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3715 if (BE (tree == NULL, 0))
3716 goto build_word_op_espace;
3718 #ifdef RE_ENABLE_I18N
3719 if (dfa->mb_cur_max > 1)
3721 bin_tree_t *mbc_tree;
3722 /* Build a tree for complex bracket. */
3723 br_token.type = COMPLEX_BRACKET;
3724 br_token.opr.mbcset = mbcset;
3725 dfa->has_mb_node = 1;
3726 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3727 if (BE (mbc_tree == NULL, 0))
3728 goto build_word_op_espace;
3729 /* Then join them by ALT node. */
3730 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3731 if (BE (mbc_tree != NULL, 1))
3736 free_charset (mbcset);
3739 #else /* not RE_ENABLE_I18N */
3741 #endif /* not RE_ENABLE_I18N */
3743 build_word_op_espace:
3745 #ifdef RE_ENABLE_I18N
3746 free_charset (mbcset);
3747 #endif /* RE_ENABLE_I18N */
3752 /* This is intended for the expressions like "a{1,3}".
3753 Fetch a number from 'input', and return the number.
3754 Return REG_MISSING if the number field is empty like "{,1}".
3755 Return REG_ERROR if an error occurred. */
3758 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3760 Idx num = REG_MISSING;
3764 fetch_token (token, input, syntax);
3766 if (BE (token->type == END_OF_RE, 0))
3768 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3770 num = ((token->type != CHARACTER || c < '0' || '9' < c
3771 || num == REG_ERROR)
3773 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3774 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3779 #ifdef RE_ENABLE_I18N
3781 free_charset (re_charset_t *cset)
3783 re_free (cset->mbchars);
3785 re_free (cset->coll_syms);
3786 re_free (cset->equiv_classes);
3787 re_free (cset->range_starts);
3788 re_free (cset->range_ends);
3790 re_free (cset->char_classes);
3793 #endif /* RE_ENABLE_I18N */
3795 /* Functions for binary tree operation. */
3797 /* Create a tree node. */
3800 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3801 re_token_type_t type)
3805 return create_token_tree (dfa, left, right, &t);
3809 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3810 const re_token_t *token)
3813 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3815 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3817 if (storage == NULL)
3819 storage->next = dfa->str_tree_storage;
3820 dfa->str_tree_storage = storage;
3821 dfa->str_tree_storage_idx = 0;
3823 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3825 tree->parent = NULL;
3827 tree->right = right;
3828 tree->token = *token;
3829 tree->token.duplicated = 0;
3830 tree->token.opt_subexp = 0;
3833 tree->node_idx = REG_MISSING;
3836 left->parent = tree;
3838 right->parent = tree;
3842 /* Mark the tree SRC as an optional subexpression.
3843 To be called from preorder or postorder. */
3845 static reg_errcode_t
3846 mark_opt_subexp (void *extra, bin_tree_t *node)
3848 Idx idx = (Idx) (long) extra;
3849 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3850 node->token.opt_subexp = 1;
3855 /* Free the allocated memory inside NODE. */
3858 free_token (re_token_t *node)
3860 #ifdef RE_ENABLE_I18N
3861 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3862 free_charset (node->opr.mbcset);
3864 #endif /* RE_ENABLE_I18N */
3865 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3866 re_free (node->opr.sbcset);
3869 /* Worker function for tree walking. Free the allocated memory inside NODE
3870 and its children. */
3872 static reg_errcode_t
3873 free_tree (void *extra, bin_tree_t *node)
3875 free_token (&node->token);
3880 /* Duplicate the node SRC, and return new node. This is a preorder
3881 visit similar to the one implemented by the generic visitor, but
3882 we need more infrastructure to maintain two parallel trees --- so,
3883 it's easier to duplicate. */
3886 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3888 const bin_tree_t *node;
3889 bin_tree_t *dup_root;
3890 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3892 for (node = root; ; )
3894 /* Create a new tree and link it back to the current parent. */
3895 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3898 (*p_new)->parent = dup_node;
3899 (*p_new)->token.duplicated = 1;
3902 /* Go to the left node, or up and to the right. */
3906 p_new = &dup_node->left;
3910 const bin_tree_t *prev = NULL;
3911 while (node->right == prev || node->right == NULL)
3914 node = node->parent;
3915 dup_node = dup_node->parent;
3920 p_new = &dup_node->right;