1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 /* See if we have to try all bytes which start multiple collation
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
376 re_set_fastmap (fastmap, icase, i);
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
387 || cset->nequiv_classes
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
528 weak_alias (__regcomp, regcomp)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
536 regerror (errcode, preg, errbuf, errbuf_size)
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
541 #else /* size_t might promote */
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
561 msg_size = strlen (msg) + 1; /* Includes the null. */
563 if (BE (errbuf_size != 0, 1))
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
571 memcpy (errbuf, msg, cpy_size);
577 weak_alias (__regerror, regerror)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
588 /* Set the first 128 bits. */
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
611 free_dfa_content (re_dfa_t *dfa)
616 for (i = 0; i < dfa->nodes_len; ++i)
617 free_token (dfa->nodes + i);
618 re_free (dfa->nexts);
619 for (i = 0; i < dfa->nodes_len; ++i)
621 if (dfa->eclosures != NULL)
622 re_node_set_free (dfa->eclosures + i);
623 if (dfa->inveclosures != NULL)
624 re_node_set_free (dfa->inveclosures + i);
625 if (dfa->edests != NULL)
626 re_node_set_free (dfa->edests + i);
628 re_free (dfa->edests);
629 re_free (dfa->eclosures);
630 re_free (dfa->inveclosures);
631 re_free (dfa->nodes);
633 if (dfa->state_table)
634 for (i = 0; i <= dfa->state_hash_mask; ++i)
636 struct re_state_table_entry *entry = dfa->state_table + i;
637 for (j = 0; j < entry->num; ++j)
639 re_dfastate_t *state = entry->array[j];
642 re_free (entry->array);
644 re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646 if (dfa->sb_char != utf8_sb_map)
647 re_free (dfa->sb_char);
649 re_free (dfa->subexp_map);
651 re_free (dfa->re_str);
658 /* Free dynamically allocated space used by PREG. */
664 re_dfa_t *dfa = preg->buffer;
665 if (BE (dfa != NULL, 1))
666 free_dfa_content (dfa);
670 re_free (preg->fastmap);
671 preg->fastmap = NULL;
673 re_free (preg->translate);
674 preg->translate = NULL;
677 weak_alias (__regfree, regfree)
680 /* Entry points compatible with 4.2 BSD regex library. We don't define
681 them unless specifically requested. */
683 #if defined _REGEX_RE_COMP || defined _LIBC
685 /* BSD has one and only one pattern buffer. */
686 static struct re_pattern_buffer re_comp_buf;
690 /* Make these definitions weak in libc, so POSIX programs can redefine
691 these names if they don't use our functions, and still use
692 regcomp/regexec above without link errors. */
703 if (!re_comp_buf.buffer)
704 return gettext ("No previous regular expression");
708 if (re_comp_buf.buffer)
710 fastmap = re_comp_buf.fastmap;
711 re_comp_buf.fastmap = NULL;
712 __regfree (&re_comp_buf);
713 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
714 re_comp_buf.fastmap = fastmap;
717 if (re_comp_buf.fastmap == NULL)
719 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
720 if (re_comp_buf.fastmap == NULL)
721 return (char *) gettext (__re_error_msgid
722 + __re_error_msgid_idx[(int) REG_ESPACE]);
725 /* Since 're_exec' always passes NULL for the 'regs' argument, we
726 don't need to initialize the pattern buffer fields which affect it. */
728 /* Match anchors at newlines. */
729 re_comp_buf.newline_anchor = 1;
731 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
736 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
737 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741 libc_freeres_fn (free_mem)
743 __regfree (&re_comp_buf);
747 #endif /* _REGEX_RE_COMP */
749 /* Internal entry point.
750 Compile the regular expression PATTERN, whose length is LENGTH.
751 SYNTAX indicate regular expression's syntax. */
754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
757 reg_errcode_t err = REG_NOERROR;
761 /* Initialize the pattern buffer. */
762 preg->fastmap_accurate = 0;
763 preg->syntax = syntax;
764 preg->not_bol = preg->not_eol = 0;
767 preg->can_be_null = 0;
768 preg->regs_allocated = REGS_UNALLOCATED;
770 /* Initialize the dfa. */
772 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
774 /* If zero allocated, but buffer is non-null, try to realloc
775 enough space. This loses if buffer's address is bogus, but
776 that is the user's responsibility. If ->buffer is NULL this
777 is a simple allocation. */
778 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
781 preg->allocated = sizeof (re_dfa_t);
784 preg->used = sizeof (re_dfa_t);
786 err = init_dfa (dfa, length);
787 if (BE (err != REG_NOERROR, 0))
789 free_dfa_content (dfa);
795 /* Note: length+1 will not overflow since it is checked in init_dfa. */
796 dfa->re_str = re_malloc (char, length + 1);
797 strncpy (dfa->re_str, pattern, length + 1);
800 __libc_lock_init (dfa->lock);
802 err = re_string_construct (®exp, pattern, length, preg->translate,
803 (syntax & RE_ICASE) != 0, dfa);
804 if (BE (err != REG_NOERROR, 0))
806 re_compile_internal_free_return:
807 free_workarea_compile (preg);
808 re_string_destruct (®exp);
809 free_dfa_content (dfa);
815 /* Parse the regular expression, and build a structure tree. */
817 dfa->str_tree = parse (®exp, preg, syntax, &err);
818 if (BE (dfa->str_tree == NULL, 0))
819 goto re_compile_internal_free_return;
821 /* Analyze the tree and create the nfa. */
822 err = analyze (preg);
823 if (BE (err != REG_NOERROR, 0))
824 goto re_compile_internal_free_return;
826 #ifdef RE_ENABLE_I18N
827 /* If possible, do searching in single byte encoding to speed things up. */
828 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
832 /* Then create the initial state of the dfa. */
833 err = create_initial_state (dfa);
835 /* Release work areas. */
836 free_workarea_compile (preg);
837 re_string_destruct (®exp);
839 if (BE (err != REG_NOERROR, 0))
841 free_dfa_content (dfa);
849 /* Initialize DFA. We use the length of the regular expression PAT_LEN
850 as the initial length of some arrays. */
853 init_dfa (re_dfa_t *dfa, size_t pat_len)
855 __re_size_t table_size;
857 const char *codeset_name;
859 #ifdef RE_ENABLE_I18N
860 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
862 size_t max_i18n_object_size = 0;
864 size_t max_object_size =
865 MAX (sizeof (struct re_state_table_entry),
866 MAX (sizeof (re_token_t),
867 MAX (sizeof (re_node_set),
868 MAX (sizeof (regmatch_t),
869 max_i18n_object_size))));
871 memset (dfa, '\0', sizeof (re_dfa_t));
873 /* Force allocation of str_tree_storage the first time. */
874 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
876 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
877 calculation below, and for similar doubling calculations
878 elsewhere. And it's <= rather than <, because some of the
879 doubling calculations add 1 afterwards. */
880 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
883 dfa->nodes_alloc = pat_len + 1;
884 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
886 /* table_size = 2 ^ ceil(log pat_len) */
887 for (table_size = 1; ; table_size <<= 1)
888 if (table_size > pat_len)
891 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
892 dfa->state_hash_mask = table_size - 1;
894 dfa->mb_cur_max = MB_CUR_MAX;
896 if (dfa->mb_cur_max == 6
897 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
899 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
902 codeset_name = nl_langinfo (CODESET);
903 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
904 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
905 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
906 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
909 /* We check exhaustively in the loop below if this charset is a
910 superset of ASCII. */
911 dfa->map_notascii = 0;
914 #ifdef RE_ENABLE_I18N
915 if (dfa->mb_cur_max > 1)
918 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
923 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
924 if (BE (dfa->sb_char == NULL, 0))
927 /* Set the bits corresponding to single byte chars. */
928 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
929 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
931 wint_t wch = __btowc (ch);
933 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
935 if (isascii (ch) && wch != ch)
936 dfa->map_notascii = 1;
943 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
948 /* Initialize WORD_CHAR table, which indicate which character is
949 "word". In this case "word" means that it is the word construction
950 character used by some operators like "\<", "\>", etc. */
954 init_word_char (re_dfa_t *dfa)
956 dfa->word_ops_used = 1;
960 if (BE (dfa->map_notascii == 0, 1))
962 bitset_word_t bits0 = 0x00000000;
963 bitset_word_t bits1 = 0x03ff0000;
964 bitset_word_t bits2 = 0x87fffffe;
965 bitset_word_t bits3 = 0x07fffffe;
966 if (BITSET_WORD_BITS == 64)
968 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
969 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
972 else if (BITSET_WORD_BITS == 32)
974 dfa->word_char[0] = bits0;
975 dfa->word_char[1] = bits1;
976 dfa->word_char[2] = bits2;
977 dfa->word_char[3] = bits3;
984 if (BE (dfa->is_utf8, 1))
986 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992 for (; i < BITSET_WORDS; ++i)
993 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
994 if (isalnum (ch) || ch == '_')
995 dfa->word_char[i] |= (bitset_word_t) 1 << j;
998 /* Free the work area which are only used while compiling. */
1001 free_workarea_compile (regex_t *preg)
1003 re_dfa_t *dfa = preg->buffer;
1004 bin_tree_storage_t *storage, *next;
1005 for (storage = dfa->str_tree_storage; storage; storage = next)
1007 next = storage->next;
1010 dfa->str_tree_storage = NULL;
1011 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1012 dfa->str_tree = NULL;
1013 re_free (dfa->org_indices);
1014 dfa->org_indices = NULL;
1017 /* Create initial states for all contexts. */
1019 static reg_errcode_t
1020 create_initial_state (re_dfa_t *dfa)
1024 re_node_set init_nodes;
1026 /* Initial states have the epsilon closure of the node which is
1027 the first node of the regular expression. */
1028 first = dfa->str_tree->first->node_idx;
1029 dfa->init_node = first;
1030 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1031 if (BE (err != REG_NOERROR, 0))
1034 /* The back-references which are in initial states can epsilon transit,
1035 since in this case all of the subexpressions can be null.
1036 Then we add epsilon closures of the nodes which are the next nodes of
1037 the back-references. */
1038 if (dfa->nbackref > 0)
1039 for (i = 0; i < init_nodes.nelem; ++i)
1041 Idx node_idx = init_nodes.elems[i];
1042 re_token_type_t type = dfa->nodes[node_idx].type;
1045 if (type != OP_BACK_REF)
1047 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1049 re_token_t *clexp_node;
1050 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1051 if (clexp_node->type == OP_CLOSE_SUBEXP
1052 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1055 if (clexp_idx == init_nodes.nelem)
1058 if (type == OP_BACK_REF)
1060 Idx dest_idx = dfa->edests[node_idx].elems[0];
1061 if (!re_node_set_contains (&init_nodes, dest_idx))
1063 reg_errcode_t merge_err
1064 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1065 if (merge_err != REG_NOERROR)
1072 /* It must be the first time to invoke acquire_state. */
1073 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1074 /* We don't check ERR here, since the initial state must not be NULL. */
1075 if (BE (dfa->init_state == NULL, 0))
1077 if (dfa->init_state->has_constraint)
1079 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1081 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1083 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1087 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1088 || dfa->init_state_begbuf == NULL, 0))
1092 dfa->init_state_word = dfa->init_state_nl
1093 = dfa->init_state_begbuf = dfa->init_state;
1095 re_node_set_free (&init_nodes);
1099 #ifdef RE_ENABLE_I18N
1100 /* If it is possible to do searching in single byte encoding instead of UTF-8
1101 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1102 DFA nodes where needed. */
1105 optimize_utf8 (re_dfa_t *dfa)
1109 bool mb_chars = false;
1110 bool has_period = false;
1112 for (node = 0; node < dfa->nodes_len; ++node)
1113 switch (dfa->nodes[node].type)
1116 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1120 switch (dfa->nodes[node].opr.ctx_type)
1128 /* Word anchors etc. cannot be handled. It's okay to test
1129 opr.ctx_type since constraints (for all DFA nodes) are
1130 created by ORing one or more opr.ctx_type values. */
1140 case OP_DUP_ASTERISK:
1141 case OP_OPEN_SUBEXP:
1142 case OP_CLOSE_SUBEXP:
1144 case COMPLEX_BRACKET:
1146 case SIMPLE_BRACKET:
1147 /* Just double check. */
1149 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1151 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1152 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1154 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1164 if (mb_chars || has_period)
1165 for (node = 0; node < dfa->nodes_len; ++node)
1167 if (dfa->nodes[node].type == CHARACTER
1168 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1169 dfa->nodes[node].mb_partial = 0;
1170 else if (dfa->nodes[node].type == OP_PERIOD)
1171 dfa->nodes[node].type = OP_UTF8_PERIOD;
1174 /* The search can be in single byte locale. */
1175 dfa->mb_cur_max = 1;
1177 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1181 /* Analyze the structure tree, and calculate "first", "next", "edest",
1182 "eclosure", and "inveclosure". */
1184 static reg_errcode_t
1185 analyze (regex_t *preg)
1187 re_dfa_t *dfa = preg->buffer;
1190 /* Allocate arrays. */
1191 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1196 || dfa->eclosures == NULL, 0))
1199 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200 if (dfa->subexp_map != NULL)
1203 for (i = 0; i < preg->re_nsub; i++)
1204 dfa->subexp_map[i] = i;
1205 preorder (dfa->str_tree, optimize_subexps, dfa);
1206 for (i = 0; i < preg->re_nsub; i++)
1207 if (dfa->subexp_map[i] != i)
1209 if (i == preg->re_nsub)
1211 free (dfa->subexp_map);
1212 dfa->subexp_map = NULL;
1216 ret = postorder (dfa->str_tree, lower_subexps, preg);
1217 if (BE (ret != REG_NOERROR, 0))
1219 ret = postorder (dfa->str_tree, calc_first, dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1222 preorder (dfa->str_tree, calc_next, dfa);
1223 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224 if (BE (ret != REG_NOERROR, 0))
1226 ret = calc_eclosure (dfa);
1227 if (BE (ret != REG_NOERROR, 0))
1230 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1231 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1232 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1235 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236 if (BE (dfa->inveclosures == NULL, 0))
1238 ret = calc_inveclosure (dfa);
1244 /* Our parse trees are very unbalanced, so we cannot use a stack to
1245 implement parse tree visits. Instead, we use parent pointers and
1246 some hairy code in these two functions. */
1247 static reg_errcode_t
1248 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1251 bin_tree_t *node, *prev;
1253 for (node = root; ; )
1255 /* Descend down the tree, preferably to the left (or to the right
1256 if that's the only child). */
1257 while (node->left || node->right)
1265 reg_errcode_t err = fn (extra, node);
1266 if (BE (err != REG_NOERROR, 0))
1268 if (node->parent == NULL)
1271 node = node->parent;
1273 /* Go up while we have a node that is reached from the right. */
1274 while (node->right == prev || node->right == NULL);
1279 static reg_errcode_t
1280 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1285 for (node = root; ; )
1287 reg_errcode_t err = fn (extra, node);
1288 if (BE (err != REG_NOERROR, 0))
1291 /* Go to the left node, or up and to the right. */
1296 bin_tree_t *prev = NULL;
1297 while (node->right == prev || node->right == NULL)
1300 node = node->parent;
1309 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1311 backreferences as well. Requires a preorder visit. */
1312 static reg_errcode_t
1313 optimize_subexps (void *extra, bin_tree_t *node)
1315 re_dfa_t *dfa = (re_dfa_t *) extra;
1317 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1319 int idx = node->token.opr.idx;
1320 node->token.opr.idx = dfa->subexp_map[idx];
1321 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1324 else if (node->token.type == SUBEXP
1325 && node->left && node->left->token.type == SUBEXP)
1327 Idx other_idx = node->left->token.opr.idx;
1329 node->left = node->left->left;
1331 node->left->parent = node;
1333 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334 if (other_idx < BITSET_WORD_BITS)
1335 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1343 static reg_errcode_t
1344 lower_subexps (void *extra, bin_tree_t *node)
1346 regex_t *preg = (regex_t *) extra;
1347 reg_errcode_t err = REG_NOERROR;
1349 if (node->left && node->left->token.type == SUBEXP)
1351 node->left = lower_subexp (&err, preg, node->left);
1353 node->left->parent = node;
1355 if (node->right && node->right->token.type == SUBEXP)
1357 node->right = lower_subexp (&err, preg, node->right);
1359 node->right->parent = node;
1366 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1368 re_dfa_t *dfa = preg->buffer;
1369 bin_tree_t *body = node->left;
1370 bin_tree_t *op, *cls, *tree1, *tree;
1373 /* We do not optimize empty subexpressions, because otherwise we may
1374 have bad CONCAT nodes with NULL children. This is obviously not
1375 very common, so we do not lose much. An example that triggers
1376 this case is the sed "script" /\(\)/x. */
1377 && node->left != NULL
1378 && (node->token.opr.idx >= BITSET_WORD_BITS
1379 || !(dfa->used_bkref_map
1380 & ((bitset_word_t) 1 << node->token.opr.idx))))
1383 /* Convert the SUBEXP node to the concatenation of an
1384 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1385 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388 tree = create_tree (dfa, op, tree1, CONCAT);
1389 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1400 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401 nodes. Requires a postorder visit. */
1402 static reg_errcode_t
1403 calc_first (void *extra, bin_tree_t *node)
1405 re_dfa_t *dfa = (re_dfa_t *) extra;
1406 if (node->token.type == CONCAT)
1408 node->first = node->left->first;
1409 node->node_idx = node->left->node_idx;
1414 node->node_idx = re_dfa_add_node (dfa, node->token);
1415 if (BE (node->node_idx == REG_MISSING, 0))
1417 if (node->token.type == ANCHOR)
1418 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1423 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1424 static reg_errcode_t
1425 calc_next (void *extra, bin_tree_t *node)
1427 switch (node->token.type)
1429 case OP_DUP_ASTERISK:
1430 node->left->next = node;
1433 node->left->next = node->right->first;
1434 node->right->next = node->next;
1438 node->left->next = node->next;
1440 node->right->next = node->next;
1446 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1447 static reg_errcode_t
1448 link_nfa_nodes (void *extra, bin_tree_t *node)
1450 re_dfa_t *dfa = (re_dfa_t *) extra;
1451 Idx idx = node->node_idx;
1452 reg_errcode_t err = REG_NOERROR;
1454 switch (node->token.type)
1460 assert (node->next == NULL);
1463 case OP_DUP_ASTERISK:
1467 dfa->has_plural_match = 1;
1468 if (node->left != NULL)
1469 left = node->left->first->node_idx;
1471 left = node->next->node_idx;
1472 if (node->right != NULL)
1473 right = node->right->first->node_idx;
1475 right = node->next->node_idx;
1476 assert (REG_VALID_INDEX (left));
1477 assert (REG_VALID_INDEX (right));
1478 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1483 case OP_OPEN_SUBEXP:
1484 case OP_CLOSE_SUBEXP:
1485 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1489 dfa->nexts[idx] = node->next->node_idx;
1490 if (node->token.type == OP_BACK_REF)
1491 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1495 assert (!IS_EPSILON_NODE (node->token.type));
1496 dfa->nexts[idx] = node->next->node_idx;
1503 /* Duplicate the epsilon closure of the node ROOT_NODE.
1504 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505 to their own constraint. */
1507 static reg_errcode_t
1509 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510 Idx root_node, unsigned int init_constraint)
1512 Idx org_node, clone_node;
1514 unsigned int constraint = init_constraint;
1515 for (org_node = top_org_node, clone_node = top_clone_node;;)
1517 Idx org_dest, clone_dest;
1518 if (dfa->nodes[org_node].type == OP_BACK_REF)
1520 /* If the back reference epsilon-transit, its destination must
1521 also have the constraint. Then duplicate the epsilon closure
1522 of the destination of the back reference, and store it in
1523 edests of the back reference. */
1524 org_dest = dfa->nexts[org_node];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 clone_dest = duplicate_node (dfa, org_dest, constraint);
1527 if (BE (clone_dest == REG_MISSING, 0))
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 else if (dfa->edests[org_node].nelem == 0)
1536 /* In case of the node can't epsilon-transit, don't duplicate the
1537 destination and store the original destination as the
1538 destination of the node. */
1539 dfa->nexts[clone_node] = dfa->nexts[org_node];
1542 else if (dfa->edests[org_node].nelem == 1)
1544 /* In case of the node can epsilon-transit, and it has only one
1546 org_dest = dfa->edests[org_node].elems[0];
1547 re_node_set_empty (dfa->edests + clone_node);
1548 /* If the node is root_node itself, it means the epsilon closure
1549 has a loop. Then tie it to the destination of the root_node. */
1550 if (org_node == root_node && clone_node != org_node)
1552 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1557 /* In case the node has another constraint, append it. */
1558 constraint |= dfa->nodes[org_node].constraint;
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == REG_MISSING, 0))
1562 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 else /* dfa->edests[org_node].nelem == 2 */
1568 /* In case of the node can epsilon-transit, and it has two
1569 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1570 org_dest = dfa->edests[org_node].elems[0];
1571 re_node_set_empty (dfa->edests + clone_node);
1572 /* Search for a duplicated node which satisfies the constraint. */
1573 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574 if (clone_dest == REG_MISSING)
1576 /* There is no such duplicated node, create a new one. */
1578 clone_dest = duplicate_node (dfa, org_dest, constraint);
1579 if (BE (clone_dest == REG_MISSING, 0))
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585 root_node, constraint);
1586 if (BE (err != REG_NOERROR, 0))
1591 /* There is a duplicated node which satisfies the constraint,
1592 use it to avoid infinite loop. */
1593 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1598 org_dest = dfa->edests[org_node].elems[1];
1599 clone_dest = duplicate_node (dfa, org_dest, constraint);
1600 if (BE (clone_dest == REG_MISSING, 0))
1602 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1606 org_node = org_dest;
1607 clone_node = clone_dest;
1612 /* Search for a node which is duplicated from the node ORG_NODE, and
1613 satisfies the constraint CONSTRAINT. */
1616 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617 unsigned int constraint)
1620 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1622 if (org_node == dfa->org_indices[idx]
1623 && constraint == dfa->nodes[idx].constraint)
1624 return idx; /* Found. */
1626 return REG_MISSING; /* Not found. */
1629 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630 Return the index of the new node, or REG_MISSING if insufficient storage is
1634 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1636 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637 if (BE (dup_idx != REG_MISSING, 1))
1639 dfa->nodes[dup_idx].constraint = constraint;
1640 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1641 dfa->nodes[dup_idx].duplicated = 1;
1643 /* Store the index of the original node. */
1644 dfa->org_indices[dup_idx] = org_idx;
1649 static reg_errcode_t
1650 calc_inveclosure (re_dfa_t *dfa)
1654 for (idx = 0; idx < dfa->nodes_len; ++idx)
1655 re_node_set_init_empty (dfa->inveclosures + idx);
1657 for (src = 0; src < dfa->nodes_len; ++src)
1659 Idx *elems = dfa->eclosures[src].elems;
1660 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1662 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1671 /* Calculate "eclosure" for all the node in DFA. */
1673 static reg_errcode_t
1674 calc_eclosure (re_dfa_t *dfa)
1679 assert (dfa->nodes_len > 0);
1682 /* For each nodes, calculate epsilon closure. */
1683 for (node_idx = 0; ; ++node_idx)
1686 re_node_set eclosure_elem;
1687 if (node_idx == dfa->nodes_len)
1696 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1699 /* If we have already calculated, skip it. */
1700 if (dfa->eclosures[node_idx].nelem != 0)
1702 /* Calculate epsilon closure of 'node_idx'. */
1703 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704 if (BE (err != REG_NOERROR, 0))
1707 if (dfa->eclosures[node_idx].nelem == 0)
1710 re_node_set_free (&eclosure_elem);
1716 /* Calculate epsilon closure of NODE. */
1718 static reg_errcode_t
1719 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1723 re_node_set eclosure;
1725 bool incomplete = false;
1726 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727 if (BE (err != REG_NOERROR, 0))
1730 /* This indicates that we are calculating this node now.
1731 We reference this value to avoid infinite loop. */
1732 dfa->eclosures[node].nelem = REG_MISSING;
1734 /* If the current node has constraints, duplicate all nodes
1735 since they must inherit the constraints. */
1736 if (dfa->nodes[node].constraint
1737 && dfa->edests[node].nelem
1738 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1740 err = duplicate_node_closure (dfa, node, node, node,
1741 dfa->nodes[node].constraint);
1742 if (BE (err != REG_NOERROR, 0))
1746 /* Expand each epsilon destination nodes. */
1747 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748 for (i = 0; i < dfa->edests[node].nelem; ++i)
1750 re_node_set eclosure_elem;
1751 Idx edest = dfa->edests[node].elems[i];
1752 /* If calculating the epsilon closure of 'edest' is in progress,
1753 return intermediate result. */
1754 if (dfa->eclosures[edest].nelem == REG_MISSING)
1759 /* If we haven't calculated the epsilon closure of 'edest' yet,
1760 calculate now. Otherwise use calculated epsilon closure. */
1761 if (dfa->eclosures[edest].nelem == 0)
1763 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764 if (BE (err != REG_NOERROR, 0))
1768 eclosure_elem = dfa->eclosures[edest];
1769 /* Merge the epsilon closure of 'edest'. */
1770 err = re_node_set_merge (&eclosure, &eclosure_elem);
1771 if (BE (err != REG_NOERROR, 0))
1773 /* If the epsilon closure of 'edest' is incomplete,
1774 the epsilon closure of this node is also incomplete. */
1775 if (dfa->eclosures[edest].nelem == 0)
1778 re_node_set_free (&eclosure_elem);
1782 /* An epsilon closure includes itself. */
1783 ok = re_node_set_insert (&eclosure, node);
1786 if (incomplete && !root)
1787 dfa->eclosures[node].nelem = 0;
1789 dfa->eclosures[node] = eclosure;
1790 *new_set = eclosure;
1794 /* Functions for token which are used in the parser. */
1796 /* Fetch a token from INPUT.
1797 We must not use this function inside bracket expressions. */
1801 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1803 re_string_skip_bytes (input, peek_token (result, input, syntax));
1806 /* Peek a token from INPUT, and return the length of the token.
1807 We must not use this function inside bracket expressions. */
1811 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1815 if (re_string_eoi (input))
1817 token->type = END_OF_RE;
1821 c = re_string_peek_byte (input, 0);
1824 token->word_char = 0;
1825 #ifdef RE_ENABLE_I18N
1826 token->mb_partial = 0;
1827 if (input->mb_cur_max > 1 &&
1828 !re_string_first_byte (input, re_string_cur_idx (input)))
1830 token->type = CHARACTER;
1831 token->mb_partial = 1;
1838 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1840 token->type = BACK_SLASH;
1844 c2 = re_string_peek_byte_case (input, 1);
1846 token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848 if (input->mb_cur_max > 1)
1850 wint_t wc = re_string_wchar_at (input,
1851 re_string_cur_idx (input) + 1);
1852 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1856 token->word_char = IS_WORD_CHAR (c2) != 0;
1861 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1862 token->type = OP_ALT;
1864 case '1': case '2': case '3': case '4': case '5':
1865 case '6': case '7': case '8': case '9':
1866 if (!(syntax & RE_NO_BK_REFS))
1868 token->type = OP_BACK_REF;
1869 token->opr.idx = c2 - '1';
1873 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_FIRST;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_LAST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = WORD_DELIM;
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = NOT_WORD_DELIM;
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 token->type = OP_WORD;
1905 if (!(syntax & RE_NO_GNU_OPS))
1906 token->type = OP_NOTWORD;
1909 if (!(syntax & RE_NO_GNU_OPS))
1910 token->type = OP_SPACE;
1913 if (!(syntax & RE_NO_GNU_OPS))
1914 token->type = OP_NOTSPACE;
1917 if (!(syntax & RE_NO_GNU_OPS))
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = BUF_FIRST;
1924 if (!(syntax & RE_NO_GNU_OPS))
1926 token->type = ANCHOR;
1927 token->opr.ctx_type = BUF_LAST;
1931 if (!(syntax & RE_NO_BK_PARENS))
1932 token->type = OP_OPEN_SUBEXP;
1935 if (!(syntax & RE_NO_BK_PARENS))
1936 token->type = OP_CLOSE_SUBEXP;
1939 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_PLUS;
1943 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_QUESTION;
1947 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1948 token->type = OP_OPEN_DUP_NUM;
1951 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1952 token->type = OP_CLOSE_DUP_NUM;
1960 token->type = CHARACTER;
1961 #ifdef RE_ENABLE_I18N
1962 if (input->mb_cur_max > 1)
1964 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1965 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1969 token->word_char = IS_WORD_CHAR (token->opr.c);
1974 if (syntax & RE_NEWLINE_ALT)
1975 token->type = OP_ALT;
1978 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1979 token->type = OP_ALT;
1982 token->type = OP_DUP_ASTERISK;
1985 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1986 token->type = OP_DUP_PLUS;
1989 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1990 token->type = OP_DUP_QUESTION;
1993 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1994 token->type = OP_OPEN_DUP_NUM;
1997 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1998 token->type = OP_CLOSE_DUP_NUM;
2001 if (syntax & RE_NO_BK_PARENS)
2002 token->type = OP_OPEN_SUBEXP;
2005 if (syntax & RE_NO_BK_PARENS)
2006 token->type = OP_CLOSE_SUBEXP;
2009 token->type = OP_OPEN_BRACKET;
2012 token->type = OP_PERIOD;
2015 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2016 re_string_cur_idx (input) != 0)
2018 char prev = re_string_peek_byte (input, -1);
2019 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2022 token->type = ANCHOR;
2023 token->opr.ctx_type = LINE_FIRST;
2026 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2027 re_string_cur_idx (input) + 1 != re_string_length (input))
2030 re_string_skip_bytes (input, 1);
2031 peek_token (&next, input, syntax);
2032 re_string_skip_bytes (input, -1);
2033 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2036 token->type = ANCHOR;
2037 token->opr.ctx_type = LINE_LAST;
2045 /* Peek a token from INPUT, and return the length of the token.
2046 We must not use this function out of bracket expressions. */
2050 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2053 if (re_string_eoi (input))
2055 token->type = END_OF_RE;
2058 c = re_string_peek_byte (input, 0);
2061 #ifdef RE_ENABLE_I18N
2062 if (input->mb_cur_max > 1 &&
2063 !re_string_first_byte (input, re_string_cur_idx (input)))
2065 token->type = CHARACTER;
2068 #endif /* RE_ENABLE_I18N */
2070 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2071 && re_string_cur_idx (input) + 1 < re_string_length (input))
2073 /* In this case, '\' escape a character. */
2075 re_string_skip_bytes (input, 1);
2076 c2 = re_string_peek_byte (input, 0);
2078 token->type = CHARACTER;
2081 if (c == '[') /* '[' is a special char in a bracket exps. */
2085 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2086 c2 = re_string_peek_byte (input, 1);
2094 token->type = OP_OPEN_COLL_ELEM;
2097 token->type = OP_OPEN_EQUIV_CLASS;
2100 if (syntax & RE_CHAR_CLASSES)
2102 token->type = OP_OPEN_CHAR_CLASS;
2105 /* else fall through. */
2107 token->type = CHARACTER;
2117 token->type = OP_CHARSET_RANGE;
2120 token->type = OP_CLOSE_BRACKET;
2123 token->type = OP_NON_MATCH_LIST;
2126 token->type = CHARACTER;
2131 /* Functions for parser. */
2133 /* Entry point of the parser.
2134 Parse the regular expression REGEXP and return the structure tree.
2135 If an error occurs, ERR is set by error code, and return NULL.
2136 This function build the following tree, from regular expression <reg_exp>:
2142 CAT means concatenation.
2143 EOR means end of regular expression. */
2146 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2149 re_dfa_t *dfa = preg->buffer;
2150 bin_tree_t *tree, *eor, *root;
2151 re_token_t current_token;
2152 dfa->syntax = syntax;
2153 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2155 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2159 root = create_tree (dfa, tree, eor, CONCAT);
2162 if (BE (eor == NULL || root == NULL, 0))
2170 /* This function build the following tree, from regular expression
2171 <branch1>|<branch2>:
2177 ALT means alternative, which represents the operator '|'. */
2180 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2183 re_dfa_t *dfa = preg->buffer;
2184 bin_tree_t *tree, *branch = NULL;
2185 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 while (token->type == OP_ALT)
2191 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2192 if (token->type != OP_ALT && token->type != END_OF_RE
2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2196 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2201 tree = create_tree (dfa, tree, branch, OP_ALT);
2202 if (BE (tree == NULL, 0))
2211 /* This function build the following tree, from regular expression
2218 CAT means concatenation. */
2221 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2224 bin_tree_t *tree, *expr;
2225 re_dfa_t *dfa = preg->buffer;
2226 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2230 while (token->type != OP_ALT && token->type != END_OF_RE
2231 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2233 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2234 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2237 postorder (tree, free_tree, NULL);
2240 if (tree != NULL && expr != NULL)
2242 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2243 if (newtree == NULL)
2245 postorder (expr, free_tree, NULL);
2246 postorder (tree, free_tree, NULL);
2252 else if (tree == NULL)
2254 /* Otherwise expr == NULL, we don't need to create new tree. */
2259 /* This function build the following tree, from regular expression a*:
2266 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2267 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2269 re_dfa_t *dfa = preg->buffer;
2271 switch (token->type)
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2280 #ifdef RE_ENABLE_I18N
2281 if (dfa->mb_cur_max > 1)
2283 while (!re_string_eoi (regexp)
2284 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2286 bin_tree_t *mbc_remain;
2287 fetch_token (token, regexp, syntax);
2288 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290 if (BE (mbc_remain == NULL || tree == NULL, 0))
2299 case OP_OPEN_SUBEXP:
2300 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 case OP_OPEN_BRACKET:
2305 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2310 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2315 dfa->used_bkref_map |= 1 << token->opr.idx;
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2323 dfa->has_mb_node = 1;
2325 case OP_OPEN_DUP_NUM:
2326 if (syntax & RE_CONTEXT_INVALID_DUP)
2332 case OP_DUP_ASTERISK:
2334 case OP_DUP_QUESTION:
2335 if (syntax & RE_CONTEXT_INVALID_OPS)
2340 else if (syntax & RE_CONTEXT_INDEP_OPS)
2342 fetch_token (token, regexp, syntax);
2343 return parse_expression (regexp, preg, token, syntax, nest, err);
2345 /* else fall through */
2346 case OP_CLOSE_SUBEXP:
2347 if ((token->type == OP_CLOSE_SUBEXP) &&
2348 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2353 /* else fall through */
2354 case OP_CLOSE_DUP_NUM:
2355 /* We treat it as a normal character. */
2357 /* Then we can these characters as normal characters. */
2358 token->type = CHARACTER;
2359 /* mb_partial and word_char bits should be initialized already
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (BE (tree == NULL, 0))
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2413 tree = create_token_tree (dfa, NULL, NULL, token);
2414 if (BE (tree == NULL, 0))
2419 if (dfa->mb_cur_max > 1)
2420 dfa->has_mb_node = 1;
2424 tree = build_charclass_op (dfa, regexp->trans,
2425 (const unsigned char *) "alnum",
2426 (const unsigned char *) "_",
2427 token->type == OP_NOTWORD, err);
2428 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2433 tree = build_charclass_op (dfa, regexp->trans,
2434 (const unsigned char *) "space",
2435 (const unsigned char *) "",
2436 token->type == OP_NOTSPACE, err);
2437 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2447 /* Must not happen? */
2453 fetch_token (token, regexp, syntax);
2455 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2456 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2458 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2461 /* In BRE consecutive duplications are not allowed. */
2462 if ((syntax & RE_CONTEXT_INVALID_DUP)
2463 && (token->type == OP_DUP_ASTERISK
2464 || token->type == OP_OPEN_DUP_NUM))
2474 /* This function build the following tree, from regular expression
2482 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2483 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2485 re_dfa_t *dfa = preg->buffer;
2488 cur_nsub = preg->re_nsub++;
2490 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2492 /* The subexpression may be a null string. */
2493 if (token->type == OP_CLOSE_SUBEXP)
2497 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2498 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2501 postorder (tree, free_tree, NULL);
2504 if (BE (*err != REG_NOERROR, 0))
2508 if (cur_nsub <= '9' - '1')
2509 dfa->completed_bkref_map |= 1 << cur_nsub;
2511 tree = create_tree (dfa, tree, NULL, SUBEXP);
2512 if (BE (tree == NULL, 0))
2517 tree->token.opr.idx = cur_nsub;
2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2527 bin_tree_t *tree = NULL, *old_tree = NULL;
2528 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529 re_token_t start_token = *token;
2531 if (token->type == OP_OPEN_DUP_NUM)
2534 start = fetch_number (regexp, token, syntax);
2535 if (start == REG_MISSING)
2537 if (token->type == CHARACTER && token->opr.c == ',')
2538 start = 0; /* We treat "{,m}" as "{0,m}". */
2541 *err = REG_BADBR; /* <re>{} is invalid. */
2545 if (BE (start != REG_ERROR, 1))
2547 /* We treat "{n}" as "{n,n}". */
2548 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549 : ((token->type == CHARACTER && token->opr.c == ',')
2550 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2552 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2554 /* Invalid sequence. */
2555 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2557 if (token->type == END_OF_RE)
2565 /* If the syntax bit is set, rollback. */
2566 re_string_set_index (regexp, start_idx);
2567 *token = start_token;
2568 token->type = CHARACTER;
2569 /* mb_partial and word_char bits should be already initialized by
2574 if (BE ((end != REG_MISSING && start > end)
2575 || token->type != OP_CLOSE_DUP_NUM, 0))
2577 /* First number greater than second. */
2582 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2590 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2594 fetch_token (token, regexp, syntax);
2596 if (BE (elem == NULL, 0))
2598 if (BE (start == 0 && end == 0, 0))
2600 postorder (elem, free_tree, NULL);
2604 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2605 if (BE (start > 0, 0))
2608 for (i = 2; i <= start; ++i)
2610 elem = duplicate_tree (elem, dfa);
2611 tree = create_tree (dfa, tree, elem, CONCAT);
2612 if (BE (elem == NULL || tree == NULL, 0))
2613 goto parse_dup_op_espace;
2619 /* Duplicate ELEM before it is marked optional. */
2620 elem = duplicate_tree (elem, dfa);
2626 if (elem->token.type == SUBEXP)
2628 uintptr_t subidx = elem->token.opr.idx;
2629 postorder (elem, mark_opt_subexp, (void *) subidx);
2632 tree = create_tree (dfa, elem, NULL,
2633 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2634 if (BE (tree == NULL, 0))
2635 goto parse_dup_op_espace;
2637 /* From gnulib's "intprops.h":
2638 True if the arithmetic type T is signed. */
2639 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2641 /* This loop is actually executed only when end != REG_MISSING,
2642 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2643 already created the start+1-th copy. */
2644 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2645 for (i = start + 2; i <= end; ++i)
2647 elem = duplicate_tree (elem, dfa);
2648 tree = create_tree (dfa, tree, elem, CONCAT);
2649 if (BE (elem == NULL || tree == NULL, 0))
2650 goto parse_dup_op_espace;
2652 tree = create_tree (dfa, tree, NULL, OP_ALT);
2653 if (BE (tree == NULL, 0))
2654 goto parse_dup_op_espace;
2658 tree = create_tree (dfa, old_tree, tree, CONCAT);
2662 parse_dup_op_espace:
2667 /* Size of the names for collating symbol/equivalence_class/character_class.
2668 I'm not sure, but maybe enough. */
2669 #define BRACKET_NAME_BUF_SIZE 32
2672 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2673 Build the range expression which starts from START_ELEM, and ends
2674 at END_ELEM. The result are written to MBCSET and SBCSET.
2675 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2676 mbcset->range_ends, is a pointer argument since we may
2679 static reg_errcode_t
2681 # ifdef RE_ENABLE_I18N
2682 build_range_exp (const reg_syntax_t syntax,
2684 re_charset_t *mbcset,
2686 const bracket_elem_t *start_elem,
2687 const bracket_elem_t *end_elem)
2688 # else /* not RE_ENABLE_I18N */
2689 build_range_exp (const reg_syntax_t syntax,
2691 const bracket_elem_t *start_elem,
2692 const bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2695 unsigned int start_ch, end_ch;
2696 /* Equivalence Classes and Character Classes can't be a range start/end. */
2697 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2702 /* We can handle no multi character collating elements without libc
2704 if (BE ((start_elem->type == COLL_SYM
2705 && strlen ((char *) start_elem->opr.name) > 1)
2706 || (end_elem->type == COLL_SYM
2707 && strlen ((char *) end_elem->opr.name) > 1), 0))
2708 return REG_ECOLLATE;
2710 # ifdef RE_ENABLE_I18N
2715 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2717 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2718 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2720 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2721 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2723 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2724 ? __btowc (start_ch) : start_elem->opr.wch);
2725 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2726 ? __btowc (end_ch) : end_elem->opr.wch);
2727 if (start_wc == WEOF || end_wc == WEOF)
2728 return REG_ECOLLATE;
2729 cmp_buf[0] = start_wc;
2730 cmp_buf[4] = end_wc;
2732 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2733 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2736 /* Got valid collation sequence values, add them as a new entry.
2737 However, for !_LIBC we have no collation elements: if the
2738 character set is single byte, the single byte character set
2739 that we build below suffices. parse_bracket_exp passes
2740 no MBCSET if dfa->mb_cur_max == 1. */
2743 /* Check the space of the arrays. */
2744 if (BE (*range_alloc == mbcset->nranges, 0))
2746 /* There is not enough space, need realloc. */
2747 wchar_t *new_array_start, *new_array_end;
2750 /* +1 in case of mbcset->nranges is 0. */
2751 new_nranges = 2 * mbcset->nranges + 1;
2752 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753 are NULL if *range_alloc == 0. */
2754 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2756 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2759 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2762 mbcset->range_starts = new_array_start;
2763 mbcset->range_ends = new_array_end;
2764 *range_alloc = new_nranges;
2767 mbcset->range_starts[mbcset->nranges] = start_wc;
2768 mbcset->range_ends[mbcset->nranges++] = end_wc;
2771 /* Build the table for single byte characters. */
2772 for (wc = 0; wc < SBC_MAX; ++wc)
2775 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2776 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2777 bitset_set (sbcset, wc);
2780 # else /* not RE_ENABLE_I18N */
2783 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2784 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2786 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2787 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2789 if (start_ch > end_ch)
2791 /* Build the table for single byte characters. */
2792 for (ch = 0; ch < SBC_MAX; ++ch)
2793 if (start_ch <= ch && ch <= end_ch)
2794 bitset_set (sbcset, ch);
2796 # endif /* not RE_ENABLE_I18N */
2799 #endif /* not _LIBC */
2802 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2803 Build the collating element which is represented by NAME.
2804 The result are written to MBCSET and SBCSET.
2805 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2806 pointer argument since we may update it. */
2808 static reg_errcode_t
2810 # ifdef RE_ENABLE_I18N
2811 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2812 Idx *coll_sym_alloc, const unsigned char *name)
2813 # else /* not RE_ENABLE_I18N */
2814 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2815 # endif /* not RE_ENABLE_I18N */
2817 size_t name_len = strlen ((const char *) name);
2818 if (BE (name_len != 1, 0))
2819 return REG_ECOLLATE;
2822 bitset_set (sbcset, name[0]);
2826 #endif /* not _LIBC */
2828 /* This function parse bracket expression like "[abc]", "[a-c]",
2832 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2833 reg_syntax_t syntax, reg_errcode_t *err)
2836 const unsigned char *collseqmb;
2837 const char *collseqwc;
2840 const int32_t *symb_table;
2841 const unsigned char *extra;
2843 /* Local function for parse_bracket_exp used in _LIBC environment.
2844 Seek the collating symbol entry corresponding to NAME.
2845 Return the index of the symbol in the SYMB_TABLE. */
2848 __attribute ((always_inline))
2849 seek_collating_symbol_entry (name, name_len)
2850 const unsigned char *name;
2853 int32_t hash = elem_hash ((const char *) name, name_len);
2854 int32_t elem = hash % table_size;
2855 if (symb_table[2 * elem] != 0)
2857 int32_t second = hash % (table_size - 2) + 1;
2861 /* First compare the hashing value. */
2862 if (symb_table[2 * elem] == hash
2863 /* Compare the length of the name. */
2864 && name_len == extra[symb_table[2 * elem + 1]]
2865 /* Compare the name. */
2866 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2869 /* Yep, this is the entry. */
2876 while (symb_table[2 * elem] != 0);
2881 /* Local function for parse_bracket_exp used in _LIBC environment.
2882 Look up the collation sequence value of BR_ELEM.
2883 Return the value if succeeded, UINT_MAX otherwise. */
2885 auto inline unsigned int
2886 __attribute ((always_inline))
2887 lookup_collation_sequence_value (br_elem)
2888 bracket_elem_t *br_elem;
2890 if (br_elem->type == SB_CHAR)
2893 if (MB_CUR_MAX == 1)
2896 return collseqmb[br_elem->opr.ch];
2899 wint_t wc = __btowc (br_elem->opr.ch);
2900 return __collseq_table_lookup (collseqwc, wc);
2903 else if (br_elem->type == MB_CHAR)
2906 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908 else if (br_elem->type == COLL_SYM)
2910 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2914 elem = seek_collating_symbol_entry (br_elem->opr.name,
2916 if (symb_table[2 * elem] != 0)
2918 /* We found the entry. */
2919 idx = symb_table[2 * elem + 1];
2920 /* Skip the name of collating element name. */
2921 idx += 1 + extra[idx];
2922 /* Skip the byte sequence of the collating element. */
2923 idx += 1 + extra[idx];
2924 /* Adjust for the alignment. */
2925 idx = (idx + 3) & ~3;
2926 /* Skip the multibyte collation sequence value. */
2927 idx += sizeof (unsigned int);
2928 /* Skip the wide char sequence of the collating element. */
2929 idx += sizeof (unsigned int) *
2930 (1 + *(unsigned int *) (extra + idx));
2931 /* Return the collation sequence value. */
2932 return *(unsigned int *) (extra + idx);
2934 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2936 /* No valid character. Match it as a single byte
2938 return collseqmb[br_elem->opr.name[0]];
2941 else if (sym_name_len == 1)
2942 return collseqmb[br_elem->opr.name[0]];
2947 /* Local function for parse_bracket_exp used in _LIBC environment.
2948 Build the range expression which starts from START_ELEM, and ends
2949 at END_ELEM. The result are written to MBCSET and SBCSET.
2950 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2951 mbcset->range_ends, is a pointer argument since we may
2954 auto inline reg_errcode_t
2955 __attribute ((always_inline))
2956 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2957 re_charset_t *mbcset;
2960 bracket_elem_t *start_elem, *end_elem;
2963 uint32_t start_collseq;
2964 uint32_t end_collseq;
2966 /* Equivalence Classes and Character Classes can't be a range
2968 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2969 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2973 start_collseq = lookup_collation_sequence_value (start_elem);
2974 end_collseq = lookup_collation_sequence_value (end_elem);
2975 /* Check start/end collation sequence values. */
2976 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2977 return REG_ECOLLATE;
2978 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2981 /* Got valid collation sequence values, add them as a new entry.
2982 However, if we have no collation elements, and the character set
2983 is single byte, the single byte character set that we
2984 build below suffices. */
2985 if (nrules > 0 || dfa->mb_cur_max > 1)
2987 /* Check the space of the arrays. */
2988 if (BE (*range_alloc == mbcset->nranges, 0))
2990 /* There is not enough space, need realloc. */
2991 uint32_t *new_array_start;
2992 uint32_t *new_array_end;
2995 /* +1 in case of mbcset->nranges is 0. */
2996 new_nranges = 2 * mbcset->nranges + 1;
2997 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2999 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3002 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3005 mbcset->range_starts = new_array_start;
3006 mbcset->range_ends = new_array_end;
3007 *range_alloc = new_nranges;
3010 mbcset->range_starts[mbcset->nranges] = start_collseq;
3011 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3014 /* Build the table for single byte characters. */
3015 for (ch = 0; ch < SBC_MAX; ch++)
3017 uint32_t ch_collseq;
3019 if (MB_CUR_MAX == 1)
3022 ch_collseq = collseqmb[ch];
3024 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3025 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3026 bitset_set (sbcset, ch);
3031 /* Local function for parse_bracket_exp used in _LIBC environment.
3032 Build the collating element which is represented by NAME.
3033 The result are written to MBCSET and SBCSET.
3034 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3035 pointer argument since we may update it. */
3037 auto inline reg_errcode_t
3038 __attribute ((always_inline))
3039 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3040 re_charset_t *mbcset;
3041 Idx *coll_sym_alloc;
3043 const unsigned char *name;
3046 size_t name_len = strlen ((const char *) name);
3049 elem = seek_collating_symbol_entry (name, name_len);
3050 if (symb_table[2 * elem] != 0)
3052 /* We found the entry. */
3053 idx = symb_table[2 * elem + 1];
3054 /* Skip the name of collating element name. */
3055 idx += 1 + extra[idx];
3057 else if (symb_table[2 * elem] == 0 && name_len == 1)
3059 /* No valid character, treat it as a normal
3061 bitset_set (sbcset, name[0]);
3065 return REG_ECOLLATE;
3067 /* Got valid collation sequence, add it as a new entry. */
3068 /* Check the space of the arrays. */
3069 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3071 /* Not enough, realloc it. */
3072 /* +1 in case of mbcset->ncoll_syms is 0. */
3073 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3074 /* Use realloc since mbcset->coll_syms is NULL
3076 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3077 new_coll_sym_alloc);
3078 if (BE (new_coll_syms == NULL, 0))
3080 mbcset->coll_syms = new_coll_syms;
3081 *coll_sym_alloc = new_coll_sym_alloc;
3083 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3088 if (BE (name_len != 1, 0))
3089 return REG_ECOLLATE;
3092 bitset_set (sbcset, name[0]);
3099 re_token_t br_token;
3100 re_bitset_ptr_t sbcset;
3101 #ifdef RE_ENABLE_I18N
3102 re_charset_t *mbcset;
3103 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3104 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3105 #endif /* not RE_ENABLE_I18N */
3106 bool non_match = false;
3107 bin_tree_t *work_tree;
3109 bool first_round = true;
3111 collseqmb = (const unsigned char *)
3112 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3113 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3119 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3120 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3121 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3122 _NL_COLLATE_SYMB_TABLEMB);
3123 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3124 _NL_COLLATE_SYMB_EXTRAMB);
3127 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3128 #ifdef RE_ENABLE_I18N
3129 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3130 #endif /* RE_ENABLE_I18N */
3131 #ifdef RE_ENABLE_I18N
3132 if (BE (sbcset == NULL || mbcset == NULL, 0))
3134 if (BE (sbcset == NULL, 0))
3135 #endif /* RE_ENABLE_I18N */
3138 #ifdef RE_ENABLE_I18N
3145 token_len = peek_token_bracket (token, regexp, syntax);
3146 if (BE (token->type == END_OF_RE, 0))
3149 goto parse_bracket_exp_free_return;
3151 if (token->type == OP_NON_MATCH_LIST)
3153 #ifdef RE_ENABLE_I18N
3154 mbcset->non_match = 1;
3155 #endif /* not RE_ENABLE_I18N */
3157 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3158 bitset_set (sbcset, '\n');
3159 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3160 token_len = peek_token_bracket (token, regexp, syntax);
3161 if (BE (token->type == END_OF_RE, 0))
3164 goto parse_bracket_exp_free_return;
3168 /* We treat the first ']' as a normal character. */
3169 if (token->type == OP_CLOSE_BRACKET)
3170 token->type = CHARACTER;
3174 bracket_elem_t start_elem, end_elem;
3175 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3176 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3179 bool is_range_exp = false;
3182 start_elem.opr.name = start_name_buf;
3183 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3184 syntax, first_round);
3185 if (BE (ret != REG_NOERROR, 0))
3188 goto parse_bracket_exp_free_return;
3190 first_round = false;
3192 /* Get information about the next token. We need it in any case. */
3193 token_len = peek_token_bracket (token, regexp, syntax);
3195 /* Do not check for ranges if we know they are not allowed. */
3196 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3198 if (BE (token->type == END_OF_RE, 0))
3201 goto parse_bracket_exp_free_return;
3203 if (token->type == OP_CHARSET_RANGE)
3205 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3206 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3207 if (BE (token2.type == END_OF_RE, 0))
3210 goto parse_bracket_exp_free_return;
3212 if (token2.type == OP_CLOSE_BRACKET)
3214 /* We treat the last '-' as a normal character. */
3215 re_string_skip_bytes (regexp, -token_len);
3216 token->type = CHARACTER;
3219 is_range_exp = true;
3223 if (is_range_exp == true)
3225 end_elem.opr.name = end_name_buf;
3226 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3228 if (BE (ret != REG_NOERROR, 0))
3231 goto parse_bracket_exp_free_return;
3234 token_len = peek_token_bracket (token, regexp, syntax);
3237 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3238 &start_elem, &end_elem);
3240 # ifdef RE_ENABLE_I18N
3241 *err = build_range_exp (syntax, sbcset,
3242 dfa->mb_cur_max > 1 ? mbcset : NULL,
3243 &range_alloc, &start_elem, &end_elem);
3245 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3247 #endif /* RE_ENABLE_I18N */
3248 if (BE (*err != REG_NOERROR, 0))
3249 goto parse_bracket_exp_free_return;
3253 switch (start_elem.type)
3256 bitset_set (sbcset, start_elem.opr.ch);
3258 #ifdef RE_ENABLE_I18N
3260 /* Check whether the array has enough space. */
3261 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3263 wchar_t *new_mbchars;
3264 /* Not enough, realloc it. */
3265 /* +1 in case of mbcset->nmbchars is 0. */
3266 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3267 /* Use realloc since array is NULL if *alloc == 0. */
3268 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3270 if (BE (new_mbchars == NULL, 0))
3271 goto parse_bracket_exp_espace;
3272 mbcset->mbchars = new_mbchars;
3274 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3276 #endif /* RE_ENABLE_I18N */
3278 *err = build_equiv_class (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280 mbcset, &equiv_class_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282 start_elem.opr.name);
3283 if (BE (*err != REG_NOERROR, 0))
3284 goto parse_bracket_exp_free_return;
3287 *err = build_collating_symbol (sbcset,
3288 #ifdef RE_ENABLE_I18N
3289 mbcset, &coll_sym_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291 start_elem.opr.name);
3292 if (BE (*err != REG_NOERROR, 0))
3293 goto parse_bracket_exp_free_return;
3296 *err = build_charclass (regexp->trans, sbcset,
3297 #ifdef RE_ENABLE_I18N
3298 mbcset, &char_class_alloc,
3299 #endif /* RE_ENABLE_I18N */
3300 start_elem.opr.name, syntax);
3301 if (BE (*err != REG_NOERROR, 0))
3302 goto parse_bracket_exp_free_return;
3309 if (BE (token->type == END_OF_RE, 0))
3312 goto parse_bracket_exp_free_return;
3314 if (token->type == OP_CLOSE_BRACKET)
3318 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3320 /* If it is non-matching list. */
3322 bitset_not (sbcset);
3324 #ifdef RE_ENABLE_I18N
3325 /* Ensure only single byte characters are set. */
3326 if (dfa->mb_cur_max > 1)
3327 bitset_mask (sbcset, dfa->sb_char);
3329 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3330 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3331 || mbcset->non_match)))
3333 bin_tree_t *mbc_tree;
3335 /* Build a tree for complex bracket. */
3336 dfa->has_mb_node = 1;
3337 br_token.type = COMPLEX_BRACKET;
3338 br_token.opr.mbcset = mbcset;
3339 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3340 if (BE (mbc_tree == NULL, 0))
3341 goto parse_bracket_exp_espace;
3342 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3343 if (sbcset[sbc_idx])
3345 /* If there are no bits set in sbcset, there is no point
3346 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3347 if (sbc_idx < BITSET_WORDS)
3349 /* Build a tree for simple bracket. */
3350 br_token.type = SIMPLE_BRACKET;
3351 br_token.opr.sbcset = sbcset;
3352 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3353 if (BE (work_tree == NULL, 0))
3354 goto parse_bracket_exp_espace;
3356 /* Then join them by ALT node. */
3357 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3358 if (BE (work_tree == NULL, 0))
3359 goto parse_bracket_exp_espace;
3364 work_tree = mbc_tree;
3368 #endif /* not RE_ENABLE_I18N */
3370 #ifdef RE_ENABLE_I18N
3371 free_charset (mbcset);
3373 /* Build a tree for simple bracket. */
3374 br_token.type = SIMPLE_BRACKET;
3375 br_token.opr.sbcset = sbcset;
3376 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3377 if (BE (work_tree == NULL, 0))
3378 goto parse_bracket_exp_espace;
3382 parse_bracket_exp_espace:
3384 parse_bracket_exp_free_return:
3386 #ifdef RE_ENABLE_I18N
3387 free_charset (mbcset);
3388 #endif /* RE_ENABLE_I18N */
3392 /* Parse an element in the bracket expression. */
3394 static reg_errcode_t
3395 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3396 re_token_t *token, int token_len, re_dfa_t *dfa,
3397 reg_syntax_t syntax, bool accept_hyphen)
3399 #ifdef RE_ENABLE_I18N
3401 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3402 if (cur_char_size > 1)
3404 elem->type = MB_CHAR;
3405 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3406 re_string_skip_bytes (regexp, cur_char_size);
3409 #endif /* RE_ENABLE_I18N */
3410 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3411 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3412 || token->type == OP_OPEN_EQUIV_CLASS)
3413 return parse_bracket_symbol (elem, regexp, token);
3414 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3416 /* A '-' must only appear as anything but a range indicator before
3417 the closing bracket. Everything else is an error. */
3419 (void) peek_token_bracket (&token2, regexp, syntax);
3420 if (token2.type != OP_CLOSE_BRACKET)
3421 /* The actual error value is not standardized since this whole
3422 case is undefined. But ERANGE makes good sense. */
3425 elem->type = SB_CHAR;
3426 elem->opr.ch = token->opr.c;
3430 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3431 such as [:<character_class>:], [.<collating_element>.], and
3432 [=<equivalent_class>=]. */
3434 static reg_errcode_t
3435 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3438 unsigned char ch, delim = token->opr.c;
3440 if (re_string_eoi(regexp))
3444 if (i >= BRACKET_NAME_BUF_SIZE)
3446 if (token->type == OP_OPEN_CHAR_CLASS)
3447 ch = re_string_fetch_byte_case (regexp);
3449 ch = re_string_fetch_byte (regexp);
3450 if (re_string_eoi(regexp))
3452 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3454 elem->opr.name[i] = ch;
3456 re_string_skip_bytes (regexp, 1);
3457 elem->opr.name[i] = '\0';
3458 switch (token->type)
3460 case OP_OPEN_COLL_ELEM:
3461 elem->type = COLL_SYM;
3463 case OP_OPEN_EQUIV_CLASS:
3464 elem->type = EQUIV_CLASS;
3466 case OP_OPEN_CHAR_CLASS:
3467 elem->type = CHAR_CLASS;
3475 /* Helper function for parse_bracket_exp.
3476 Build the equivalence class which is represented by NAME.
3477 The result are written to MBCSET and SBCSET.
3478 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3479 is a pointer argument since we may update it. */
3481 static reg_errcode_t
3482 #ifdef RE_ENABLE_I18N
3483 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3484 Idx *equiv_class_alloc, const unsigned char *name)
3485 #else /* not RE_ENABLE_I18N */
3486 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3487 #endif /* not RE_ENABLE_I18N */
3490 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3493 const int32_t *table, *indirect;
3494 const unsigned char *weights, *extra, *cp;
3495 unsigned char char_buf[2];
3499 /* This #include defines a local function! */
3500 # include <locale/weight.h>
3501 /* Calculate the index for equivalence class. */
3503 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3504 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3505 _NL_COLLATE_WEIGHTMB);
3506 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3507 _NL_COLLATE_EXTRAMB);
3508 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_INDIRECTMB);
3510 idx1 = findidx (&cp, -1);
3511 if (BE (idx1 == 0 || *cp != '\0', 0))
3512 /* This isn't a valid character. */
3513 return REG_ECOLLATE;
3515 /* Build single byte matching table for this equivalence class. */
3516 len = weights[idx1 & 0xffffff];
3517 for (ch = 0; ch < SBC_MAX; ++ch)
3521 idx2 = findidx (&cp, 1);
3526 /* This isn't a valid character. */
3528 /* Compare only if the length matches and the collation rule
3529 index is the same. */
3530 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3534 while (cnt <= len &&
3535 weights[(idx1 & 0xffffff) + 1 + cnt]
3536 == weights[(idx2 & 0xffffff) + 1 + cnt])
3540 bitset_set (sbcset, ch);
3543 /* Check whether the array has enough space. */
3544 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3546 /* Not enough, realloc it. */
3547 /* +1 in case of mbcset->nequiv_classes is 0. */
3548 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3549 /* Use realloc since the array is NULL if *alloc == 0. */
3550 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3552 new_equiv_class_alloc);
3553 if (BE (new_equiv_classes == NULL, 0))
3555 mbcset->equiv_classes = new_equiv_classes;
3556 *equiv_class_alloc = new_equiv_class_alloc;
3558 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563 if (BE (strlen ((const char *) name) != 1, 0))
3564 return REG_ECOLLATE;
3565 bitset_set (sbcset, *name);
3570 /* Helper function for parse_bracket_exp.
3571 Build the character class which is represented by NAME.
3572 The result are written to MBCSET and SBCSET.
3573 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3574 is a pointer argument since we may update it. */
3576 static reg_errcode_t
3577 #ifdef RE_ENABLE_I18N
3578 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579 re_charset_t *mbcset, Idx *char_class_alloc,
3580 const unsigned char *class_name, reg_syntax_t syntax)
3581 #else /* not RE_ENABLE_I18N */
3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 const unsigned char *class_name, reg_syntax_t syntax)
3584 #endif /* not RE_ENABLE_I18N */
3587 const char *name = (const char *) class_name;
3589 /* In case of REG_ICASE "upper" and "lower" match the both of
3590 upper and lower cases. */
3591 if ((syntax & RE_ICASE)
3592 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3595 #ifdef RE_ENABLE_I18N
3596 /* Check the space of the arrays. */
3597 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3599 /* Not enough, realloc it. */
3600 /* +1 in case of mbcset->nchar_classes is 0. */
3601 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3602 /* Use realloc since array is NULL if *alloc == 0. */
3603 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3604 new_char_class_alloc);
3605 if (BE (new_char_classes == NULL, 0))
3607 mbcset->char_classes = new_char_classes;
3608 *char_class_alloc = new_char_class_alloc;
3610 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3611 #endif /* RE_ENABLE_I18N */
3613 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3615 if (BE (trans != NULL, 0)) \
3617 for (i = 0; i < SBC_MAX; ++i) \
3618 if (ctype_func (i)) \
3619 bitset_set (sbcset, trans[i]); \
3623 for (i = 0; i < SBC_MAX; ++i) \
3624 if (ctype_func (i)) \
3625 bitset_set (sbcset, i); \
3629 if (strcmp (name, "alnum") == 0)
3630 BUILD_CHARCLASS_LOOP (isalnum);
3631 else if (strcmp (name, "cntrl") == 0)
3632 BUILD_CHARCLASS_LOOP (iscntrl);
3633 else if (strcmp (name, "lower") == 0)
3634 BUILD_CHARCLASS_LOOP (islower);
3635 else if (strcmp (name, "space") == 0)
3636 BUILD_CHARCLASS_LOOP (isspace);
3637 else if (strcmp (name, "alpha") == 0)
3638 BUILD_CHARCLASS_LOOP (isalpha);
3639 else if (strcmp (name, "digit") == 0)
3640 BUILD_CHARCLASS_LOOP (isdigit);
3641 else if (strcmp (name, "print") == 0)
3642 BUILD_CHARCLASS_LOOP (isprint);
3643 else if (strcmp (name, "upper") == 0)
3644 BUILD_CHARCLASS_LOOP (isupper);
3645 else if (strcmp (name, "blank") == 0)
3646 BUILD_CHARCLASS_LOOP (isblank);
3647 else if (strcmp (name, "graph") == 0)
3648 BUILD_CHARCLASS_LOOP (isgraph);
3649 else if (strcmp (name, "punct") == 0)
3650 BUILD_CHARCLASS_LOOP (ispunct);
3651 else if (strcmp (name, "xdigit") == 0)
3652 BUILD_CHARCLASS_LOOP (isxdigit);
3660 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3661 const unsigned char *class_name,
3662 const unsigned char *extra, bool non_match,
3665 re_bitset_ptr_t sbcset;
3666 #ifdef RE_ENABLE_I18N
3667 re_charset_t *mbcset;
3669 #endif /* not RE_ENABLE_I18N */
3671 re_token_t br_token;
3674 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3675 #ifdef RE_ENABLE_I18N
3676 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3677 #endif /* RE_ENABLE_I18N */
3679 #ifdef RE_ENABLE_I18N
3680 if (BE (sbcset == NULL || mbcset == NULL, 0))
3681 #else /* not RE_ENABLE_I18N */
3682 if (BE (sbcset == NULL, 0))
3683 #endif /* not RE_ENABLE_I18N */
3691 #ifdef RE_ENABLE_I18N
3692 mbcset->non_match = 1;
3693 #endif /* not RE_ENABLE_I18N */
3696 /* We don't care the syntax in this case. */
3697 ret = build_charclass (trans, sbcset,
3698 #ifdef RE_ENABLE_I18N
3700 #endif /* RE_ENABLE_I18N */
3703 if (BE (ret != REG_NOERROR, 0))
3706 #ifdef RE_ENABLE_I18N
3707 free_charset (mbcset);
3708 #endif /* RE_ENABLE_I18N */
3712 /* \w match '_' also. */
3713 for (; *extra; extra++)
3714 bitset_set (sbcset, *extra);
3716 /* If it is non-matching list. */
3718 bitset_not (sbcset);
3720 #ifdef RE_ENABLE_I18N
3721 /* Ensure only single byte characters are set. */
3722 if (dfa->mb_cur_max > 1)
3723 bitset_mask (sbcset, dfa->sb_char);
3726 /* Build a tree for simple bracket. */
3727 br_token.type = SIMPLE_BRACKET;
3728 br_token.opr.sbcset = sbcset;
3729 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3730 if (BE (tree == NULL, 0))
3731 goto build_word_op_espace;
3733 #ifdef RE_ENABLE_I18N
3734 if (dfa->mb_cur_max > 1)
3736 bin_tree_t *mbc_tree;
3737 /* Build a tree for complex bracket. */
3738 br_token.type = COMPLEX_BRACKET;
3739 br_token.opr.mbcset = mbcset;
3740 dfa->has_mb_node = 1;
3741 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3742 if (BE (mbc_tree == NULL, 0))
3743 goto build_word_op_espace;
3744 /* Then join them by ALT node. */
3745 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3746 if (BE (mbc_tree != NULL, 1))
3751 free_charset (mbcset);
3754 #else /* not RE_ENABLE_I18N */
3756 #endif /* not RE_ENABLE_I18N */
3758 build_word_op_espace:
3760 #ifdef RE_ENABLE_I18N
3761 free_charset (mbcset);
3762 #endif /* RE_ENABLE_I18N */
3767 /* This is intended for the expressions like "a{1,3}".
3768 Fetch a number from 'input', and return the number.
3769 Return REG_MISSING if the number field is empty like "{,1}".
3770 Return RE_DUP_MAX + 1 if the number field is too large.
3771 Return REG_ERROR if an error occurred. */
3774 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3776 Idx num = REG_MISSING;
3780 fetch_token (token, input, syntax);
3782 if (BE (token->type == END_OF_RE, 0))
3784 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3786 num = ((token->type != CHARACTER || c < '0' || '9' < c
3787 || num == REG_ERROR)
3789 : num == REG_MISSING
3791 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796 #ifdef RE_ENABLE_I18N
3798 free_charset (re_charset_t *cset)
3800 re_free (cset->mbchars);
3802 re_free (cset->coll_syms);
3803 re_free (cset->equiv_classes);
3804 re_free (cset->range_starts);
3805 re_free (cset->range_ends);
3807 re_free (cset->char_classes);
3810 #endif /* RE_ENABLE_I18N */
3812 /* Functions for binary tree operation. */
3814 /* Create a tree node. */
3817 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3818 re_token_type_t type)
3822 return create_token_tree (dfa, left, right, &t);
3826 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3827 const re_token_t *token)
3830 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3832 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3834 if (storage == NULL)
3836 storage->next = dfa->str_tree_storage;
3837 dfa->str_tree_storage = storage;
3838 dfa->str_tree_storage_idx = 0;
3840 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3842 tree->parent = NULL;
3844 tree->right = right;
3845 tree->token = *token;
3846 tree->token.duplicated = 0;
3847 tree->token.opt_subexp = 0;
3850 tree->node_idx = REG_MISSING;
3853 left->parent = tree;
3855 right->parent = tree;
3859 /* Mark the tree SRC as an optional subexpression.
3860 To be called from preorder or postorder. */
3862 static reg_errcode_t
3863 mark_opt_subexp (void *extra, bin_tree_t *node)
3865 Idx idx = (uintptr_t) extra;
3866 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3867 node->token.opt_subexp = 1;
3872 /* Free the allocated memory inside NODE. */
3875 free_token (re_token_t *node)
3877 #ifdef RE_ENABLE_I18N
3878 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3879 free_charset (node->opr.mbcset);
3881 #endif /* RE_ENABLE_I18N */
3882 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3883 re_free (node->opr.sbcset);
3886 /* Worker function for tree walking. Free the allocated memory inside NODE
3887 and its children. */
3889 static reg_errcode_t
3890 free_tree (void *extra, bin_tree_t *node)
3892 free_token (&node->token);
3897 /* Duplicate the node SRC, and return new node. This is a preorder
3898 visit similar to the one implemented by the generic visitor, but
3899 we need more infrastructure to maintain two parallel trees --- so,
3900 it's easier to duplicate. */
3903 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3905 const bin_tree_t *node;
3906 bin_tree_t *dup_root;
3907 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3909 for (node = root; ; )
3911 /* Create a new tree and link it back to the current parent. */
3912 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3915 (*p_new)->parent = dup_node;
3916 (*p_new)->token.duplicated = 1;
3919 /* Go to the left node, or up and to the right. */
3923 p_new = &dup_node->left;
3927 const bin_tree_t *prev = NULL;
3928 while (node->right == prev || node->right == NULL)
3931 node = node->parent;
3932 dup_node = dup_node->parent;
3937 p_new = &dup_node->right;