1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 static void optimize_utf8 (re_dfa_t *dfa);
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
37 static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_token_t *token, int token_len,
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
98 #else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 /* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
127 static const char __re_error_msgid[] =
129 #define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 static const size_t __re_error_msgid_idx[] =
202 /* Entry points for GNU code. */
204 /* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
213 re_compile_pattern (pattern, length, bufp)
216 struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240 weak_alias (__re_compile_pattern, re_compile_pattern)
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
259 re_set_syntax (syntax)
262 reg_syntax_t ret = re_syntax_options;
264 re_syntax_options = syntax;
268 weak_alias (__re_set_syntax, re_set_syntax)
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
275 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
276 char *fastmap = bufp->fastmap;
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[tolower (ch)] = 1;
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
317 if (type == CHARACTER)
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 unsigned char buf[MB_LEN_MAX];
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 re_set_fastmap (fastmap, false, buf[0]);
343 else if (type == SIMPLE_BRACKET)
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 /* See if we have to try all bytes which start multiple collation
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
375 re_set_fastmap (fastmap, icase, i);
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
386 || cset->nequiv_classes
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
527 weak_alias (__regcomp, regcomp)
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode, preg, errbuf, errbuf_size)
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
540 #else /* size_t might promote */
542 regerror (int errcode, const regex_t *_Restrict_ preg,
543 char *_Restrict_ errbuf, size_t errbuf_size)
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 msg_size = strlen (msg) + 1; /* Includes the null. */
562 if (BE (errbuf_size != 0, 1))
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
570 memcpy (errbuf, msg, cpy_size);
576 weak_alias (__regerror, regerror)
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset_t utf8_sb_map =
587 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
610 free_dfa_content (re_dfa_t *dfa)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
638 re_dfastate_t *state = entry->array[j];
641 re_free (entry->array);
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
648 re_free (dfa->subexp_map);
650 re_free (dfa->re_str);
657 /* Free dynamically allocated space used by PREG. */
663 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
672 re_free (preg->translate);
673 preg->translate = NULL;
676 weak_alias (__regfree, regfree)
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
682 #if defined _REGEX_RE_COMP || defined _LIBC
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
707 if (re_comp_buf.buffer)
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
716 if (re_comp_buf.fastmap == NULL)
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740 libc_freeres_fn (free_mem)
742 __regfree (&re_comp_buf);
746 #endif /* _REGEX_RE_COMP */
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
756 reg_errcode_t err = REG_NOERROR;
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
769 /* Initialize the dfa. */
770 dfa = (re_dfa_t *) preg->buffer;
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
780 preg->allocated = sizeof (re_dfa_t);
781 preg->buffer = (unsigned char *) dfa;
783 preg->used = sizeof (re_dfa_t);
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
788 free_dfa_content (dfa);
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
799 __libc_lock_init (dfa->lock);
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
814 /* Parse the regular expression, and build a structure tree. */
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
838 if (BE (err != REG_NOERROR, 0))
840 free_dfa_content (dfa);
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 __re_size_t table_size;
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 size_t max_i18n_object_size = 0;
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
870 memset (dfa, '\0', sizeof (re_dfa_t));
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
893 dfa->mb_cur_max = MB_CUR_MAX;
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
901 codeset_name = nl_langinfo (CODESET);
902 if (strcasecmp (codeset_name, "UTF-8") == 0
903 || strcasecmp (codeset_name, "UTF8") == 0)
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
911 #ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 if (BE (dfa->sb_char == NULL, 0))
924 /* Set the bits corresponding to single byte chars. */
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
928 wint_t wch = __btowc (ch);
930 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
932 if (isascii (ch) && wch != ch)
933 dfa->map_notascii = 1;
940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
945 /* Initialize WORD_CHAR table, which indicate which character is
946 "word". In this case "word" means that it is the word construction
947 character used by some operators like "\<", "\>", etc. */
951 init_word_char (re_dfa_t *dfa)
953 dfa->word_ops_used = 1;
957 if (BE (dfa->map_notascii == 0, 1))
959 if (BITSET_WORD_BITS == 64)
961 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
962 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
965 else if (BITSET_WORD_BITS == 32)
967 dfa->word_char[0] = UINT32_C (0x00000000);
968 dfa->word_char[1] = UINT32_C (0x03ff0000);
969 dfa->word_char[2] = UINT32_C (0x87fffffe);
970 dfa->word_char[3] = UINT32_C (0x07fffffe);
977 if (BE (dfa->is_utf8, 1))
979 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
985 for (; i < BITSET_WORDS; ++i)
986 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
987 if (isalnum (ch) || ch == '_')
988 dfa->word_char[i] |= (bitset_word_t) 1 << j;
991 /* Free the work area which are only used while compiling. */
994 free_workarea_compile (regex_t *preg)
996 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
997 bin_tree_storage_t *storage, *next;
998 for (storage = dfa->str_tree_storage; storage; storage = next)
1000 next = storage->next;
1003 dfa->str_tree_storage = NULL;
1004 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1005 dfa->str_tree = NULL;
1006 re_free (dfa->org_indices);
1007 dfa->org_indices = NULL;
1010 /* Create initial states for all contexts. */
1012 static reg_errcode_t
1013 create_initial_state (re_dfa_t *dfa)
1017 re_node_set init_nodes;
1019 /* Initial states have the epsilon closure of the node which is
1020 the first node of the regular expression. */
1021 first = dfa->str_tree->first->node_idx;
1022 dfa->init_node = first;
1023 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1024 if (BE (err != REG_NOERROR, 0))
1027 /* The back-references which are in initial states can epsilon transit,
1028 since in this case all of the subexpressions can be null.
1029 Then we add epsilon closures of the nodes which are the next nodes of
1030 the back-references. */
1031 if (dfa->nbackref > 0)
1032 for (i = 0; i < init_nodes.nelem; ++i)
1034 Idx node_idx = init_nodes.elems[i];
1035 re_token_type_t type = dfa->nodes[node_idx].type;
1038 if (type != OP_BACK_REF)
1040 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1042 re_token_t *clexp_node;
1043 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1044 if (clexp_node->type == OP_CLOSE_SUBEXP
1045 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1048 if (clexp_idx == init_nodes.nelem)
1051 if (type == OP_BACK_REF)
1053 Idx dest_idx = dfa->edests[node_idx].elems[0];
1054 if (!re_node_set_contains (&init_nodes, dest_idx))
1056 reg_errcode_t merge_err
1057 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1058 if (merge_err != REG_NOERROR)
1065 /* It must be the first time to invoke acquire_state. */
1066 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1067 /* We don't check ERR here, since the initial state must not be NULL. */
1068 if (BE (dfa->init_state == NULL, 0))
1070 if (dfa->init_state->has_constraint)
1072 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1074 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1076 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1080 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1081 || dfa->init_state_begbuf == NULL, 0))
1085 dfa->init_state_word = dfa->init_state_nl
1086 = dfa->init_state_begbuf = dfa->init_state;
1088 re_node_set_free (&init_nodes);
1092 #ifdef RE_ENABLE_I18N
1093 /* If it is possible to do searching in single byte encoding instead of UTF-8
1094 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1095 DFA nodes where needed. */
1098 optimize_utf8 (re_dfa_t *dfa)
1102 bool mb_chars = false;
1103 bool has_period = false;
1105 for (node = 0; node < dfa->nodes_len; ++node)
1106 switch (dfa->nodes[node].type)
1109 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1113 switch (dfa->nodes[node].opr.ctx_type)
1121 /* Word anchors etc. cannot be handled. It's okay to test
1122 opr.ctx_type since constraints (for all DFA nodes) are
1123 created by ORing one or more opr.ctx_type values. */
1133 case OP_DUP_ASTERISK:
1134 case OP_OPEN_SUBEXP:
1135 case OP_CLOSE_SUBEXP:
1137 case COMPLEX_BRACKET:
1139 case SIMPLE_BRACKET:
1140 /* Just double check. */
1142 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1144 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1145 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1147 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1157 if (mb_chars || has_period)
1158 for (node = 0; node < dfa->nodes_len; ++node)
1160 if (dfa->nodes[node].type == CHARACTER
1161 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1162 dfa->nodes[node].mb_partial = 0;
1163 else if (dfa->nodes[node].type == OP_PERIOD)
1164 dfa->nodes[node].type = OP_UTF8_PERIOD;
1167 /* The search can be in single byte locale. */
1168 dfa->mb_cur_max = 1;
1170 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1174 /* Analyze the structure tree, and calculate "first", "next", "edest",
1175 "eclosure", and "inveclosure". */
1177 static reg_errcode_t
1178 analyze (regex_t *preg)
1180 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1183 /* Allocate arrays. */
1184 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1185 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1186 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1187 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1188 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1189 || dfa->eclosures == NULL, 0))
1192 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1193 if (dfa->subexp_map != NULL)
1196 for (i = 0; i < preg->re_nsub; i++)
1197 dfa->subexp_map[i] = i;
1198 preorder (dfa->str_tree, optimize_subexps, dfa);
1199 for (i = 0; i < preg->re_nsub; i++)
1200 if (dfa->subexp_map[i] != i)
1202 if (i == preg->re_nsub)
1204 free (dfa->subexp_map);
1205 dfa->subexp_map = NULL;
1209 ret = postorder (dfa->str_tree, lower_subexps, preg);
1210 if (BE (ret != REG_NOERROR, 0))
1212 ret = postorder (dfa->str_tree, calc_first, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1215 preorder (dfa->str_tree, calc_next, dfa);
1216 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1217 if (BE (ret != REG_NOERROR, 0))
1219 ret = calc_eclosure (dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1223 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1224 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1225 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1228 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1229 if (BE (dfa->inveclosures == NULL, 0))
1231 ret = calc_inveclosure (dfa);
1237 /* Our parse trees are very unbalanced, so we cannot use a stack to
1238 implement parse tree visits. Instead, we use parent pointers and
1239 some hairy code in these two functions. */
1240 static reg_errcode_t
1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244 bin_tree_t *node, *prev;
1246 for (node = root; ; )
1248 /* Descend down the tree, preferably to the left (or to the right
1249 if that's the only child). */
1250 while (node->left || node->right)
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1261 if (node->parent == NULL)
1264 node = node->parent;
1266 /* Go up while we have a node that is reached from the right. */
1267 while (node->right == prev || node->right == NULL);
1272 static reg_errcode_t
1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1278 for (node = root; ; )
1280 reg_errcode_t err = fn (extra, node);
1281 if (BE (err != REG_NOERROR, 0))
1284 /* Go to the left node, or up and to the right. */
1289 bin_tree_t *prev = NULL;
1290 while (node->right == prev || node->right == NULL)
1293 node = node->parent;
1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1303 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1304 backreferences as well. Requires a preorder visit. */
1305 static reg_errcode_t
1306 optimize_subexps (void *extra, bin_tree_t *node)
1308 re_dfa_t *dfa = (re_dfa_t *) extra;
1310 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1312 int idx = node->token.opr.idx;
1313 node->token.opr.idx = dfa->subexp_map[idx];
1314 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1317 else if (node->token.type == SUBEXP
1318 && node->left && node->left->token.type == SUBEXP)
1320 Idx other_idx = node->left->token.opr.idx;
1322 node->left = node->left->left;
1324 node->left->parent = node;
1326 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1327 if (other_idx < BITSET_WORD_BITS)
1328 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1335 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1336 static reg_errcode_t
1337 lower_subexps (void *extra, bin_tree_t *node)
1339 regex_t *preg = (regex_t *) extra;
1340 reg_errcode_t err = REG_NOERROR;
1342 if (node->left && node->left->token.type == SUBEXP)
1344 node->left = lower_subexp (&err, preg, node->left);
1346 node->left->parent = node;
1348 if (node->right && node->right->token.type == SUBEXP)
1350 node->right = lower_subexp (&err, preg, node->right);
1352 node->right->parent = node;
1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1361 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1362 bin_tree_t *body = node->left;
1363 bin_tree_t *op, *cls, *tree1, *tree;
1366 /* We do not optimize empty subexpressions, because otherwise we may
1367 have bad CONCAT nodes with NULL children. This is obviously not
1368 very common, so we do not lose much. An example that triggers
1369 this case is the sed "script" /\(\)/x. */
1370 && node->left != NULL
1371 && (node->token.opr.idx >= BITSET_WORD_BITS
1372 || !(dfa->used_bkref_map
1373 & ((bitset_word_t) 1 << node->token.opr.idx))))
1376 /* Convert the SUBEXP node to the concatenation of an
1377 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1378 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1379 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1380 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1381 tree = create_tree (dfa, op, tree1, CONCAT);
1382 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1388 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1389 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1394 nodes. Requires a postorder visit. */
1395 static reg_errcode_t
1396 calc_first (void *extra, bin_tree_t *node)
1398 re_dfa_t *dfa = (re_dfa_t *) extra;
1399 if (node->token.type == CONCAT)
1401 node->first = node->left->first;
1402 node->node_idx = node->left->node_idx;
1407 node->node_idx = re_dfa_add_node (dfa, node->token);
1408 if (BE (node->node_idx == REG_MISSING, 0))
1410 if (node->token.type == ANCHOR)
1411 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1416 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1417 static reg_errcode_t
1418 calc_next (void *extra, bin_tree_t *node)
1420 switch (node->token.type)
1422 case OP_DUP_ASTERISK:
1423 node->left->next = node;
1426 node->left->next = node->right->first;
1427 node->right->next = node->next;
1431 node->left->next = node->next;
1433 node->right->next = node->next;
1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1440 static reg_errcode_t
1441 link_nfa_nodes (void *extra, bin_tree_t *node)
1443 re_dfa_t *dfa = (re_dfa_t *) extra;
1444 Idx idx = node->node_idx;
1445 reg_errcode_t err = REG_NOERROR;
1447 switch (node->token.type)
1453 assert (node->next == NULL);
1456 case OP_DUP_ASTERISK:
1460 dfa->has_plural_match = 1;
1461 if (node->left != NULL)
1462 left = node->left->first->node_idx;
1464 left = node->next->node_idx;
1465 if (node->right != NULL)
1466 right = node->right->first->node_idx;
1468 right = node->next->node_idx;
1469 assert (REG_VALID_INDEX (left));
1470 assert (REG_VALID_INDEX (right));
1471 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1476 case OP_OPEN_SUBEXP:
1477 case OP_CLOSE_SUBEXP:
1478 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1482 dfa->nexts[idx] = node->next->node_idx;
1483 if (node->token.type == OP_BACK_REF)
1484 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1488 assert (!IS_EPSILON_NODE (node->token.type));
1489 dfa->nexts[idx] = node->next->node_idx;
1496 /* Duplicate the epsilon closure of the node ROOT_NODE.
1497 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1498 to their own constraint. */
1500 static reg_errcode_t
1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1503 Idx root_node, unsigned int init_constraint)
1505 Idx org_node, clone_node;
1507 unsigned int constraint = init_constraint;
1508 for (org_node = top_org_node, clone_node = top_clone_node;;)
1510 Idx org_dest, clone_dest;
1511 if (dfa->nodes[org_node].type == OP_BACK_REF)
1513 /* If the back reference epsilon-transit, its destination must
1514 also have the constraint. Then duplicate the epsilon closure
1515 of the destination of the back reference, and store it in
1516 edests of the back reference. */
1517 org_dest = dfa->nexts[org_node];
1518 re_node_set_empty (dfa->edests + clone_node);
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == REG_MISSING, 0))
1522 dfa->nexts[clone_node] = dfa->nexts[org_node];
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 else if (dfa->edests[org_node].nelem == 0)
1529 /* In case of the node can't epsilon-transit, don't duplicate the
1530 destination and store the original destination as the
1531 destination of the node. */
1532 dfa->nexts[clone_node] = dfa->nexts[org_node];
1535 else if (dfa->edests[org_node].nelem == 1)
1537 /* In case of the node can epsilon-transit, and it has only one
1539 org_dest = dfa->edests[org_node].elems[0];
1540 re_node_set_empty (dfa->edests + clone_node);
1541 /* If the node is root_node itself, it means the epsilon closure
1542 has a loop. Then tie it to the destination of the root_node. */
1543 if (org_node == root_node && clone_node != org_node)
1545 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1550 /* In case the node has another constraint, append it. */
1551 constraint |= dfa->nodes[org_node].constraint;
1552 clone_dest = duplicate_node (dfa, org_dest, constraint);
1553 if (BE (clone_dest == REG_MISSING, 0))
1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 else /* dfa->edests[org_node].nelem == 2 */
1561 /* In case of the node can epsilon-transit, and it has two
1562 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1563 org_dest = dfa->edests[org_node].elems[0];
1564 re_node_set_empty (dfa->edests + clone_node);
1565 /* Search for a duplicated node which satisfies the constraint. */
1566 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1567 if (clone_dest == REG_MISSING)
1569 /* There is no such duplicated node, create a new one. */
1571 clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 if (BE (clone_dest == REG_MISSING, 0))
1574 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1577 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1578 root_node, constraint);
1579 if (BE (err != REG_NOERROR, 0))
1584 /* There is a duplicated node which satisfies the constraint,
1585 use it to avoid infinite loop. */
1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 org_dest = dfa->edests[org_node].elems[1];
1592 clone_dest = duplicate_node (dfa, org_dest, constraint);
1593 if (BE (clone_dest == REG_MISSING, 0))
1595 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599 org_node = org_dest;
1600 clone_node = clone_dest;
1605 /* Search for a node which is duplicated from the node ORG_NODE, and
1606 satisfies the constraint CONSTRAINT. */
1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1610 unsigned int constraint)
1613 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1615 if (org_node == dfa->org_indices[idx]
1616 && constraint == dfa->nodes[idx].constraint)
1617 return idx; /* Found. */
1619 return REG_MISSING; /* Not found. */
1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1623 Return the index of the new node, or REG_MISSING if insufficient storage is
1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1629 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1630 if (BE (dup_idx != REG_MISSING, 1))
1632 dfa->nodes[dup_idx].constraint = constraint;
1633 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1634 dfa->nodes[dup_idx].duplicated = 1;
1636 /* Store the index of the original node. */
1637 dfa->org_indices[dup_idx] = org_idx;
1642 static reg_errcode_t
1643 calc_inveclosure (re_dfa_t *dfa)
1647 for (idx = 0; idx < dfa->nodes_len; ++idx)
1648 re_node_set_init_empty (dfa->inveclosures + idx);
1650 for (src = 0; src < dfa->nodes_len; ++src)
1652 Idx *elems = dfa->eclosures[src].elems;
1653 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1655 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1664 /* Calculate "eclosure" for all the node in DFA. */
1666 static reg_errcode_t
1667 calc_eclosure (re_dfa_t *dfa)
1672 assert (dfa->nodes_len > 0);
1675 /* For each nodes, calculate epsilon closure. */
1676 for (node_idx = 0; ; ++node_idx)
1679 re_node_set eclosure_elem;
1680 if (node_idx == dfa->nodes_len)
1689 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1692 /* If we have already calculated, skip it. */
1693 if (dfa->eclosures[node_idx].nelem != 0)
1695 /* Calculate epsilon closure of 'node_idx'. */
1696 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1697 if (BE (err != REG_NOERROR, 0))
1700 if (dfa->eclosures[node_idx].nelem == 0)
1703 re_node_set_free (&eclosure_elem);
1709 /* Calculate epsilon closure of NODE. */
1711 static reg_errcode_t
1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1716 re_node_set eclosure;
1718 bool incomplete = false;
1719 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1720 if (BE (err != REG_NOERROR, 0))
1723 /* This indicates that we are calculating this node now.
1724 We reference this value to avoid infinite loop. */
1725 dfa->eclosures[node].nelem = REG_MISSING;
1727 /* If the current node has constraints, duplicate all nodes
1728 since they must inherit the constraints. */
1729 if (dfa->nodes[node].constraint
1730 && dfa->edests[node].nelem
1731 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1733 err = duplicate_node_closure (dfa, node, node, node,
1734 dfa->nodes[node].constraint);
1735 if (BE (err != REG_NOERROR, 0))
1739 /* Expand each epsilon destination nodes. */
1740 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1741 for (i = 0; i < dfa->edests[node].nelem; ++i)
1743 re_node_set eclosure_elem;
1744 Idx edest = dfa->edests[node].elems[i];
1745 /* If calculating the epsilon closure of 'edest' is in progress,
1746 return intermediate result. */
1747 if (dfa->eclosures[edest].nelem == REG_MISSING)
1752 /* If we haven't calculated the epsilon closure of 'edest' yet,
1753 calculate now. Otherwise use calculated epsilon closure. */
1754 if (dfa->eclosures[edest].nelem == 0)
1756 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1757 if (BE (err != REG_NOERROR, 0))
1761 eclosure_elem = dfa->eclosures[edest];
1762 /* Merge the epsilon closure of 'edest'. */
1763 err = re_node_set_merge (&eclosure, &eclosure_elem);
1764 if (BE (err != REG_NOERROR, 0))
1766 /* If the epsilon closure of 'edest' is incomplete,
1767 the epsilon closure of this node is also incomplete. */
1768 if (dfa->eclosures[edest].nelem == 0)
1771 re_node_set_free (&eclosure_elem);
1775 /* An epsilon closure includes itself. */
1776 ok = re_node_set_insert (&eclosure, node);
1779 if (incomplete && !root)
1780 dfa->eclosures[node].nelem = 0;
1782 dfa->eclosures[node] = eclosure;
1783 *new_set = eclosure;
1787 /* Functions for token which are used in the parser. */
1789 /* Fetch a token from INPUT.
1790 We must not use this function inside bracket expressions. */
1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1796 re_string_skip_bytes (input, peek_token (result, input, syntax));
1799 /* Peek a token from INPUT, and return the length of the token.
1800 We must not use this function inside bracket expressions. */
1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1808 if (re_string_eoi (input))
1810 token->type = END_OF_RE;
1814 c = re_string_peek_byte (input, 0);
1817 token->word_char = 0;
1818 #ifdef RE_ENABLE_I18N
1819 token->mb_partial = 0;
1820 if (input->mb_cur_max > 1 &&
1821 !re_string_first_byte (input, re_string_cur_idx (input)))
1823 token->type = CHARACTER;
1824 token->mb_partial = 1;
1831 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1833 token->type = BACK_SLASH;
1837 c2 = re_string_peek_byte_case (input, 1);
1839 token->type = CHARACTER;
1840 #ifdef RE_ENABLE_I18N
1841 if (input->mb_cur_max > 1)
1843 wint_t wc = re_string_wchar_at (input,
1844 re_string_cur_idx (input) + 1);
1845 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1849 token->word_char = IS_WORD_CHAR (c2) != 0;
1854 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1855 token->type = OP_ALT;
1857 case '1': case '2': case '3': case '4': case '5':
1858 case '6': case '7': case '8': case '9':
1859 if (!(syntax & RE_NO_BK_REFS))
1861 token->type = OP_BACK_REF;
1862 token->opr.idx = c2 - '1';
1866 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_FIRST;
1873 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_LAST;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_DELIM;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = NOT_WORD_DELIM;
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = OP_WORD;
1898 if (!(syntax & RE_NO_GNU_OPS))
1899 token->type = OP_NOTWORD;
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_SPACE;
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_NOTSPACE;
1910 if (!(syntax & RE_NO_GNU_OPS))
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_FIRST;
1917 if (!(syntax & RE_NO_GNU_OPS))
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = BUF_LAST;
1924 if (!(syntax & RE_NO_BK_PARENS))
1925 token->type = OP_OPEN_SUBEXP;
1928 if (!(syntax & RE_NO_BK_PARENS))
1929 token->type = OP_CLOSE_SUBEXP;
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1933 token->type = OP_DUP_PLUS;
1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937 token->type = OP_DUP_QUESTION;
1940 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1941 token->type = OP_OPEN_DUP_NUM;
1944 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945 token->type = OP_CLOSE_DUP_NUM;
1953 token->type = CHARACTER;
1954 #ifdef RE_ENABLE_I18N
1955 if (input->mb_cur_max > 1)
1957 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1958 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1962 token->word_char = IS_WORD_CHAR (token->opr.c);
1967 if (syntax & RE_NEWLINE_ALT)
1968 token->type = OP_ALT;
1971 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1972 token->type = OP_ALT;
1975 token->type = OP_DUP_ASTERISK;
1978 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1979 token->type = OP_DUP_PLUS;
1982 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983 token->type = OP_DUP_QUESTION;
1986 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1987 token->type = OP_OPEN_DUP_NUM;
1990 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991 token->type = OP_CLOSE_DUP_NUM;
1994 if (syntax & RE_NO_BK_PARENS)
1995 token->type = OP_OPEN_SUBEXP;
1998 if (syntax & RE_NO_BK_PARENS)
1999 token->type = OP_CLOSE_SUBEXP;
2002 token->type = OP_OPEN_BRACKET;
2005 token->type = OP_PERIOD;
2008 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2009 re_string_cur_idx (input) != 0)
2011 char prev = re_string_peek_byte (input, -1);
2012 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2015 token->type = ANCHOR;
2016 token->opr.ctx_type = LINE_FIRST;
2019 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2020 re_string_cur_idx (input) + 1 != re_string_length (input))
2023 re_string_skip_bytes (input, 1);
2024 peek_token (&next, input, syntax);
2025 re_string_skip_bytes (input, -1);
2026 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2029 token->type = ANCHOR;
2030 token->opr.ctx_type = LINE_LAST;
2038 /* Peek a token from INPUT, and return the length of the token.
2039 We must not use this function out of bracket expressions. */
2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2046 if (re_string_eoi (input))
2048 token->type = END_OF_RE;
2051 c = re_string_peek_byte (input, 0);
2054 #ifdef RE_ENABLE_I18N
2055 if (input->mb_cur_max > 1 &&
2056 !re_string_first_byte (input, re_string_cur_idx (input)))
2058 token->type = CHARACTER;
2061 #endif /* RE_ENABLE_I18N */
2063 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2064 && re_string_cur_idx (input) + 1 < re_string_length (input))
2066 /* In this case, '\' escape a character. */
2068 re_string_skip_bytes (input, 1);
2069 c2 = re_string_peek_byte (input, 0);
2071 token->type = CHARACTER;
2074 if (c == '[') /* '[' is a special char in a bracket exps. */
2078 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2079 c2 = re_string_peek_byte (input, 1);
2087 token->type = OP_OPEN_COLL_ELEM;
2090 token->type = OP_OPEN_EQUIV_CLASS;
2093 if (syntax & RE_CHAR_CLASSES)
2095 token->type = OP_OPEN_CHAR_CLASS;
2098 /* else fall through. */
2100 token->type = CHARACTER;
2110 token->type = OP_CHARSET_RANGE;
2113 token->type = OP_CLOSE_BRACKET;
2116 token->type = OP_NON_MATCH_LIST;
2119 token->type = CHARACTER;
2124 /* Functions for parser. */
2126 /* Entry point of the parser.
2127 Parse the regular expression REGEXP and return the structure tree.
2128 If an error occurs, ERR is set by error code, and return NULL.
2129 This function build the following tree, from regular expression <reg_exp>:
2135 CAT means concatenation.
2136 EOR means end of regular expression. */
2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2142 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2143 bin_tree_t *tree, *eor, *root;
2144 re_token_t current_token;
2145 dfa->syntax = syntax;
2146 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2152 root = create_tree (dfa, tree, eor, CONCAT);
2155 if (BE (eor == NULL || root == NULL, 0))
2163 /* This function build the following tree, from regular expression
2164 <branch1>|<branch2>:
2170 ALT means alternative, which represents the operator '|'. */
2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2176 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177 bin_tree_t *tree, *branch = NULL;
2178 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182 while (token->type == OP_ALT)
2184 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2185 if (token->type != OP_ALT && token->type != END_OF_RE
2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2188 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2194 tree = create_tree (dfa, tree, branch, OP_ALT);
2195 if (BE (tree == NULL, 0))
2204 /* This function build the following tree, from regular expression
2211 CAT means concatenation. */
2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217 bin_tree_t *tree, *expr;
2218 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2219 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 while (token->type != OP_ALT && token->type != END_OF_RE
2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2226 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230 postorder (tree, free_tree, NULL);
2233 if (tree != NULL && expr != NULL)
2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 if (newtree == NULL)
2238 postorder (expr, free_tree, NULL);
2239 postorder (tree, free_tree, NULL);
2245 else if (tree == NULL)
2247 /* Otherwise expr == NULL, we don't need to create new tree. */
2252 /* This function build the following tree, from regular expression a*:
2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2262 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2264 switch (token->type)
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2273 #ifdef RE_ENABLE_I18N
2274 if (dfa->mb_cur_max > 1)
2276 while (!re_string_eoi (regexp)
2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2279 bin_tree_t *mbc_remain;
2280 fetch_token (token, regexp, syntax);
2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 if (BE (mbc_remain == NULL || tree == NULL, 0))
2292 case OP_OPEN_SUBEXP:
2293 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2297 case OP_OPEN_BRACKET:
2298 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2308 dfa->used_bkref_map |= 1 << token->opr.idx;
2309 tree = create_token_tree (dfa, NULL, NULL, token);
2310 if (BE (tree == NULL, 0))
2316 dfa->has_mb_node = 1;
2318 case OP_OPEN_DUP_NUM:
2319 if (syntax & RE_CONTEXT_INVALID_DUP)
2325 case OP_DUP_ASTERISK:
2327 case OP_DUP_QUESTION:
2328 if (syntax & RE_CONTEXT_INVALID_OPS)
2333 else if (syntax & RE_CONTEXT_INDEP_OPS)
2335 fetch_token (token, regexp, syntax);
2336 return parse_expression (regexp, preg, token, syntax, nest, err);
2338 /* else fall through */
2339 case OP_CLOSE_SUBEXP:
2340 if ((token->type == OP_CLOSE_SUBEXP) &&
2341 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346 /* else fall through */
2347 case OP_CLOSE_DUP_NUM:
2348 /* We treat it as a normal character. */
2350 /* Then we can these characters as normal characters. */
2351 token->type = CHARACTER;
2352 /* mb_partial and word_char bits should be initialized already
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2362 if ((token->opr.ctx_type
2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 && dfa->word_ops_used == 0)
2365 init_word_char (dfa);
2366 if (token->opr.ctx_type == WORD_DELIM
2367 || token->opr.ctx_type == NOT_WORD_DELIM)
2369 bin_tree_t *tree_first, *tree_last;
2370 if (token->opr.ctx_type == WORD_DELIM)
2372 token->opr.ctx_type = WORD_FIRST;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = WORD_LAST;
2378 token->opr.ctx_type = INSIDE_WORD;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = INSIDE_NOTWORD;
2382 tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 tree = create_token_tree (dfa, NULL, NULL, token);
2393 if (BE (tree == NULL, 0))
2399 /* We must return here, since ANCHORs can't be followed
2400 by repetition operators.
2401 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2403 fetch_token (token, regexp, syntax);
2406 tree = create_token_tree (dfa, NULL, NULL, token);
2407 if (BE (tree == NULL, 0))
2412 if (dfa->mb_cur_max > 1)
2413 dfa->has_mb_node = 1;
2417 tree = build_charclass_op (dfa, regexp->trans,
2418 (const unsigned char *) "alnum",
2419 (const unsigned char *) "_",
2420 token->type == OP_NOTWORD, err);
2421 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 (const unsigned char *) "space",
2428 (const unsigned char *) "",
2429 token->type == OP_NOTSPACE, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2440 /* Must not happen? */
2446 fetch_token (token, regexp, syntax);
2448 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2451 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2452 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2454 /* In BRE consecutive duplications are not allowed. */
2455 if ((syntax & RE_CONTEXT_INVALID_DUP)
2456 && (token->type == OP_DUP_ASTERISK
2457 || token->type == OP_OPEN_DUP_NUM))
2467 /* This function build the following tree, from regular expression
2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2478 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2481 cur_nsub = preg->re_nsub++;
2483 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485 /* The subexpression may be a null string. */
2486 if (token->type == OP_CLOSE_SUBEXP)
2490 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2491 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2494 postorder (tree, free_tree, NULL);
2497 if (BE (*err != REG_NOERROR, 0))
2501 if (cur_nsub <= '9' - '1')
2502 dfa->completed_bkref_map |= 1 << cur_nsub;
2504 tree = create_tree (dfa, tree, NULL, SUBEXP);
2505 if (BE (tree == NULL, 0))
2510 tree->token.opr.idx = cur_nsub;
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2518 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2520 bin_tree_t *tree = NULL, *old_tree = NULL;
2521 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2522 re_token_t start_token = *token;
2524 if (token->type == OP_OPEN_DUP_NUM)
2527 start = fetch_number (regexp, token, syntax);
2528 if (start == REG_MISSING)
2530 if (token->type == CHARACTER && token->opr.c == ',')
2531 start = 0; /* We treat "{,m}" as "{0,m}". */
2534 *err = REG_BADBR; /* <re>{} is invalid. */
2538 if (BE (start != REG_ERROR, 1))
2540 /* We treat "{n}" as "{n,n}". */
2541 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542 : ((token->type == CHARACTER && token->opr.c == ',')
2543 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2545 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2547 /* Invalid sequence. */
2548 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2550 if (token->type == END_OF_RE)
2558 /* If the syntax bit is set, rollback. */
2559 re_string_set_index (regexp, start_idx);
2560 *token = start_token;
2561 token->type = CHARACTER;
2562 /* mb_partial and word_char bits should be already initialized by
2567 if (BE ((end != REG_MISSING && start > end)
2568 || token->type != OP_CLOSE_DUP_NUM, 0))
2570 /* First number greater than second. */
2577 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2578 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2581 fetch_token (token, regexp, syntax);
2583 if (BE (elem == NULL, 0))
2585 if (BE (start == 0 && end == 0, 0))
2587 postorder (elem, free_tree, NULL);
2591 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2592 if (BE (start > 0, 0))
2595 for (i = 2; i <= start; ++i)
2597 elem = duplicate_tree (elem, dfa);
2598 tree = create_tree (dfa, tree, elem, CONCAT);
2599 if (BE (elem == NULL || tree == NULL, 0))
2600 goto parse_dup_op_espace;
2606 /* Duplicate ELEM before it is marked optional. */
2607 elem = duplicate_tree (elem, dfa);
2613 if (elem->token.type == SUBEXP)
2614 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2616 tree = create_tree (dfa, elem, NULL,
2617 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2618 if (BE (tree == NULL, 0))
2619 goto parse_dup_op_espace;
2621 /* From gnulib's "intprops.h":
2622 True if the arithmetic type T is signed. */
2623 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2625 /* This loop is actually executed only when end != REG_MISSING,
2626 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2627 already created the start+1-th copy. */
2628 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2629 for (i = start + 2; i <= end; ++i)
2631 elem = duplicate_tree (elem, dfa);
2632 tree = create_tree (dfa, tree, elem, CONCAT);
2633 if (BE (elem == NULL || tree == NULL, 0))
2634 goto parse_dup_op_espace;
2636 tree = create_tree (dfa, tree, NULL, OP_ALT);
2637 if (BE (tree == NULL, 0))
2638 goto parse_dup_op_espace;
2642 tree = create_tree (dfa, old_tree, tree, CONCAT);
2646 parse_dup_op_espace:
2651 /* Size of the names for collating symbol/equivalence_class/character_class.
2652 I'm not sure, but maybe enough. */
2653 #define BRACKET_NAME_BUF_SIZE 32
2656 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2657 Build the range expression which starts from START_ELEM, and ends
2658 at END_ELEM. The result are written to MBCSET and SBCSET.
2659 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2660 mbcset->range_ends, is a pointer argument since we may
2663 static reg_errcode_t
2665 # ifdef RE_ENABLE_I18N
2666 build_range_exp (const reg_syntax_t syntax,
2668 re_charset_t *mbcset,
2670 const bracket_elem_t *start_elem,
2671 const bracket_elem_t *end_elem)
2672 # else /* not RE_ENABLE_I18N */
2673 build_range_exp (const reg_syntax_t syntax,
2675 const bracket_elem_t *start_elem,
2676 const bracket_elem_t *end_elem)
2677 # endif /* not RE_ENABLE_I18N */
2679 unsigned int start_ch, end_ch;
2680 /* Equivalence Classes and Character Classes can't be a range start/end. */
2681 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2682 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2686 /* We can handle no multi character collating elements without libc
2688 if (BE ((start_elem->type == COLL_SYM
2689 && strlen ((char *) start_elem->opr.name) > 1)
2690 || (end_elem->type == COLL_SYM
2691 && strlen ((char *) end_elem->opr.name) > 1), 0))
2692 return REG_ECOLLATE;
2694 # ifdef RE_ENABLE_I18N
2699 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2701 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2702 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2704 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2705 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2707 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2708 ? __btowc (start_ch) : start_elem->opr.wch);
2709 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2710 ? __btowc (end_ch) : end_elem->opr.wch);
2711 if (start_wc == WEOF || end_wc == WEOF)
2712 return REG_ECOLLATE;
2713 cmp_buf[0] = start_wc;
2714 cmp_buf[4] = end_wc;
2716 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2717 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2720 /* Got valid collation sequence values, add them as a new entry.
2721 However, for !_LIBC we have no collation elements: if the
2722 character set is single byte, the single byte character set
2723 that we build below suffices. parse_bracket_exp passes
2724 no MBCSET if dfa->mb_cur_max == 1. */
2727 /* Check the space of the arrays. */
2728 if (BE (*range_alloc == mbcset->nranges, 0))
2730 /* There is not enough space, need realloc. */
2731 wchar_t *new_array_start, *new_array_end;
2734 /* +1 in case of mbcset->nranges is 0. */
2735 new_nranges = 2 * mbcset->nranges + 1;
2736 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2737 are NULL if *range_alloc == 0. */
2738 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2740 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2743 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2746 mbcset->range_starts = new_array_start;
2747 mbcset->range_ends = new_array_end;
2748 *range_alloc = new_nranges;
2751 mbcset->range_starts[mbcset->nranges] = start_wc;
2752 mbcset->range_ends[mbcset->nranges++] = end_wc;
2755 /* Build the table for single byte characters. */
2756 for (wc = 0; wc < SBC_MAX; ++wc)
2759 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2760 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2761 bitset_set (sbcset, wc);
2764 # else /* not RE_ENABLE_I18N */
2767 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2768 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2770 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2771 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2773 if (start_ch > end_ch)
2775 /* Build the table for single byte characters. */
2776 for (ch = 0; ch < SBC_MAX; ++ch)
2777 if (start_ch <= ch && ch <= end_ch)
2778 bitset_set (sbcset, ch);
2780 # endif /* not RE_ENABLE_I18N */
2783 #endif /* not _LIBC */
2786 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2787 Build the collating element which is represented by NAME.
2788 The result are written to MBCSET and SBCSET.
2789 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2790 pointer argument since we may update it. */
2792 static reg_errcode_t
2794 # ifdef RE_ENABLE_I18N
2795 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2796 Idx *coll_sym_alloc, const unsigned char *name)
2797 # else /* not RE_ENABLE_I18N */
2798 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2799 # endif /* not RE_ENABLE_I18N */
2801 size_t name_len = strlen ((const char *) name);
2802 if (BE (name_len != 1, 0))
2803 return REG_ECOLLATE;
2806 bitset_set (sbcset, name[0]);
2810 #endif /* not _LIBC */
2812 /* This function parse bracket expression like "[abc]", "[a-c]",
2816 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2817 reg_syntax_t syntax, reg_errcode_t *err)
2820 const unsigned char *collseqmb;
2821 const char *collseqwc;
2824 const int32_t *symb_table;
2825 const unsigned char *extra;
2827 /* Local function for parse_bracket_exp used in _LIBC environment.
2828 Seek the collating symbol entry corresponding to NAME.
2829 Return the index of the symbol in the SYMB_TABLE. */
2832 __attribute ((always_inline))
2833 seek_collating_symbol_entry (name, name_len)
2834 const unsigned char *name;
2837 int32_t hash = elem_hash ((const char *) name, name_len);
2838 int32_t elem = hash % table_size;
2839 if (symb_table[2 * elem] != 0)
2841 int32_t second = hash % (table_size - 2) + 1;
2845 /* First compare the hashing value. */
2846 if (symb_table[2 * elem] == hash
2847 /* Compare the length of the name. */
2848 && name_len == extra[symb_table[2 * elem + 1]]
2849 /* Compare the name. */
2850 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2853 /* Yep, this is the entry. */
2860 while (symb_table[2 * elem] != 0);
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866 Look up the collation sequence value of BR_ELEM.
2867 Return the value if succeeded, UINT_MAX otherwise. */
2869 auto inline unsigned int
2870 __attribute ((always_inline))
2871 lookup_collation_sequence_value (br_elem)
2872 bracket_elem_t *br_elem;
2874 if (br_elem->type == SB_CHAR)
2877 if (MB_CUR_MAX == 1)
2880 return collseqmb[br_elem->opr.ch];
2883 wint_t wc = __btowc (br_elem->opr.ch);
2884 return __collseq_table_lookup (collseqwc, wc);
2887 else if (br_elem->type == MB_CHAR)
2890 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2892 else if (br_elem->type == COLL_SYM)
2894 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2898 elem = seek_collating_symbol_entry (br_elem->opr.name,
2900 if (symb_table[2 * elem] != 0)
2902 /* We found the entry. */
2903 idx = symb_table[2 * elem + 1];
2904 /* Skip the name of collating element name. */
2905 idx += 1 + extra[idx];
2906 /* Skip the byte sequence of the collating element. */
2907 idx += 1 + extra[idx];
2908 /* Adjust for the alignment. */
2909 idx = (idx + 3) & ~3;
2910 /* Skip the multibyte collation sequence value. */
2911 idx += sizeof (unsigned int);
2912 /* Skip the wide char sequence of the collating element. */
2913 idx += sizeof (unsigned int) *
2914 (1 + *(unsigned int *) (extra + idx));
2915 /* Return the collation sequence value. */
2916 return *(unsigned int *) (extra + idx);
2918 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2920 /* No valid character. Match it as a single byte
2922 return collseqmb[br_elem->opr.name[0]];
2925 else if (sym_name_len == 1)
2926 return collseqmb[br_elem->opr.name[0]];
2931 /* Local function for parse_bracket_exp used in _LIBC environment.
2932 Build the range expression which starts from START_ELEM, and ends
2933 at END_ELEM. The result are written to MBCSET and SBCSET.
2934 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2935 mbcset->range_ends, is a pointer argument since we may
2938 auto inline reg_errcode_t
2939 __attribute ((always_inline))
2940 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2941 re_charset_t *mbcset;
2944 bracket_elem_t *start_elem, *end_elem;
2947 uint32_t start_collseq;
2948 uint32_t end_collseq;
2950 /* Equivalence Classes and Character Classes can't be a range
2952 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2953 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2957 start_collseq = lookup_collation_sequence_value (start_elem);
2958 end_collseq = lookup_collation_sequence_value (end_elem);
2959 /* Check start/end collation sequence values. */
2960 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961 return REG_ECOLLATE;
2962 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2965 /* Got valid collation sequence values, add them as a new entry.
2966 However, if we have no collation elements, and the character set
2967 is single byte, the single byte character set that we
2968 build below suffices. */
2969 if (nrules > 0 || dfa->mb_cur_max > 1)
2971 /* Check the space of the arrays. */
2972 if (BE (*range_alloc == mbcset->nranges, 0))
2974 /* There is not enough space, need realloc. */
2975 uint32_t *new_array_start;
2976 uint32_t *new_array_end;
2979 /* +1 in case of mbcset->nranges is 0. */
2980 new_nranges = 2 * mbcset->nranges + 1;
2981 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2983 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2986 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2989 mbcset->range_starts = new_array_start;
2990 mbcset->range_ends = new_array_end;
2991 *range_alloc = new_nranges;
2994 mbcset->range_starts[mbcset->nranges] = start_collseq;
2995 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2998 /* Build the table for single byte characters. */
2999 for (ch = 0; ch < SBC_MAX; ch++)
3001 uint32_t ch_collseq;
3003 if (MB_CUR_MAX == 1)
3006 ch_collseq = collseqmb[ch];
3008 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010 bitset_set (sbcset, ch);
3015 /* Local function for parse_bracket_exp used in _LIBC environment.
3016 Build the collating element which is represented by NAME.
3017 The result are written to MBCSET and SBCSET.
3018 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019 pointer argument since we may update it. */
3021 auto inline reg_errcode_t
3022 __attribute ((always_inline))
3023 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3024 re_charset_t *mbcset;
3025 Idx *coll_sym_alloc;
3027 const unsigned char *name;
3030 size_t name_len = strlen ((const char *) name);
3033 elem = seek_collating_symbol_entry (name, name_len);
3034 if (symb_table[2 * elem] != 0)
3036 /* We found the entry. */
3037 idx = symb_table[2 * elem + 1];
3038 /* Skip the name of collating element name. */
3039 idx += 1 + extra[idx];
3041 else if (symb_table[2 * elem] == 0 && name_len == 1)
3043 /* No valid character, treat it as a normal
3045 bitset_set (sbcset, name[0]);
3049 return REG_ECOLLATE;
3051 /* Got valid collation sequence, add it as a new entry. */
3052 /* Check the space of the arrays. */
3053 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3055 /* Not enough, realloc it. */
3056 /* +1 in case of mbcset->ncoll_syms is 0. */
3057 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3058 /* Use realloc since mbcset->coll_syms is NULL
3060 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3061 new_coll_sym_alloc);
3062 if (BE (new_coll_syms == NULL, 0))
3064 mbcset->coll_syms = new_coll_syms;
3065 *coll_sym_alloc = new_coll_sym_alloc;
3067 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3072 if (BE (name_len != 1, 0))
3073 return REG_ECOLLATE;
3076 bitset_set (sbcset, name[0]);
3083 re_token_t br_token;
3084 re_bitset_ptr_t sbcset;
3085 #ifdef RE_ENABLE_I18N
3086 re_charset_t *mbcset;
3087 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3088 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3089 #endif /* not RE_ENABLE_I18N */
3090 bool non_match = false;
3091 bin_tree_t *work_tree;
3093 bool first_round = true;
3095 collseqmb = (const unsigned char *)
3096 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3097 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3103 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3104 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3105 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3106 _NL_COLLATE_SYMB_TABLEMB);
3107 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3108 _NL_COLLATE_SYMB_EXTRAMB);
3111 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3112 #ifdef RE_ENABLE_I18N
3113 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3114 #endif /* RE_ENABLE_I18N */
3115 #ifdef RE_ENABLE_I18N
3116 if (BE (sbcset == NULL || mbcset == NULL, 0))
3118 if (BE (sbcset == NULL, 0))
3119 #endif /* RE_ENABLE_I18N */
3122 #ifdef RE_ENABLE_I18N
3129 token_len = peek_token_bracket (token, regexp, syntax);
3130 if (BE (token->type == END_OF_RE, 0))
3133 goto parse_bracket_exp_free_return;
3135 if (token->type == OP_NON_MATCH_LIST)
3137 #ifdef RE_ENABLE_I18N
3138 mbcset->non_match = 1;
3139 #endif /* not RE_ENABLE_I18N */
3141 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3142 bitset_set (sbcset, '\n');
3143 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3144 token_len = peek_token_bracket (token, regexp, syntax);
3145 if (BE (token->type == END_OF_RE, 0))
3148 goto parse_bracket_exp_free_return;
3152 /* We treat the first ']' as a normal character. */
3153 if (token->type == OP_CLOSE_BRACKET)
3154 token->type = CHARACTER;
3158 bracket_elem_t start_elem, end_elem;
3159 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3160 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3163 bool is_range_exp = false;
3166 start_elem.opr.name = start_name_buf;
3167 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3168 syntax, first_round);
3169 if (BE (ret != REG_NOERROR, 0))
3172 goto parse_bracket_exp_free_return;
3174 first_round = false;
3176 /* Get information about the next token. We need it in any case. */
3177 token_len = peek_token_bracket (token, regexp, syntax);
3179 /* Do not check for ranges if we know they are not allowed. */
3180 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3182 if (BE (token->type == END_OF_RE, 0))
3185 goto parse_bracket_exp_free_return;
3187 if (token->type == OP_CHARSET_RANGE)
3189 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3190 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3191 if (BE (token2.type == END_OF_RE, 0))
3194 goto parse_bracket_exp_free_return;
3196 if (token2.type == OP_CLOSE_BRACKET)
3198 /* We treat the last '-' as a normal character. */
3199 re_string_skip_bytes (regexp, -token_len);
3200 token->type = CHARACTER;
3203 is_range_exp = true;
3207 if (is_range_exp == true)
3209 end_elem.opr.name = end_name_buf;
3210 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3212 if (BE (ret != REG_NOERROR, 0))
3215 goto parse_bracket_exp_free_return;
3218 token_len = peek_token_bracket (token, regexp, syntax);
3221 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3222 &start_elem, &end_elem);
3224 # ifdef RE_ENABLE_I18N
3225 *err = build_range_exp (syntax, sbcset,
3226 dfa->mb_cur_max > 1 ? mbcset : NULL,
3227 &range_alloc, &start_elem, &end_elem);
3229 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3231 #endif /* RE_ENABLE_I18N */
3232 if (BE (*err != REG_NOERROR, 0))
3233 goto parse_bracket_exp_free_return;
3237 switch (start_elem.type)
3240 bitset_set (sbcset, start_elem.opr.ch);
3242 #ifdef RE_ENABLE_I18N
3244 /* Check whether the array has enough space. */
3245 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3247 wchar_t *new_mbchars;
3248 /* Not enough, realloc it. */
3249 /* +1 in case of mbcset->nmbchars is 0. */
3250 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3251 /* Use realloc since array is NULL if *alloc == 0. */
3252 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3254 if (BE (new_mbchars == NULL, 0))
3255 goto parse_bracket_exp_espace;
3256 mbcset->mbchars = new_mbchars;
3258 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3260 #endif /* RE_ENABLE_I18N */
3262 *err = build_equiv_class (sbcset,
3263 #ifdef RE_ENABLE_I18N
3264 mbcset, &equiv_class_alloc,
3265 #endif /* RE_ENABLE_I18N */
3266 start_elem.opr.name);
3267 if (BE (*err != REG_NOERROR, 0))
3268 goto parse_bracket_exp_free_return;
3271 *err = build_collating_symbol (sbcset,
3272 #ifdef RE_ENABLE_I18N
3273 mbcset, &coll_sym_alloc,
3274 #endif /* RE_ENABLE_I18N */
3275 start_elem.opr.name);
3276 if (BE (*err != REG_NOERROR, 0))
3277 goto parse_bracket_exp_free_return;
3280 *err = build_charclass (regexp->trans, sbcset,
3281 #ifdef RE_ENABLE_I18N
3282 mbcset, &char_class_alloc,
3283 #endif /* RE_ENABLE_I18N */
3284 start_elem.opr.name, syntax);
3285 if (BE (*err != REG_NOERROR, 0))
3286 goto parse_bracket_exp_free_return;
3293 if (BE (token->type == END_OF_RE, 0))
3296 goto parse_bracket_exp_free_return;
3298 if (token->type == OP_CLOSE_BRACKET)
3302 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3304 /* If it is non-matching list. */
3306 bitset_not (sbcset);
3308 #ifdef RE_ENABLE_I18N
3309 /* Ensure only single byte characters are set. */
3310 if (dfa->mb_cur_max > 1)
3311 bitset_mask (sbcset, dfa->sb_char);
3313 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3314 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3315 || mbcset->non_match)))
3317 bin_tree_t *mbc_tree;
3319 /* Build a tree for complex bracket. */
3320 dfa->has_mb_node = 1;
3321 br_token.type = COMPLEX_BRACKET;
3322 br_token.opr.mbcset = mbcset;
3323 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3324 if (BE (mbc_tree == NULL, 0))
3325 goto parse_bracket_exp_espace;
3326 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3327 if (sbcset[sbc_idx])
3329 /* If there are no bits set in sbcset, there is no point
3330 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3331 if (sbc_idx < BITSET_WORDS)
3333 /* Build a tree for simple bracket. */
3334 br_token.type = SIMPLE_BRACKET;
3335 br_token.opr.sbcset = sbcset;
3336 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337 if (BE (work_tree == NULL, 0))
3338 goto parse_bracket_exp_espace;
3340 /* Then join them by ALT node. */
3341 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3342 if (BE (work_tree == NULL, 0))
3343 goto parse_bracket_exp_espace;
3348 work_tree = mbc_tree;
3352 #endif /* not RE_ENABLE_I18N */
3354 #ifdef RE_ENABLE_I18N
3355 free_charset (mbcset);
3357 /* Build a tree for simple bracket. */
3358 br_token.type = SIMPLE_BRACKET;
3359 br_token.opr.sbcset = sbcset;
3360 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361 if (BE (work_tree == NULL, 0))
3362 goto parse_bracket_exp_espace;
3366 parse_bracket_exp_espace:
3368 parse_bracket_exp_free_return:
3370 #ifdef RE_ENABLE_I18N
3371 free_charset (mbcset);
3372 #endif /* RE_ENABLE_I18N */
3376 /* Parse an element in the bracket expression. */
3378 static reg_errcode_t
3379 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3380 re_token_t *token, int token_len, re_dfa_t *dfa,
3381 reg_syntax_t syntax, bool accept_hyphen)
3383 #ifdef RE_ENABLE_I18N
3385 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3386 if (cur_char_size > 1)
3388 elem->type = MB_CHAR;
3389 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3390 re_string_skip_bytes (regexp, cur_char_size);
3393 #endif /* RE_ENABLE_I18N */
3394 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3395 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3396 || token->type == OP_OPEN_EQUIV_CLASS)
3397 return parse_bracket_symbol (elem, regexp, token);
3398 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3400 /* A '-' must only appear as anything but a range indicator before
3401 the closing bracket. Everything else is an error. */
3403 (void) peek_token_bracket (&token2, regexp, syntax);
3404 if (token2.type != OP_CLOSE_BRACKET)
3405 /* The actual error value is not standardized since this whole
3406 case is undefined. But ERANGE makes good sense. */
3409 elem->type = SB_CHAR;
3410 elem->opr.ch = token->opr.c;
3414 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3415 such as [:<character_class>:], [.<collating_element>.], and
3416 [=<equivalent_class>=]. */
3418 static reg_errcode_t
3419 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3422 unsigned char ch, delim = token->opr.c;
3424 if (re_string_eoi(regexp))
3428 if (i >= BRACKET_NAME_BUF_SIZE)
3430 if (token->type == OP_OPEN_CHAR_CLASS)
3431 ch = re_string_fetch_byte_case (regexp);
3433 ch = re_string_fetch_byte (regexp);
3434 if (re_string_eoi(regexp))
3436 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3438 elem->opr.name[i] = ch;
3440 re_string_skip_bytes (regexp, 1);
3441 elem->opr.name[i] = '\0';
3442 switch (token->type)
3444 case OP_OPEN_COLL_ELEM:
3445 elem->type = COLL_SYM;
3447 case OP_OPEN_EQUIV_CLASS:
3448 elem->type = EQUIV_CLASS;
3450 case OP_OPEN_CHAR_CLASS:
3451 elem->type = CHAR_CLASS;
3459 /* Helper function for parse_bracket_exp.
3460 Build the equivalence class which is represented by NAME.
3461 The result are written to MBCSET and SBCSET.
3462 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3463 is a pointer argument since we may update it. */
3465 static reg_errcode_t
3466 #ifdef RE_ENABLE_I18N
3467 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3468 Idx *equiv_class_alloc, const unsigned char *name)
3469 #else /* not RE_ENABLE_I18N */
3470 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3471 #endif /* not RE_ENABLE_I18N */
3474 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3477 const int32_t *table, *indirect;
3478 const unsigned char *weights, *extra, *cp;
3479 unsigned char char_buf[2];
3483 /* This #include defines a local function! */
3484 # include <locale/weight.h>
3485 /* Calculate the index for equivalence class. */
3487 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3488 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489 _NL_COLLATE_WEIGHTMB);
3490 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3491 _NL_COLLATE_EXTRAMB);
3492 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3493 _NL_COLLATE_INDIRECTMB);
3494 idx1 = findidx (&cp, -1);
3495 if (BE (idx1 == 0 || *cp != '\0', 0))
3496 /* This isn't a valid character. */
3497 return REG_ECOLLATE;
3499 /* Build single byte matching table for this equivalence class. */
3500 len = weights[idx1 & 0xffffff];
3501 for (ch = 0; ch < SBC_MAX; ++ch)
3505 idx2 = findidx (&cp, 1);
3510 /* This isn't a valid character. */
3512 /* Compare only if the length matches and the collation rule
3513 index is the same. */
3514 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3518 while (cnt <= len &&
3519 weights[(idx1 & 0xffffff) + 1 + cnt]
3520 == weights[(idx2 & 0xffffff) + 1 + cnt])
3524 bitset_set (sbcset, ch);
3527 /* Check whether the array has enough space. */
3528 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3530 /* Not enough, realloc it. */
3531 /* +1 in case of mbcset->nequiv_classes is 0. */
3532 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3533 /* Use realloc since the array is NULL if *alloc == 0. */
3534 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3536 new_equiv_class_alloc);
3537 if (BE (new_equiv_classes == NULL, 0))
3539 mbcset->equiv_classes = new_equiv_classes;
3540 *equiv_class_alloc = new_equiv_class_alloc;
3542 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3547 if (BE (strlen ((const char *) name) != 1, 0))
3548 return REG_ECOLLATE;
3549 bitset_set (sbcset, *name);
3554 /* Helper function for parse_bracket_exp.
3555 Build the character class which is represented by NAME.
3556 The result are written to MBCSET and SBCSET.
3557 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3558 is a pointer argument since we may update it. */
3560 static reg_errcode_t
3561 #ifdef RE_ENABLE_I18N
3562 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3563 re_charset_t *mbcset, Idx *char_class_alloc,
3564 const unsigned char *class_name, reg_syntax_t syntax)
3565 #else /* not RE_ENABLE_I18N */
3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567 const unsigned char *class_name, reg_syntax_t syntax)
3568 #endif /* not RE_ENABLE_I18N */
3571 const char *name = (const char *) class_name;
3573 /* In case of REG_ICASE "upper" and "lower" match the both of
3574 upper and lower cases. */
3575 if ((syntax & RE_ICASE)
3576 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3579 #ifdef RE_ENABLE_I18N
3580 /* Check the space of the arrays. */
3581 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3583 /* Not enough, realloc it. */
3584 /* +1 in case of mbcset->nchar_classes is 0. */
3585 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3586 /* Use realloc since array is NULL if *alloc == 0. */
3587 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3588 new_char_class_alloc);
3589 if (BE (new_char_classes == NULL, 0))
3591 mbcset->char_classes = new_char_classes;
3592 *char_class_alloc = new_char_class_alloc;
3594 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3595 #endif /* RE_ENABLE_I18N */
3597 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3599 if (BE (trans != NULL, 0)) \
3601 for (i = 0; i < SBC_MAX; ++i) \
3602 if (ctype_func (i)) \
3603 bitset_set (sbcset, trans[i]); \
3607 for (i = 0; i < SBC_MAX; ++i) \
3608 if (ctype_func (i)) \
3609 bitset_set (sbcset, i); \
3613 if (strcmp (name, "alnum") == 0)
3614 BUILD_CHARCLASS_LOOP (isalnum);
3615 else if (strcmp (name, "cntrl") == 0)
3616 BUILD_CHARCLASS_LOOP (iscntrl);
3617 else if (strcmp (name, "lower") == 0)
3618 BUILD_CHARCLASS_LOOP (islower);
3619 else if (strcmp (name, "space") == 0)
3620 BUILD_CHARCLASS_LOOP (isspace);
3621 else if (strcmp (name, "alpha") == 0)
3622 BUILD_CHARCLASS_LOOP (isalpha);
3623 else if (strcmp (name, "digit") == 0)
3624 BUILD_CHARCLASS_LOOP (isdigit);
3625 else if (strcmp (name, "print") == 0)
3626 BUILD_CHARCLASS_LOOP (isprint);
3627 else if (strcmp (name, "upper") == 0)
3628 BUILD_CHARCLASS_LOOP (isupper);
3629 else if (strcmp (name, "blank") == 0)
3630 BUILD_CHARCLASS_LOOP (isblank);
3631 else if (strcmp (name, "graph") == 0)
3632 BUILD_CHARCLASS_LOOP (isgraph);
3633 else if (strcmp (name, "punct") == 0)
3634 BUILD_CHARCLASS_LOOP (ispunct);
3635 else if (strcmp (name, "xdigit") == 0)
3636 BUILD_CHARCLASS_LOOP (isxdigit);
3644 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3645 const unsigned char *class_name,
3646 const unsigned char *extra, bool non_match,
3649 re_bitset_ptr_t sbcset;
3650 #ifdef RE_ENABLE_I18N
3651 re_charset_t *mbcset;
3653 #endif /* not RE_ENABLE_I18N */
3655 re_token_t br_token;
3658 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3659 #ifdef RE_ENABLE_I18N
3660 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3661 #endif /* RE_ENABLE_I18N */
3663 #ifdef RE_ENABLE_I18N
3664 if (BE (sbcset == NULL || mbcset == NULL, 0))
3665 #else /* not RE_ENABLE_I18N */
3666 if (BE (sbcset == NULL, 0))
3667 #endif /* not RE_ENABLE_I18N */
3675 #ifdef RE_ENABLE_I18N
3676 mbcset->non_match = 1;
3677 #endif /* not RE_ENABLE_I18N */
3680 /* We don't care the syntax in this case. */
3681 ret = build_charclass (trans, sbcset,
3682 #ifdef RE_ENABLE_I18N
3684 #endif /* RE_ENABLE_I18N */
3687 if (BE (ret != REG_NOERROR, 0))
3690 #ifdef RE_ENABLE_I18N
3691 free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3696 /* \w match '_' also. */
3697 for (; *extra; extra++)
3698 bitset_set (sbcset, *extra);
3700 /* If it is non-matching list. */
3702 bitset_not (sbcset);
3704 #ifdef RE_ENABLE_I18N
3705 /* Ensure only single byte characters are set. */
3706 if (dfa->mb_cur_max > 1)
3707 bitset_mask (sbcset, dfa->sb_char);
3710 /* Build a tree for simple bracket. */
3711 br_token.type = SIMPLE_BRACKET;
3712 br_token.opr.sbcset = sbcset;
3713 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714 if (BE (tree == NULL, 0))
3715 goto build_word_op_espace;
3717 #ifdef RE_ENABLE_I18N
3718 if (dfa->mb_cur_max > 1)
3720 bin_tree_t *mbc_tree;
3721 /* Build a tree for complex bracket. */
3722 br_token.type = COMPLEX_BRACKET;
3723 br_token.opr.mbcset = mbcset;
3724 dfa->has_mb_node = 1;
3725 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726 if (BE (mbc_tree == NULL, 0))
3727 goto build_word_op_espace;
3728 /* Then join them by ALT node. */
3729 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730 if (BE (mbc_tree != NULL, 1))
3735 free_charset (mbcset);
3738 #else /* not RE_ENABLE_I18N */
3740 #endif /* not RE_ENABLE_I18N */
3742 build_word_op_espace:
3744 #ifdef RE_ENABLE_I18N
3745 free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3751 /* This is intended for the expressions like "a{1,3}".
3752 Fetch a number from 'input', and return the number.
3753 Return REG_MISSING if the number field is empty like "{,1}".
3754 Return REG_ERROR if an error occurred. */
3757 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3759 Idx num = REG_MISSING;
3763 fetch_token (token, input, syntax);
3765 if (BE (token->type == END_OF_RE, 0))
3767 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769 num = ((token->type != CHARACTER || c < '0' || '9' < c
3770 || num == REG_ERROR)
3772 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3773 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3778 #ifdef RE_ENABLE_I18N
3780 free_charset (re_charset_t *cset)
3782 re_free (cset->mbchars);
3784 re_free (cset->coll_syms);
3785 re_free (cset->equiv_classes);
3786 re_free (cset->range_starts);
3787 re_free (cset->range_ends);
3789 re_free (cset->char_classes);
3792 #endif /* RE_ENABLE_I18N */
3794 /* Functions for binary tree operation. */
3796 /* Create a tree node. */
3799 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800 re_token_type_t type)
3804 return create_token_tree (dfa, left, right, &t);
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809 const re_token_t *token)
3812 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3814 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3816 if (storage == NULL)
3818 storage->next = dfa->str_tree_storage;
3819 dfa->str_tree_storage = storage;
3820 dfa->str_tree_storage_idx = 0;
3822 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3824 tree->parent = NULL;
3826 tree->right = right;
3827 tree->token = *token;
3828 tree->token.duplicated = 0;
3829 tree->token.opt_subexp = 0;
3832 tree->node_idx = REG_MISSING;
3835 left->parent = tree;
3837 right->parent = tree;
3841 /* Mark the tree SRC as an optional subexpression.
3842 To be called from preorder or postorder. */
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3847 Idx idx = (Idx) (long) extra;
3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849 node->token.opt_subexp = 1;
3854 /* Free the allocated memory inside NODE. */
3857 free_token (re_token_t *node)
3859 #ifdef RE_ENABLE_I18N
3860 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861 free_charset (node->opr.mbcset);
3863 #endif /* RE_ENABLE_I18N */
3864 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865 re_free (node->opr.sbcset);
3868 /* Worker function for tree walking. Free the allocated memory inside NODE
3869 and its children. */
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3874 free_token (&node->token);
3879 /* Duplicate the node SRC, and return new node. This is a preorder
3880 visit similar to the one implemented by the generic visitor, but
3881 we need more infrastructure to maintain two parallel trees --- so,
3882 it's easier to duplicate. */
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3887 const bin_tree_t *node;
3888 bin_tree_t *dup_root;
3889 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3891 for (node = root; ; )
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3897 (*p_new)->parent = dup_node;
3898 (*p_new)->token.duplicated = 1;
3901 /* Go to the left node, or up and to the right. */
3905 p_new = &dup_node->left;
3909 const bin_tree_t *prev = NULL;
3910 while (node->right == prev || node->right == NULL)
3913 node = node->parent;
3914 dup_node = dup_node->parent;
3919 p_new = &dup_node->right;