1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 static void optimize_utf8 (re_dfa_t *dfa);
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
37 static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_token_t *token, int token_len,
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
98 #else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 /* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
127 static const char __re_error_msgid[] =
129 #define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 static const size_t __re_error_msgid_idx[] =
202 /* Entry points for GNU code. */
204 /* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
213 re_compile_pattern (pattern, length, bufp)
216 struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240 weak_alias (__re_compile_pattern, re_compile_pattern)
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
259 re_set_syntax (syntax)
262 reg_syntax_t ret = re_syntax_options;
264 re_syntax_options = syntax;
268 weak_alias (__re_set_syntax, re_set_syntax)
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
275 re_dfa_t *dfa = bufp->buffer;
276 char *fastmap = bufp->fastmap;
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[tolower (ch)] = 1;
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 re_dfa_t *dfa = bufp->buffer;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
317 if (type == CHARACTER)
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 unsigned char buf[MB_LEN_MAX];
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 re_set_fastmap (fastmap, false, buf[0]);
343 else if (type == SIMPLE_BRACKET)
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 /* See if we have to try all bytes which start multiple collation
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
375 re_set_fastmap (fastmap, icase, i);
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
386 || cset->nequiv_classes
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
527 weak_alias (__regcomp, regcomp)
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode, preg, errbuf, errbuf_size)
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
540 #else /* size_t might promote */
542 regerror (int errcode, const regex_t *_Restrict_ preg,
543 char *_Restrict_ errbuf, size_t errbuf_size)
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 msg_size = strlen (msg) + 1; /* Includes the null. */
562 if (BE (errbuf_size != 0, 1))
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
570 memcpy (errbuf, msg, cpy_size);
576 weak_alias (__regerror, regerror)
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset_t utf8_sb_map =
587 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
610 free_dfa_content (re_dfa_t *dfa)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
638 re_dfastate_t *state = entry->array[j];
641 re_free (entry->array);
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
648 re_free (dfa->subexp_map);
650 re_free (dfa->re_str);
657 /* Free dynamically allocated space used by PREG. */
663 re_dfa_t *dfa = preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
672 re_free (preg->translate);
673 preg->translate = NULL;
676 weak_alias (__regfree, regfree)
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
682 #if defined _REGEX_RE_COMP || defined _LIBC
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
707 if (re_comp_buf.buffer)
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
716 if (re_comp_buf.fastmap == NULL)
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740 libc_freeres_fn (free_mem)
742 __regfree (&re_comp_buf);
746 #endif /* _REGEX_RE_COMP */
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
756 reg_errcode_t err = REG_NOERROR;
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
769 /* Initialize the dfa. */
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
780 preg->allocated = sizeof (re_dfa_t);
783 preg->used = sizeof (re_dfa_t);
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
788 free_dfa_content (dfa);
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
799 __libc_lock_init (dfa->lock);
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
814 /* Parse the regular expression, and build a structure tree. */
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
838 if (BE (err != REG_NOERROR, 0))
840 free_dfa_content (dfa);
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 __re_size_t table_size;
856 const char *codeset_name;
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 size_t max_i18n_object_size = 0;
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
870 memset (dfa, '\0', sizeof (re_dfa_t));
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
893 dfa->mb_cur_max = MB_CUR_MAX;
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
901 codeset_name = nl_langinfo (CODESET);
902 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
903 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
908 /* We check exhaustively in the loop below if this charset is a
909 superset of ASCII. */
910 dfa->map_notascii = 0;
913 #ifdef RE_ENABLE_I18N
914 if (dfa->mb_cur_max > 1)
917 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
922 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
923 if (BE (dfa->sb_char == NULL, 0))
926 /* Set the bits corresponding to single byte chars. */
927 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
930 wint_t wch = __btowc (ch);
932 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
934 if (isascii (ch) && wch != ch)
935 dfa->map_notascii = 1;
942 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
947 /* Initialize WORD_CHAR table, which indicate which character is
948 "word". In this case "word" means that it is the word construction
949 character used by some operators like "\<", "\>", etc. */
953 init_word_char (re_dfa_t *dfa)
955 dfa->word_ops_used = 1;
959 if (BE (dfa->map_notascii == 0, 1))
961 bitset_word_t bits0 = 0x00000000;
962 bitset_word_t bits1 = 0x03ff0000;
963 bitset_word_t bits2 = 0x87fffffe;
964 bitset_word_t bits3 = 0x07fffffe;
965 if (BITSET_WORD_BITS == 64)
967 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
968 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
971 else if (BITSET_WORD_BITS == 32)
973 dfa->word_char[0] = bits0;
974 dfa->word_char[1] = bits1;
975 dfa->word_char[2] = bits2;
976 dfa->word_char[3] = bits3;
983 if (BE (dfa->is_utf8, 1))
985 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
991 for (; i < BITSET_WORDS; ++i)
992 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
993 if (isalnum (ch) || ch == '_')
994 dfa->word_char[i] |= (bitset_word_t) 1 << j;
997 /* Free the work area which are only used while compiling. */
1000 free_workarea_compile (regex_t *preg)
1002 re_dfa_t *dfa = preg->buffer;
1003 bin_tree_storage_t *storage, *next;
1004 for (storage = dfa->str_tree_storage; storage; storage = next)
1006 next = storage->next;
1009 dfa->str_tree_storage = NULL;
1010 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1011 dfa->str_tree = NULL;
1012 re_free (dfa->org_indices);
1013 dfa->org_indices = NULL;
1016 /* Create initial states for all contexts. */
1018 static reg_errcode_t
1019 create_initial_state (re_dfa_t *dfa)
1023 re_node_set init_nodes;
1025 /* Initial states have the epsilon closure of the node which is
1026 the first node of the regular expression. */
1027 first = dfa->str_tree->first->node_idx;
1028 dfa->init_node = first;
1029 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1030 if (BE (err != REG_NOERROR, 0))
1033 /* The back-references which are in initial states can epsilon transit,
1034 since in this case all of the subexpressions can be null.
1035 Then we add epsilon closures of the nodes which are the next nodes of
1036 the back-references. */
1037 if (dfa->nbackref > 0)
1038 for (i = 0; i < init_nodes.nelem; ++i)
1040 Idx node_idx = init_nodes.elems[i];
1041 re_token_type_t type = dfa->nodes[node_idx].type;
1044 if (type != OP_BACK_REF)
1046 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1048 re_token_t *clexp_node;
1049 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1050 if (clexp_node->type == OP_CLOSE_SUBEXP
1051 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1054 if (clexp_idx == init_nodes.nelem)
1057 if (type == OP_BACK_REF)
1059 Idx dest_idx = dfa->edests[node_idx].elems[0];
1060 if (!re_node_set_contains (&init_nodes, dest_idx))
1062 reg_errcode_t merge_err
1063 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1064 if (merge_err != REG_NOERROR)
1071 /* It must be the first time to invoke acquire_state. */
1072 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1073 /* We don't check ERR here, since the initial state must not be NULL. */
1074 if (BE (dfa->init_state == NULL, 0))
1076 if (dfa->init_state->has_constraint)
1078 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1080 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1082 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1086 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1087 || dfa->init_state_begbuf == NULL, 0))
1091 dfa->init_state_word = dfa->init_state_nl
1092 = dfa->init_state_begbuf = dfa->init_state;
1094 re_node_set_free (&init_nodes);
1098 #ifdef RE_ENABLE_I18N
1099 /* If it is possible to do searching in single byte encoding instead of UTF-8
1100 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101 DFA nodes where needed. */
1104 optimize_utf8 (re_dfa_t *dfa)
1108 bool mb_chars = false;
1109 bool has_period = false;
1111 for (node = 0; node < dfa->nodes_len; ++node)
1112 switch (dfa->nodes[node].type)
1115 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1119 switch (dfa->nodes[node].opr.ctx_type)
1127 /* Word anchors etc. cannot be handled. It's okay to test
1128 opr.ctx_type since constraints (for all DFA nodes) are
1129 created by ORing one or more opr.ctx_type values. */
1139 case OP_DUP_ASTERISK:
1140 case OP_OPEN_SUBEXP:
1141 case OP_CLOSE_SUBEXP:
1143 case COMPLEX_BRACKET:
1145 case SIMPLE_BRACKET:
1146 /* Just double check. */
1148 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1150 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1153 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1163 if (mb_chars || has_period)
1164 for (node = 0; node < dfa->nodes_len; ++node)
1166 if (dfa->nodes[node].type == CHARACTER
1167 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1168 dfa->nodes[node].mb_partial = 0;
1169 else if (dfa->nodes[node].type == OP_PERIOD)
1170 dfa->nodes[node].type = OP_UTF8_PERIOD;
1173 /* The search can be in single byte locale. */
1174 dfa->mb_cur_max = 1;
1176 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1180 /* Analyze the structure tree, and calculate "first", "next", "edest",
1181 "eclosure", and "inveclosure". */
1183 static reg_errcode_t
1184 analyze (regex_t *preg)
1186 re_dfa_t *dfa = preg->buffer;
1189 /* Allocate arrays. */
1190 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1191 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1192 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1193 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1194 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1195 || dfa->eclosures == NULL, 0))
1198 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1199 if (dfa->subexp_map != NULL)
1202 for (i = 0; i < preg->re_nsub; i++)
1203 dfa->subexp_map[i] = i;
1204 preorder (dfa->str_tree, optimize_subexps, dfa);
1205 for (i = 0; i < preg->re_nsub; i++)
1206 if (dfa->subexp_map[i] != i)
1208 if (i == preg->re_nsub)
1210 free (dfa->subexp_map);
1211 dfa->subexp_map = NULL;
1215 ret = postorder (dfa->str_tree, lower_subexps, preg);
1216 if (BE (ret != REG_NOERROR, 0))
1218 ret = postorder (dfa->str_tree, calc_first, dfa);
1219 if (BE (ret != REG_NOERROR, 0))
1221 preorder (dfa->str_tree, calc_next, dfa);
1222 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1223 if (BE (ret != REG_NOERROR, 0))
1225 ret = calc_eclosure (dfa);
1226 if (BE (ret != REG_NOERROR, 0))
1229 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1230 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1231 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1234 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1235 if (BE (dfa->inveclosures == NULL, 0))
1237 ret = calc_inveclosure (dfa);
1243 /* Our parse trees are very unbalanced, so we cannot use a stack to
1244 implement parse tree visits. Instead, we use parent pointers and
1245 some hairy code in these two functions. */
1246 static reg_errcode_t
1247 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1250 bin_tree_t *node, *prev;
1252 for (node = root; ; )
1254 /* Descend down the tree, preferably to the left (or to the right
1255 if that's the only child). */
1256 while (node->left || node->right)
1264 reg_errcode_t err = fn (extra, node);
1265 if (BE (err != REG_NOERROR, 0))
1267 if (node->parent == NULL)
1270 node = node->parent;
1272 /* Go up while we have a node that is reached from the right. */
1273 while (node->right == prev || node->right == NULL);
1278 static reg_errcode_t
1279 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1284 for (node = root; ; )
1286 reg_errcode_t err = fn (extra, node);
1287 if (BE (err != REG_NOERROR, 0))
1290 /* Go to the left node, or up and to the right. */
1295 bin_tree_t *prev = NULL;
1296 while (node->right == prev || node->right == NULL)
1299 node = node->parent;
1308 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1309 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1310 backreferences as well. Requires a preorder visit. */
1311 static reg_errcode_t
1312 optimize_subexps (void *extra, bin_tree_t *node)
1314 re_dfa_t *dfa = (re_dfa_t *) extra;
1316 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318 int idx = node->token.opr.idx;
1319 node->token.opr.idx = dfa->subexp_map[idx];
1320 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1323 else if (node->token.type == SUBEXP
1324 && node->left && node->left->token.type == SUBEXP)
1326 Idx other_idx = node->left->token.opr.idx;
1328 node->left = node->left->left;
1330 node->left->parent = node;
1332 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1333 if (other_idx < BITSET_WORD_BITS)
1334 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1340 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1341 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1342 static reg_errcode_t
1343 lower_subexps (void *extra, bin_tree_t *node)
1345 regex_t *preg = (regex_t *) extra;
1346 reg_errcode_t err = REG_NOERROR;
1348 if (node->left && node->left->token.type == SUBEXP)
1350 node->left = lower_subexp (&err, preg, node->left);
1352 node->left->parent = node;
1354 if (node->right && node->right->token.type == SUBEXP)
1356 node->right = lower_subexp (&err, preg, node->right);
1358 node->right->parent = node;
1365 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1367 re_dfa_t *dfa = preg->buffer;
1368 bin_tree_t *body = node->left;
1369 bin_tree_t *op, *cls, *tree1, *tree;
1372 /* We do not optimize empty subexpressions, because otherwise we may
1373 have bad CONCAT nodes with NULL children. This is obviously not
1374 very common, so we do not lose much. An example that triggers
1375 this case is the sed "script" /\(\)/x. */
1376 && node->left != NULL
1377 && (node->token.opr.idx >= BITSET_WORD_BITS
1378 || !(dfa->used_bkref_map
1379 & ((bitset_word_t) 1 << node->token.opr.idx))))
1382 /* Convert the SUBEXP node to the concatenation of an
1383 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1384 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1385 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1386 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1387 tree = create_tree (dfa, op, tree1, CONCAT);
1388 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1394 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1395 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1399 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1400 nodes. Requires a postorder visit. */
1401 static reg_errcode_t
1402 calc_first (void *extra, bin_tree_t *node)
1404 re_dfa_t *dfa = (re_dfa_t *) extra;
1405 if (node->token.type == CONCAT)
1407 node->first = node->left->first;
1408 node->node_idx = node->left->node_idx;
1413 node->node_idx = re_dfa_add_node (dfa, node->token);
1414 if (BE (node->node_idx == REG_MISSING, 0))
1416 if (node->token.type == ANCHOR)
1417 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1422 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1423 static reg_errcode_t
1424 calc_next (void *extra, bin_tree_t *node)
1426 switch (node->token.type)
1428 case OP_DUP_ASTERISK:
1429 node->left->next = node;
1432 node->left->next = node->right->first;
1433 node->right->next = node->next;
1437 node->left->next = node->next;
1439 node->right->next = node->next;
1445 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1446 static reg_errcode_t
1447 link_nfa_nodes (void *extra, bin_tree_t *node)
1449 re_dfa_t *dfa = (re_dfa_t *) extra;
1450 Idx idx = node->node_idx;
1451 reg_errcode_t err = REG_NOERROR;
1453 switch (node->token.type)
1459 assert (node->next == NULL);
1462 case OP_DUP_ASTERISK:
1466 dfa->has_plural_match = 1;
1467 if (node->left != NULL)
1468 left = node->left->first->node_idx;
1470 left = node->next->node_idx;
1471 if (node->right != NULL)
1472 right = node->right->first->node_idx;
1474 right = node->next->node_idx;
1475 assert (REG_VALID_INDEX (left));
1476 assert (REG_VALID_INDEX (right));
1477 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1482 case OP_OPEN_SUBEXP:
1483 case OP_CLOSE_SUBEXP:
1484 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1488 dfa->nexts[idx] = node->next->node_idx;
1489 if (node->token.type == OP_BACK_REF)
1490 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1494 assert (!IS_EPSILON_NODE (node->token.type));
1495 dfa->nexts[idx] = node->next->node_idx;
1502 /* Duplicate the epsilon closure of the node ROOT_NODE.
1503 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1504 to their own constraint. */
1506 static reg_errcode_t
1508 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1509 Idx root_node, unsigned int init_constraint)
1511 Idx org_node, clone_node;
1513 unsigned int constraint = init_constraint;
1514 for (org_node = top_org_node, clone_node = top_clone_node;;)
1516 Idx org_dest, clone_dest;
1517 if (dfa->nodes[org_node].type == OP_BACK_REF)
1519 /* If the back reference epsilon-transit, its destination must
1520 also have the constraint. Then duplicate the epsilon closure
1521 of the destination of the back reference, and store it in
1522 edests of the back reference. */
1523 org_dest = dfa->nexts[org_node];
1524 re_node_set_empty (dfa->edests + clone_node);
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == REG_MISSING, 0))
1528 dfa->nexts[clone_node] = dfa->nexts[org_node];
1529 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533 else if (dfa->edests[org_node].nelem == 0)
1535 /* In case of the node can't epsilon-transit, don't duplicate the
1536 destination and store the original destination as the
1537 destination of the node. */
1538 dfa->nexts[clone_node] = dfa->nexts[org_node];
1541 else if (dfa->edests[org_node].nelem == 1)
1543 /* In case of the node can epsilon-transit, and it has only one
1545 org_dest = dfa->edests[org_node].elems[0];
1546 re_node_set_empty (dfa->edests + clone_node);
1547 /* If the node is root_node itself, it means the epsilon closure
1548 has a loop. Then tie it to the destination of the root_node. */
1549 if (org_node == root_node && clone_node != org_node)
1551 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1556 /* In case the node has another constraint, append it. */
1557 constraint |= dfa->nodes[org_node].constraint;
1558 clone_dest = duplicate_node (dfa, org_dest, constraint);
1559 if (BE (clone_dest == REG_MISSING, 0))
1561 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 else /* dfa->edests[org_node].nelem == 2 */
1567 /* In case of the node can epsilon-transit, and it has two
1568 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1569 org_dest = dfa->edests[org_node].elems[0];
1570 re_node_set_empty (dfa->edests + clone_node);
1571 /* Search for a duplicated node which satisfies the constraint. */
1572 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1573 if (clone_dest == REG_MISSING)
1575 /* There is no such duplicated node, create a new one. */
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (BE (clone_dest == REG_MISSING, 0))
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1583 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1584 root_node, constraint);
1585 if (BE (err != REG_NOERROR, 0))
1590 /* There is a duplicated node which satisfies the constraint,
1591 use it to avoid infinite loop. */
1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1597 org_dest = dfa->edests[org_node].elems[1];
1598 clone_dest = duplicate_node (dfa, org_dest, constraint);
1599 if (BE (clone_dest == REG_MISSING, 0))
1601 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1605 org_node = org_dest;
1606 clone_node = clone_dest;
1611 /* Search for a node which is duplicated from the node ORG_NODE, and
1612 satisfies the constraint CONSTRAINT. */
1615 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1616 unsigned int constraint)
1619 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621 if (org_node == dfa->org_indices[idx]
1622 && constraint == dfa->nodes[idx].constraint)
1623 return idx; /* Found. */
1625 return REG_MISSING; /* Not found. */
1628 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1629 Return the index of the new node, or REG_MISSING if insufficient storage is
1633 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1636 if (BE (dup_idx != REG_MISSING, 1))
1638 dfa->nodes[dup_idx].constraint = constraint;
1639 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1640 dfa->nodes[dup_idx].duplicated = 1;
1642 /* Store the index of the original node. */
1643 dfa->org_indices[dup_idx] = org_idx;
1648 static reg_errcode_t
1649 calc_inveclosure (re_dfa_t *dfa)
1653 for (idx = 0; idx < dfa->nodes_len; ++idx)
1654 re_node_set_init_empty (dfa->inveclosures + idx);
1656 for (src = 0; src < dfa->nodes_len; ++src)
1658 Idx *elems = dfa->eclosures[src].elems;
1659 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1670 /* Calculate "eclosure" for all the node in DFA. */
1672 static reg_errcode_t
1673 calc_eclosure (re_dfa_t *dfa)
1678 assert (dfa->nodes_len > 0);
1681 /* For each nodes, calculate epsilon closure. */
1682 for (node_idx = 0; ; ++node_idx)
1685 re_node_set eclosure_elem;
1686 if (node_idx == dfa->nodes_len)
1695 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1698 /* If we have already calculated, skip it. */
1699 if (dfa->eclosures[node_idx].nelem != 0)
1701 /* Calculate epsilon closure of 'node_idx'. */
1702 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1703 if (BE (err != REG_NOERROR, 0))
1706 if (dfa->eclosures[node_idx].nelem == 0)
1709 re_node_set_free (&eclosure_elem);
1715 /* Calculate epsilon closure of NODE. */
1717 static reg_errcode_t
1718 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1722 re_node_set eclosure;
1724 bool incomplete = false;
1725 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1726 if (BE (err != REG_NOERROR, 0))
1729 /* This indicates that we are calculating this node now.
1730 We reference this value to avoid infinite loop. */
1731 dfa->eclosures[node].nelem = REG_MISSING;
1733 /* If the current node has constraints, duplicate all nodes
1734 since they must inherit the constraints. */
1735 if (dfa->nodes[node].constraint
1736 && dfa->edests[node].nelem
1737 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739 err = duplicate_node_closure (dfa, node, node, node,
1740 dfa->nodes[node].constraint);
1741 if (BE (err != REG_NOERROR, 0))
1745 /* Expand each epsilon destination nodes. */
1746 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1747 for (i = 0; i < dfa->edests[node].nelem; ++i)
1749 re_node_set eclosure_elem;
1750 Idx edest = dfa->edests[node].elems[i];
1751 /* If calculating the epsilon closure of 'edest' is in progress,
1752 return intermediate result. */
1753 if (dfa->eclosures[edest].nelem == REG_MISSING)
1758 /* If we haven't calculated the epsilon closure of 'edest' yet,
1759 calculate now. Otherwise use calculated epsilon closure. */
1760 if (dfa->eclosures[edest].nelem == 0)
1762 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1763 if (BE (err != REG_NOERROR, 0))
1767 eclosure_elem = dfa->eclosures[edest];
1768 /* Merge the epsilon closure of 'edest'. */
1769 err = re_node_set_merge (&eclosure, &eclosure_elem);
1770 if (BE (err != REG_NOERROR, 0))
1772 /* If the epsilon closure of 'edest' is incomplete,
1773 the epsilon closure of this node is also incomplete. */
1774 if (dfa->eclosures[edest].nelem == 0)
1777 re_node_set_free (&eclosure_elem);
1781 /* An epsilon closure includes itself. */
1782 ok = re_node_set_insert (&eclosure, node);
1785 if (incomplete && !root)
1786 dfa->eclosures[node].nelem = 0;
1788 dfa->eclosures[node] = eclosure;
1789 *new_set = eclosure;
1793 /* Functions for token which are used in the parser. */
1795 /* Fetch a token from INPUT.
1796 We must not use this function inside bracket expressions. */
1800 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1802 re_string_skip_bytes (input, peek_token (result, input, syntax));
1805 /* Peek a token from INPUT, and return the length of the token.
1806 We must not use this function inside bracket expressions. */
1810 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1814 if (re_string_eoi (input))
1816 token->type = END_OF_RE;
1820 c = re_string_peek_byte (input, 0);
1823 token->word_char = 0;
1824 #ifdef RE_ENABLE_I18N
1825 token->mb_partial = 0;
1826 if (input->mb_cur_max > 1 &&
1827 !re_string_first_byte (input, re_string_cur_idx (input)))
1829 token->type = CHARACTER;
1830 token->mb_partial = 1;
1837 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1839 token->type = BACK_SLASH;
1843 c2 = re_string_peek_byte_case (input, 1);
1845 token->type = CHARACTER;
1846 #ifdef RE_ENABLE_I18N
1847 if (input->mb_cur_max > 1)
1849 wint_t wc = re_string_wchar_at (input,
1850 re_string_cur_idx (input) + 1);
1851 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1855 token->word_char = IS_WORD_CHAR (c2) != 0;
1860 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1861 token->type = OP_ALT;
1863 case '1': case '2': case '3': case '4': case '5':
1864 case '6': case '7': case '8': case '9':
1865 if (!(syntax & RE_NO_BK_REFS))
1867 token->type = OP_BACK_REF;
1868 token->opr.idx = c2 - '1';
1872 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = WORD_FIRST;
1879 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = WORD_LAST;
1886 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = ANCHOR;
1889 token->opr.ctx_type = WORD_DELIM;
1893 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = ANCHOR;
1896 token->opr.ctx_type = NOT_WORD_DELIM;
1900 if (!(syntax & RE_NO_GNU_OPS))
1901 token->type = OP_WORD;
1904 if (!(syntax & RE_NO_GNU_OPS))
1905 token->type = OP_NOTWORD;
1908 if (!(syntax & RE_NO_GNU_OPS))
1909 token->type = OP_SPACE;
1912 if (!(syntax & RE_NO_GNU_OPS))
1913 token->type = OP_NOTSPACE;
1916 if (!(syntax & RE_NO_GNU_OPS))
1918 token->type = ANCHOR;
1919 token->opr.ctx_type = BUF_FIRST;
1923 if (!(syntax & RE_NO_GNU_OPS))
1925 token->type = ANCHOR;
1926 token->opr.ctx_type = BUF_LAST;
1930 if (!(syntax & RE_NO_BK_PARENS))
1931 token->type = OP_OPEN_SUBEXP;
1934 if (!(syntax & RE_NO_BK_PARENS))
1935 token->type = OP_CLOSE_SUBEXP;
1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1939 token->type = OP_DUP_PLUS;
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1943 token->type = OP_DUP_QUESTION;
1946 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1947 token->type = OP_OPEN_DUP_NUM;
1950 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1951 token->type = OP_CLOSE_DUP_NUM;
1959 token->type = CHARACTER;
1960 #ifdef RE_ENABLE_I18N
1961 if (input->mb_cur_max > 1)
1963 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1964 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1968 token->word_char = IS_WORD_CHAR (token->opr.c);
1973 if (syntax & RE_NEWLINE_ALT)
1974 token->type = OP_ALT;
1977 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1978 token->type = OP_ALT;
1981 token->type = OP_DUP_ASTERISK;
1984 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1985 token->type = OP_DUP_PLUS;
1988 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1989 token->type = OP_DUP_QUESTION;
1992 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1993 token->type = OP_OPEN_DUP_NUM;
1996 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1997 token->type = OP_CLOSE_DUP_NUM;
2000 if (syntax & RE_NO_BK_PARENS)
2001 token->type = OP_OPEN_SUBEXP;
2004 if (syntax & RE_NO_BK_PARENS)
2005 token->type = OP_CLOSE_SUBEXP;
2008 token->type = OP_OPEN_BRACKET;
2011 token->type = OP_PERIOD;
2014 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2015 re_string_cur_idx (input) != 0)
2017 char prev = re_string_peek_byte (input, -1);
2018 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2021 token->type = ANCHOR;
2022 token->opr.ctx_type = LINE_FIRST;
2025 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2026 re_string_cur_idx (input) + 1 != re_string_length (input))
2029 re_string_skip_bytes (input, 1);
2030 peek_token (&next, input, syntax);
2031 re_string_skip_bytes (input, -1);
2032 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2035 token->type = ANCHOR;
2036 token->opr.ctx_type = LINE_LAST;
2044 /* Peek a token from INPUT, and return the length of the token.
2045 We must not use this function out of bracket expressions. */
2049 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2052 if (re_string_eoi (input))
2054 token->type = END_OF_RE;
2057 c = re_string_peek_byte (input, 0);
2060 #ifdef RE_ENABLE_I18N
2061 if (input->mb_cur_max > 1 &&
2062 !re_string_first_byte (input, re_string_cur_idx (input)))
2064 token->type = CHARACTER;
2067 #endif /* RE_ENABLE_I18N */
2069 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2070 && re_string_cur_idx (input) + 1 < re_string_length (input))
2072 /* In this case, '\' escape a character. */
2074 re_string_skip_bytes (input, 1);
2075 c2 = re_string_peek_byte (input, 0);
2077 token->type = CHARACTER;
2080 if (c == '[') /* '[' is a special char in a bracket exps. */
2084 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2085 c2 = re_string_peek_byte (input, 1);
2093 token->type = OP_OPEN_COLL_ELEM;
2096 token->type = OP_OPEN_EQUIV_CLASS;
2099 if (syntax & RE_CHAR_CLASSES)
2101 token->type = OP_OPEN_CHAR_CLASS;
2104 /* else fall through. */
2106 token->type = CHARACTER;
2116 token->type = OP_CHARSET_RANGE;
2119 token->type = OP_CLOSE_BRACKET;
2122 token->type = OP_NON_MATCH_LIST;
2125 token->type = CHARACTER;
2130 /* Functions for parser. */
2132 /* Entry point of the parser.
2133 Parse the regular expression REGEXP and return the structure tree.
2134 If an error occurs, ERR is set by error code, and return NULL.
2135 This function build the following tree, from regular expression <reg_exp>:
2141 CAT means concatenation.
2142 EOR means end of regular expression. */
2145 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2148 re_dfa_t *dfa = preg->buffer;
2149 bin_tree_t *tree, *eor, *root;
2150 re_token_t current_token;
2151 dfa->syntax = syntax;
2152 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2156 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2158 root = create_tree (dfa, tree, eor, CONCAT);
2161 if (BE (eor == NULL || root == NULL, 0))
2169 /* This function build the following tree, from regular expression
2170 <branch1>|<branch2>:
2176 ALT means alternative, which represents the operator '|'. */
2179 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2182 re_dfa_t *dfa = preg->buffer;
2183 bin_tree_t *tree, *branch = NULL;
2184 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2185 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188 while (token->type == OP_ALT)
2190 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2191 if (token->type != OP_ALT && token->type != END_OF_RE
2192 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2200 tree = create_tree (dfa, tree, branch, OP_ALT);
2201 if (BE (tree == NULL, 0))
2210 /* This function build the following tree, from regular expression
2217 CAT means concatenation. */
2220 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223 bin_tree_t *tree, *expr;
2224 re_dfa_t *dfa = preg->buffer;
2225 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2226 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2229 while (token->type != OP_ALT && token->type != END_OF_RE
2230 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2232 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2233 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2236 postorder (tree, free_tree, NULL);
2239 if (tree != NULL && expr != NULL)
2241 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2242 if (newtree == NULL)
2244 postorder (expr, free_tree, NULL);
2245 postorder (tree, free_tree, NULL);
2251 else if (tree == NULL)
2253 /* Otherwise expr == NULL, we don't need to create new tree. */
2258 /* This function build the following tree, from regular expression a*:
2265 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2266 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2268 re_dfa_t *dfa = preg->buffer;
2270 switch (token->type)
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2279 #ifdef RE_ENABLE_I18N
2280 if (dfa->mb_cur_max > 1)
2282 while (!re_string_eoi (regexp)
2283 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285 bin_tree_t *mbc_remain;
2286 fetch_token (token, regexp, syntax);
2287 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2288 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2289 if (BE (mbc_remain == NULL || tree == NULL, 0))
2298 case OP_OPEN_SUBEXP:
2299 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2300 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303 case OP_OPEN_BRACKET:
2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2309 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2314 dfa->used_bkref_map |= 1 << token->opr.idx;
2315 tree = create_token_tree (dfa, NULL, NULL, token);
2316 if (BE (tree == NULL, 0))
2322 dfa->has_mb_node = 1;
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2331 case OP_DUP_ASTERISK:
2333 case OP_DUP_QUESTION:
2334 if (syntax & RE_CONTEXT_INVALID_OPS)
2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
2341 fetch_token (token, regexp, syntax);
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2344 /* else fall through */
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2352 /* else fall through */
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
2358 /* mb_partial and word_char bits should be initialized already
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2368 if ((token->opr.ctx_type
2369 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2370 && dfa->word_ops_used == 0)
2371 init_word_char (dfa);
2372 if (token->opr.ctx_type == WORD_DELIM
2373 || token->opr.ctx_type == NOT_WORD_DELIM)
2375 bin_tree_t *tree_first, *tree_last;
2376 if (token->opr.ctx_type == WORD_DELIM)
2378 token->opr.ctx_type = WORD_FIRST;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = WORD_LAST;
2384 token->opr.ctx_type = INSIDE_WORD;
2385 tree_first = create_token_tree (dfa, NULL, NULL, token);
2386 token->opr.ctx_type = INSIDE_NOTWORD;
2388 tree_last = create_token_tree (dfa, NULL, NULL, token);
2389 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2390 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2398 tree = create_token_tree (dfa, NULL, NULL, token);
2399 if (BE (tree == NULL, 0))
2405 /* We must return here, since ANCHORs can't be followed
2406 by repetition operators.
2407 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2408 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2409 fetch_token (token, regexp, syntax);
2412 tree = create_token_tree (dfa, NULL, NULL, token);
2413 if (BE (tree == NULL, 0))
2418 if (dfa->mb_cur_max > 1)
2419 dfa->has_mb_node = 1;
2423 tree = build_charclass_op (dfa, regexp->trans,
2424 (const unsigned char *) "alnum",
2425 (const unsigned char *) "_",
2426 token->type == OP_NOTWORD, err);
2427 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2432 tree = build_charclass_op (dfa, regexp->trans,
2433 (const unsigned char *) "space",
2434 (const unsigned char *) "",
2435 token->type == OP_NOTSPACE, err);
2436 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2446 /* Must not happen? */
2452 fetch_token (token, regexp, syntax);
2454 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2455 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2457 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2458 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460 /* In BRE consecutive duplications are not allowed. */
2461 if ((syntax & RE_CONTEXT_INVALID_DUP)
2462 && (token->type == OP_DUP_ASTERISK
2463 || token->type == OP_OPEN_DUP_NUM))
2473 /* This function build the following tree, from regular expression
2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2482 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2484 re_dfa_t *dfa = preg->buffer;
2487 cur_nsub = preg->re_nsub++;
2489 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2491 /* The subexpression may be a null string. */
2492 if (token->type == OP_CLOSE_SUBEXP)
2496 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2500 postorder (tree, free_tree, NULL);
2503 if (BE (*err != REG_NOERROR, 0))
2507 if (cur_nsub <= '9' - '1')
2508 dfa->completed_bkref_map |= 1 << cur_nsub;
2510 tree = create_tree (dfa, tree, NULL, SUBEXP);
2511 if (BE (tree == NULL, 0))
2516 tree->token.opr.idx = cur_nsub;
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2526 bin_tree_t *tree = NULL, *old_tree = NULL;
2527 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528 re_token_t start_token = *token;
2530 if (token->type == OP_OPEN_DUP_NUM)
2533 start = fetch_number (regexp, token, syntax);
2534 if (start == REG_MISSING)
2536 if (token->type == CHARACTER && token->opr.c == ',')
2537 start = 0; /* We treat "{,m}" as "{0,m}". */
2540 *err = REG_BADBR; /* <re>{} is invalid. */
2544 if (BE (start != REG_ERROR, 1))
2546 /* We treat "{n}" as "{n,n}". */
2547 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548 : ((token->type == CHARACTER && token->opr.c == ',')
2549 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2551 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2553 /* Invalid sequence. */
2554 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2556 if (token->type == END_OF_RE)
2564 /* If the syntax bit is set, rollback. */
2565 re_string_set_index (regexp, start_idx);
2566 *token = start_token;
2567 token->type = CHARACTER;
2568 /* mb_partial and word_char bits should be already initialized by
2573 if (BE ((end != REG_MISSING && start > end)
2574 || token->type != OP_CLOSE_DUP_NUM, 0))
2576 /* First number greater than second. */
2581 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2589 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2593 fetch_token (token, regexp, syntax);
2595 if (BE (elem == NULL, 0))
2597 if (BE (start == 0 && end == 0, 0))
2599 postorder (elem, free_tree, NULL);
2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2604 if (BE (start > 0, 0))
2607 for (i = 2; i <= start; ++i)
2609 elem = duplicate_tree (elem, dfa);
2610 tree = create_tree (dfa, tree, elem, CONCAT);
2611 if (BE (elem == NULL || tree == NULL, 0))
2612 goto parse_dup_op_espace;
2618 /* Duplicate ELEM before it is marked optional. */
2619 elem = duplicate_tree (elem, dfa);
2625 if (elem->token.type == SUBEXP)
2627 uintptr_t subidx = elem->token.opr.idx;
2628 postorder (elem, mark_opt_subexp, (void *) subidx);
2631 tree = create_tree (dfa, elem, NULL,
2632 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633 if (BE (tree == NULL, 0))
2634 goto parse_dup_op_espace;
2636 /* From gnulib's "intprops.h":
2637 True if the arithmetic type T is signed. */
2638 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2640 /* This loop is actually executed only when end != REG_MISSING,
2641 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2642 already created the start+1-th copy. */
2643 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2644 for (i = start + 2; i <= end; ++i)
2646 elem = duplicate_tree (elem, dfa);
2647 tree = create_tree (dfa, tree, elem, CONCAT);
2648 if (BE (elem == NULL || tree == NULL, 0))
2649 goto parse_dup_op_espace;
2651 tree = create_tree (dfa, tree, NULL, OP_ALT);
2652 if (BE (tree == NULL, 0))
2653 goto parse_dup_op_espace;
2657 tree = create_tree (dfa, old_tree, tree, CONCAT);
2661 parse_dup_op_espace:
2666 /* Size of the names for collating symbol/equivalence_class/character_class.
2667 I'm not sure, but maybe enough. */
2668 #define BRACKET_NAME_BUF_SIZE 32
2671 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672 Build the range expression which starts from START_ELEM, and ends
2673 at END_ELEM. The result are written to MBCSET and SBCSET.
2674 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2675 mbcset->range_ends, is a pointer argument since we may
2678 static reg_errcode_t
2680 # ifdef RE_ENABLE_I18N
2681 build_range_exp (const reg_syntax_t syntax,
2683 re_charset_t *mbcset,
2685 const bracket_elem_t *start_elem,
2686 const bracket_elem_t *end_elem)
2687 # else /* not RE_ENABLE_I18N */
2688 build_range_exp (const reg_syntax_t syntax,
2690 const bracket_elem_t *start_elem,
2691 const bracket_elem_t *end_elem)
2692 # endif /* not RE_ENABLE_I18N */
2694 unsigned int start_ch, end_ch;
2695 /* Equivalence Classes and Character Classes can't be a range start/end. */
2696 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2697 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2701 /* We can handle no multi character collating elements without libc
2703 if (BE ((start_elem->type == COLL_SYM
2704 && strlen ((char *) start_elem->opr.name) > 1)
2705 || (end_elem->type == COLL_SYM
2706 && strlen ((char *) end_elem->opr.name) > 1), 0))
2707 return REG_ECOLLATE;
2709 # ifdef RE_ENABLE_I18N
2714 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 ? __btowc (start_ch) : start_elem->opr.wch);
2724 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 ? __btowc (end_ch) : end_elem->opr.wch);
2726 if (start_wc == WEOF || end_wc == WEOF)
2727 return REG_ECOLLATE;
2728 cmp_buf[0] = start_wc;
2729 cmp_buf[4] = end_wc;
2731 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2735 /* Got valid collation sequence values, add them as a new entry.
2736 However, for !_LIBC we have no collation elements: if the
2737 character set is single byte, the single byte character set
2738 that we build below suffices. parse_bracket_exp passes
2739 no MBCSET if dfa->mb_cur_max == 1. */
2742 /* Check the space of the arrays. */
2743 if (BE (*range_alloc == mbcset->nranges, 0))
2745 /* There is not enough space, need realloc. */
2746 wchar_t *new_array_start, *new_array_end;
2749 /* +1 in case of mbcset->nranges is 0. */
2750 new_nranges = 2 * mbcset->nranges + 1;
2751 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2752 are NULL if *range_alloc == 0. */
2753 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2758 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2761 mbcset->range_starts = new_array_start;
2762 mbcset->range_ends = new_array_end;
2763 *range_alloc = new_nranges;
2766 mbcset->range_starts[mbcset->nranges] = start_wc;
2767 mbcset->range_ends[mbcset->nranges++] = end_wc;
2770 /* Build the table for single byte characters. */
2771 for (wc = 0; wc < SBC_MAX; ++wc)
2774 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776 bitset_set (sbcset, wc);
2779 # else /* not RE_ENABLE_I18N */
2782 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2785 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2788 if (start_ch > end_ch)
2790 /* Build the table for single byte characters. */
2791 for (ch = 0; ch < SBC_MAX; ++ch)
2792 if (start_ch <= ch && ch <= end_ch)
2793 bitset_set (sbcset, ch);
2795 # endif /* not RE_ENABLE_I18N */
2798 #endif /* not _LIBC */
2801 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802 Build the collating element which is represented by NAME.
2803 The result are written to MBCSET and SBCSET.
2804 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805 pointer argument since we may update it. */
2807 static reg_errcode_t
2809 # ifdef RE_ENABLE_I18N
2810 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2811 Idx *coll_sym_alloc, const unsigned char *name)
2812 # else /* not RE_ENABLE_I18N */
2813 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2814 # endif /* not RE_ENABLE_I18N */
2816 size_t name_len = strlen ((const char *) name);
2817 if (BE (name_len != 1, 0))
2818 return REG_ECOLLATE;
2821 bitset_set (sbcset, name[0]);
2825 #endif /* not _LIBC */
2827 /* This function parse bracket expression like "[abc]", "[a-c]",
2831 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2832 reg_syntax_t syntax, reg_errcode_t *err)
2835 const unsigned char *collseqmb;
2836 const char *collseqwc;
2839 const int32_t *symb_table;
2840 const unsigned char *extra;
2842 /* Local function for parse_bracket_exp used in _LIBC environment.
2843 Seek the collating symbol entry corresponding to NAME.
2844 Return the index of the symbol in the SYMB_TABLE. */
2847 __attribute ((always_inline))
2848 seek_collating_symbol_entry (name, name_len)
2849 const unsigned char *name;
2852 int32_t hash = elem_hash ((const char *) name, name_len);
2853 int32_t elem = hash % table_size;
2854 if (symb_table[2 * elem] != 0)
2856 int32_t second = hash % (table_size - 2) + 1;
2860 /* First compare the hashing value. */
2861 if (symb_table[2 * elem] == hash
2862 /* Compare the length of the name. */
2863 && name_len == extra[symb_table[2 * elem + 1]]
2864 /* Compare the name. */
2865 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2868 /* Yep, this is the entry. */
2875 while (symb_table[2 * elem] != 0);
2880 /* Local function for parse_bracket_exp used in _LIBC environment.
2881 Look up the collation sequence value of BR_ELEM.
2882 Return the value if succeeded, UINT_MAX otherwise. */
2884 auto inline unsigned int
2885 __attribute ((always_inline))
2886 lookup_collation_sequence_value (br_elem)
2887 bracket_elem_t *br_elem;
2889 if (br_elem->type == SB_CHAR)
2892 if (MB_CUR_MAX == 1)
2895 return collseqmb[br_elem->opr.ch];
2898 wint_t wc = __btowc (br_elem->opr.ch);
2899 return __collseq_table_lookup (collseqwc, wc);
2902 else if (br_elem->type == MB_CHAR)
2905 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2907 else if (br_elem->type == COLL_SYM)
2909 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2913 elem = seek_collating_symbol_entry (br_elem->opr.name,
2915 if (symb_table[2 * elem] != 0)
2917 /* We found the entry. */
2918 idx = symb_table[2 * elem + 1];
2919 /* Skip the name of collating element name. */
2920 idx += 1 + extra[idx];
2921 /* Skip the byte sequence of the collating element. */
2922 idx += 1 + extra[idx];
2923 /* Adjust for the alignment. */
2924 idx = (idx + 3) & ~3;
2925 /* Skip the multibyte collation sequence value. */
2926 idx += sizeof (unsigned int);
2927 /* Skip the wide char sequence of the collating element. */
2928 idx += sizeof (unsigned int) *
2929 (1 + *(unsigned int *) (extra + idx));
2930 /* Return the collation sequence value. */
2931 return *(unsigned int *) (extra + idx);
2933 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2935 /* No valid character. Match it as a single byte
2937 return collseqmb[br_elem->opr.name[0]];
2940 else if (sym_name_len == 1)
2941 return collseqmb[br_elem->opr.name[0]];
2946 /* Local function for parse_bracket_exp used in _LIBC environment.
2947 Build the range expression which starts from START_ELEM, and ends
2948 at END_ELEM. The result are written to MBCSET and SBCSET.
2949 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2950 mbcset->range_ends, is a pointer argument since we may
2953 auto inline reg_errcode_t
2954 __attribute ((always_inline))
2955 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2956 re_charset_t *mbcset;
2959 bracket_elem_t *start_elem, *end_elem;
2962 uint32_t start_collseq;
2963 uint32_t end_collseq;
2965 /* Equivalence Classes and Character Classes can't be a range
2967 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2968 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2972 start_collseq = lookup_collation_sequence_value (start_elem);
2973 end_collseq = lookup_collation_sequence_value (end_elem);
2974 /* Check start/end collation sequence values. */
2975 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976 return REG_ECOLLATE;
2977 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2980 /* Got valid collation sequence values, add them as a new entry.
2981 However, if we have no collation elements, and the character set
2982 is single byte, the single byte character set that we
2983 build below suffices. */
2984 if (nrules > 0 || dfa->mb_cur_max > 1)
2986 /* Check the space of the arrays. */
2987 if (BE (*range_alloc == mbcset->nranges, 0))
2989 /* There is not enough space, need realloc. */
2990 uint32_t *new_array_start;
2991 uint32_t *new_array_end;
2994 /* +1 in case of mbcset->nranges is 0. */
2995 new_nranges = 2 * mbcset->nranges + 1;
2996 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2998 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3001 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3004 mbcset->range_starts = new_array_start;
3005 mbcset->range_ends = new_array_end;
3006 *range_alloc = new_nranges;
3009 mbcset->range_starts[mbcset->nranges] = start_collseq;
3010 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013 /* Build the table for single byte characters. */
3014 for (ch = 0; ch < SBC_MAX; ch++)
3016 uint32_t ch_collseq;
3018 if (MB_CUR_MAX == 1)
3021 ch_collseq = collseqmb[ch];
3023 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025 bitset_set (sbcset, ch);
3030 /* Local function for parse_bracket_exp used in _LIBC environment.
3031 Build the collating element which is represented by NAME.
3032 The result are written to MBCSET and SBCSET.
3033 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3034 pointer argument since we may update it. */
3036 auto inline reg_errcode_t
3037 __attribute ((always_inline))
3038 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3039 re_charset_t *mbcset;
3040 Idx *coll_sym_alloc;
3042 const unsigned char *name;
3045 size_t name_len = strlen ((const char *) name);
3048 elem = seek_collating_symbol_entry (name, name_len);
3049 if (symb_table[2 * elem] != 0)
3051 /* We found the entry. */
3052 idx = symb_table[2 * elem + 1];
3053 /* Skip the name of collating element name. */
3054 idx += 1 + extra[idx];
3056 else if (symb_table[2 * elem] == 0 && name_len == 1)
3058 /* No valid character, treat it as a normal
3060 bitset_set (sbcset, name[0]);
3064 return REG_ECOLLATE;
3066 /* Got valid collation sequence, add it as a new entry. */
3067 /* Check the space of the arrays. */
3068 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3070 /* Not enough, realloc it. */
3071 /* +1 in case of mbcset->ncoll_syms is 0. */
3072 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3073 /* Use realloc since mbcset->coll_syms is NULL
3075 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3076 new_coll_sym_alloc);
3077 if (BE (new_coll_syms == NULL, 0))
3079 mbcset->coll_syms = new_coll_syms;
3080 *coll_sym_alloc = new_coll_sym_alloc;
3082 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 if (BE (name_len != 1, 0))
3088 return REG_ECOLLATE;
3091 bitset_set (sbcset, name[0]);
3098 re_token_t br_token;
3099 re_bitset_ptr_t sbcset;
3100 #ifdef RE_ENABLE_I18N
3101 re_charset_t *mbcset;
3102 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3103 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3104 #endif /* not RE_ENABLE_I18N */
3105 bool non_match = false;
3106 bin_tree_t *work_tree;
3108 bool first_round = true;
3110 collseqmb = (const unsigned char *)
3111 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3112 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3118 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3119 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3120 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3121 _NL_COLLATE_SYMB_TABLEMB);
3122 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3123 _NL_COLLATE_SYMB_EXTRAMB);
3126 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3127 #ifdef RE_ENABLE_I18N
3128 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3129 #endif /* RE_ENABLE_I18N */
3130 #ifdef RE_ENABLE_I18N
3131 if (BE (sbcset == NULL || mbcset == NULL, 0))
3133 if (BE (sbcset == NULL, 0))
3134 #endif /* RE_ENABLE_I18N */
3137 #ifdef RE_ENABLE_I18N
3144 token_len = peek_token_bracket (token, regexp, syntax);
3145 if (BE (token->type == END_OF_RE, 0))
3148 goto parse_bracket_exp_free_return;
3150 if (token->type == OP_NON_MATCH_LIST)
3152 #ifdef RE_ENABLE_I18N
3153 mbcset->non_match = 1;
3154 #endif /* not RE_ENABLE_I18N */
3156 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3157 bitset_set (sbcset, '\n');
3158 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3159 token_len = peek_token_bracket (token, regexp, syntax);
3160 if (BE (token->type == END_OF_RE, 0))
3163 goto parse_bracket_exp_free_return;
3167 /* We treat the first ']' as a normal character. */
3168 if (token->type == OP_CLOSE_BRACKET)
3169 token->type = CHARACTER;
3173 bracket_elem_t start_elem, end_elem;
3174 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3175 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178 bool is_range_exp = false;
3181 start_elem.opr.name = start_name_buf;
3182 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3183 syntax, first_round);
3184 if (BE (ret != REG_NOERROR, 0))
3187 goto parse_bracket_exp_free_return;
3189 first_round = false;
3191 /* Get information about the next token. We need it in any case. */
3192 token_len = peek_token_bracket (token, regexp, syntax);
3194 /* Do not check for ranges if we know they are not allowed. */
3195 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3197 if (BE (token->type == END_OF_RE, 0))
3200 goto parse_bracket_exp_free_return;
3202 if (token->type == OP_CHARSET_RANGE)
3204 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3205 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3206 if (BE (token2.type == END_OF_RE, 0))
3209 goto parse_bracket_exp_free_return;
3211 if (token2.type == OP_CLOSE_BRACKET)
3213 /* We treat the last '-' as a normal character. */
3214 re_string_skip_bytes (regexp, -token_len);
3215 token->type = CHARACTER;
3218 is_range_exp = true;
3222 if (is_range_exp == true)
3224 end_elem.opr.name = end_name_buf;
3225 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3227 if (BE (ret != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3233 token_len = peek_token_bracket (token, regexp, syntax);
3236 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3237 &start_elem, &end_elem);
3239 # ifdef RE_ENABLE_I18N
3240 *err = build_range_exp (syntax, sbcset,
3241 dfa->mb_cur_max > 1 ? mbcset : NULL,
3242 &range_alloc, &start_elem, &end_elem);
3244 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3246 #endif /* RE_ENABLE_I18N */
3247 if (BE (*err != REG_NOERROR, 0))
3248 goto parse_bracket_exp_free_return;
3252 switch (start_elem.type)
3255 bitset_set (sbcset, start_elem.opr.ch);
3257 #ifdef RE_ENABLE_I18N
3259 /* Check whether the array has enough space. */
3260 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3262 wchar_t *new_mbchars;
3263 /* Not enough, realloc it. */
3264 /* +1 in case of mbcset->nmbchars is 0. */
3265 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3266 /* Use realloc since array is NULL if *alloc == 0. */
3267 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3269 if (BE (new_mbchars == NULL, 0))
3270 goto parse_bracket_exp_espace;
3271 mbcset->mbchars = new_mbchars;
3273 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3275 #endif /* RE_ENABLE_I18N */
3277 *err = build_equiv_class (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 mbcset, &equiv_class_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 start_elem.opr.name);
3282 if (BE (*err != REG_NOERROR, 0))
3283 goto parse_bracket_exp_free_return;
3286 *err = build_collating_symbol (sbcset,
3287 #ifdef RE_ENABLE_I18N
3288 mbcset, &coll_sym_alloc,
3289 #endif /* RE_ENABLE_I18N */
3290 start_elem.opr.name);
3291 if (BE (*err != REG_NOERROR, 0))
3292 goto parse_bracket_exp_free_return;
3295 *err = build_charclass (regexp->trans, sbcset,
3296 #ifdef RE_ENABLE_I18N
3297 mbcset, &char_class_alloc,
3298 #endif /* RE_ENABLE_I18N */
3299 start_elem.opr.name, syntax);
3300 if (BE (*err != REG_NOERROR, 0))
3301 goto parse_bracket_exp_free_return;
3308 if (BE (token->type == END_OF_RE, 0))
3311 goto parse_bracket_exp_free_return;
3313 if (token->type == OP_CLOSE_BRACKET)
3317 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3319 /* If it is non-matching list. */
3321 bitset_not (sbcset);
3323 #ifdef RE_ENABLE_I18N
3324 /* Ensure only single byte characters are set. */
3325 if (dfa->mb_cur_max > 1)
3326 bitset_mask (sbcset, dfa->sb_char);
3328 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330 || mbcset->non_match)))
3332 bin_tree_t *mbc_tree;
3334 /* Build a tree for complex bracket. */
3335 dfa->has_mb_node = 1;
3336 br_token.type = COMPLEX_BRACKET;
3337 br_token.opr.mbcset = mbcset;
3338 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339 if (BE (mbc_tree == NULL, 0))
3340 goto parse_bracket_exp_espace;
3341 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342 if (sbcset[sbc_idx])
3344 /* If there are no bits set in sbcset, there is no point
3345 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3346 if (sbc_idx < BITSET_WORDS)
3348 /* Build a tree for simple bracket. */
3349 br_token.type = SIMPLE_BRACKET;
3350 br_token.opr.sbcset = sbcset;
3351 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352 if (BE (work_tree == NULL, 0))
3353 goto parse_bracket_exp_espace;
3355 /* Then join them by ALT node. */
3356 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357 if (BE (work_tree == NULL, 0))
3358 goto parse_bracket_exp_espace;
3363 work_tree = mbc_tree;
3367 #endif /* not RE_ENABLE_I18N */
3369 #ifdef RE_ENABLE_I18N
3370 free_charset (mbcset);
3372 /* Build a tree for simple bracket. */
3373 br_token.type = SIMPLE_BRACKET;
3374 br_token.opr.sbcset = sbcset;
3375 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376 if (BE (work_tree == NULL, 0))
3377 goto parse_bracket_exp_espace;
3381 parse_bracket_exp_espace:
3383 parse_bracket_exp_free_return:
3385 #ifdef RE_ENABLE_I18N
3386 free_charset (mbcset);
3387 #endif /* RE_ENABLE_I18N */
3391 /* Parse an element in the bracket expression. */
3393 static reg_errcode_t
3394 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395 re_token_t *token, int token_len, re_dfa_t *dfa,
3396 reg_syntax_t syntax, bool accept_hyphen)
3398 #ifdef RE_ENABLE_I18N
3400 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401 if (cur_char_size > 1)
3403 elem->type = MB_CHAR;
3404 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405 re_string_skip_bytes (regexp, cur_char_size);
3408 #endif /* RE_ENABLE_I18N */
3409 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3410 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411 || token->type == OP_OPEN_EQUIV_CLASS)
3412 return parse_bracket_symbol (elem, regexp, token);
3413 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3415 /* A '-' must only appear as anything but a range indicator before
3416 the closing bracket. Everything else is an error. */
3418 (void) peek_token_bracket (&token2, regexp, syntax);
3419 if (token2.type != OP_CLOSE_BRACKET)
3420 /* The actual error value is not standardized since this whole
3421 case is undefined. But ERANGE makes good sense. */
3424 elem->type = SB_CHAR;
3425 elem->opr.ch = token->opr.c;
3429 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3430 such as [:<character_class>:], [.<collating_element>.], and
3431 [=<equivalent_class>=]. */
3433 static reg_errcode_t
3434 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3437 unsigned char ch, delim = token->opr.c;
3439 if (re_string_eoi(regexp))
3443 if (i >= BRACKET_NAME_BUF_SIZE)
3445 if (token->type == OP_OPEN_CHAR_CLASS)
3446 ch = re_string_fetch_byte_case (regexp);
3448 ch = re_string_fetch_byte (regexp);
3449 if (re_string_eoi(regexp))
3451 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3453 elem->opr.name[i] = ch;
3455 re_string_skip_bytes (regexp, 1);
3456 elem->opr.name[i] = '\0';
3457 switch (token->type)
3459 case OP_OPEN_COLL_ELEM:
3460 elem->type = COLL_SYM;
3462 case OP_OPEN_EQUIV_CLASS:
3463 elem->type = EQUIV_CLASS;
3465 case OP_OPEN_CHAR_CLASS:
3466 elem->type = CHAR_CLASS;
3474 /* Helper function for parse_bracket_exp.
3475 Build the equivalence class which is represented by NAME.
3476 The result are written to MBCSET and SBCSET.
3477 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3478 is a pointer argument since we may update it. */
3480 static reg_errcode_t
3481 #ifdef RE_ENABLE_I18N
3482 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483 Idx *equiv_class_alloc, const unsigned char *name)
3484 #else /* not RE_ENABLE_I18N */
3485 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486 #endif /* not RE_ENABLE_I18N */
3489 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3492 const int32_t *table, *indirect;
3493 const unsigned char *weights, *extra, *cp;
3494 unsigned char char_buf[2];
3498 /* This #include defines a local function! */
3499 # include <locale/weight.h>
3500 /* Calculate the index for equivalence class. */
3502 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3503 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_WEIGHTMB);
3505 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3506 _NL_COLLATE_EXTRAMB);
3507 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3508 _NL_COLLATE_INDIRECTMB);
3509 idx1 = findidx (&cp, -1);
3510 if (BE (idx1 == 0 || *cp != '\0', 0))
3511 /* This isn't a valid character. */
3512 return REG_ECOLLATE;
3514 /* Build single byte matching table for this equivalence class. */
3515 len = weights[idx1 & 0xffffff];
3516 for (ch = 0; ch < SBC_MAX; ++ch)
3520 idx2 = findidx (&cp, 1);
3525 /* This isn't a valid character. */
3527 /* Compare only if the length matches and the collation rule
3528 index is the same. */
3529 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3533 while (cnt <= len &&
3534 weights[(idx1 & 0xffffff) + 1 + cnt]
3535 == weights[(idx2 & 0xffffff) + 1 + cnt])
3539 bitset_set (sbcset, ch);
3542 /* Check whether the array has enough space. */
3543 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3545 /* Not enough, realloc it. */
3546 /* +1 in case of mbcset->nequiv_classes is 0. */
3547 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3548 /* Use realloc since the array is NULL if *alloc == 0. */
3549 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3551 new_equiv_class_alloc);
3552 if (BE (new_equiv_classes == NULL, 0))
3554 mbcset->equiv_classes = new_equiv_classes;
3555 *equiv_class_alloc = new_equiv_class_alloc;
3557 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3562 if (BE (strlen ((const char *) name) != 1, 0))
3563 return REG_ECOLLATE;
3564 bitset_set (sbcset, *name);
3569 /* Helper function for parse_bracket_exp.
3570 Build the character class which is represented by NAME.
3571 The result are written to MBCSET and SBCSET.
3572 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3573 is a pointer argument since we may update it. */
3575 static reg_errcode_t
3576 #ifdef RE_ENABLE_I18N
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 re_charset_t *mbcset, Idx *char_class_alloc,
3579 const unsigned char *class_name, reg_syntax_t syntax)
3580 #else /* not RE_ENABLE_I18N */
3581 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3582 const unsigned char *class_name, reg_syntax_t syntax)
3583 #endif /* not RE_ENABLE_I18N */
3586 const char *name = (const char *) class_name;
3588 /* In case of REG_ICASE "upper" and "lower" match the both of
3589 upper and lower cases. */
3590 if ((syntax & RE_ICASE)
3591 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3594 #ifdef RE_ENABLE_I18N
3595 /* Check the space of the arrays. */
3596 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3598 /* Not enough, realloc it. */
3599 /* +1 in case of mbcset->nchar_classes is 0. */
3600 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3601 /* Use realloc since array is NULL if *alloc == 0. */
3602 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3603 new_char_class_alloc);
3604 if (BE (new_char_classes == NULL, 0))
3606 mbcset->char_classes = new_char_classes;
3607 *char_class_alloc = new_char_class_alloc;
3609 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3610 #endif /* RE_ENABLE_I18N */
3612 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3614 if (BE (trans != NULL, 0)) \
3616 for (i = 0; i < SBC_MAX; ++i) \
3617 if (ctype_func (i)) \
3618 bitset_set (sbcset, trans[i]); \
3622 for (i = 0; i < SBC_MAX; ++i) \
3623 if (ctype_func (i)) \
3624 bitset_set (sbcset, i); \
3628 if (strcmp (name, "alnum") == 0)
3629 BUILD_CHARCLASS_LOOP (isalnum);
3630 else if (strcmp (name, "cntrl") == 0)
3631 BUILD_CHARCLASS_LOOP (iscntrl);
3632 else if (strcmp (name, "lower") == 0)
3633 BUILD_CHARCLASS_LOOP (islower);
3634 else if (strcmp (name, "space") == 0)
3635 BUILD_CHARCLASS_LOOP (isspace);
3636 else if (strcmp (name, "alpha") == 0)
3637 BUILD_CHARCLASS_LOOP (isalpha);
3638 else if (strcmp (name, "digit") == 0)
3639 BUILD_CHARCLASS_LOOP (isdigit);
3640 else if (strcmp (name, "print") == 0)
3641 BUILD_CHARCLASS_LOOP (isprint);
3642 else if (strcmp (name, "upper") == 0)
3643 BUILD_CHARCLASS_LOOP (isupper);
3644 else if (strcmp (name, "blank") == 0)
3645 BUILD_CHARCLASS_LOOP (isblank);
3646 else if (strcmp (name, "graph") == 0)
3647 BUILD_CHARCLASS_LOOP (isgraph);
3648 else if (strcmp (name, "punct") == 0)
3649 BUILD_CHARCLASS_LOOP (ispunct);
3650 else if (strcmp (name, "xdigit") == 0)
3651 BUILD_CHARCLASS_LOOP (isxdigit);
3659 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3660 const unsigned char *class_name,
3661 const unsigned char *extra, bool non_match,
3664 re_bitset_ptr_t sbcset;
3665 #ifdef RE_ENABLE_I18N
3666 re_charset_t *mbcset;
3668 #endif /* not RE_ENABLE_I18N */
3670 re_token_t br_token;
3673 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3674 #ifdef RE_ENABLE_I18N
3675 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676 #endif /* RE_ENABLE_I18N */
3678 #ifdef RE_ENABLE_I18N
3679 if (BE (sbcset == NULL || mbcset == NULL, 0))
3680 #else /* not RE_ENABLE_I18N */
3681 if (BE (sbcset == NULL, 0))
3682 #endif /* not RE_ENABLE_I18N */
3690 #ifdef RE_ENABLE_I18N
3691 mbcset->non_match = 1;
3692 #endif /* not RE_ENABLE_I18N */
3695 /* We don't care the syntax in this case. */
3696 ret = build_charclass (trans, sbcset,
3697 #ifdef RE_ENABLE_I18N
3699 #endif /* RE_ENABLE_I18N */
3702 if (BE (ret != REG_NOERROR, 0))
3705 #ifdef RE_ENABLE_I18N
3706 free_charset (mbcset);
3707 #endif /* RE_ENABLE_I18N */
3711 /* \w match '_' also. */
3712 for (; *extra; extra++)
3713 bitset_set (sbcset, *extra);
3715 /* If it is non-matching list. */
3717 bitset_not (sbcset);
3719 #ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa->mb_cur_max > 1)
3722 bitset_mask (sbcset, dfa->sb_char);
3725 /* Build a tree for simple bracket. */
3726 br_token.type = SIMPLE_BRACKET;
3727 br_token.opr.sbcset = sbcset;
3728 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3729 if (BE (tree == NULL, 0))
3730 goto build_word_op_espace;
3732 #ifdef RE_ENABLE_I18N
3733 if (dfa->mb_cur_max > 1)
3735 bin_tree_t *mbc_tree;
3736 /* Build a tree for complex bracket. */
3737 br_token.type = COMPLEX_BRACKET;
3738 br_token.opr.mbcset = mbcset;
3739 dfa->has_mb_node = 1;
3740 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3741 if (BE (mbc_tree == NULL, 0))
3742 goto build_word_op_espace;
3743 /* Then join them by ALT node. */
3744 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3745 if (BE (mbc_tree != NULL, 1))
3750 free_charset (mbcset);
3753 #else /* not RE_ENABLE_I18N */
3755 #endif /* not RE_ENABLE_I18N */
3757 build_word_op_espace:
3759 #ifdef RE_ENABLE_I18N
3760 free_charset (mbcset);
3761 #endif /* RE_ENABLE_I18N */
3766 /* This is intended for the expressions like "a{1,3}".
3767 Fetch a number from 'input', and return the number.
3768 Return REG_MISSING if the number field is empty like "{,1}".
3769 Return RE_DUP_MAX + 1 if the number field is too large.
3770 Return REG_ERROR if an error occurred. */
3773 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3775 Idx num = REG_MISSING;
3779 fetch_token (token, input, syntax);
3781 if (BE (token->type == END_OF_RE, 0))
3783 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3785 num = ((token->type != CHARACTER || c < '0' || '9' < c
3786 || num == REG_ERROR)
3788 : num == REG_MISSING
3790 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3795 #ifdef RE_ENABLE_I18N
3797 free_charset (re_charset_t *cset)
3799 re_free (cset->mbchars);
3801 re_free (cset->coll_syms);
3802 re_free (cset->equiv_classes);
3803 re_free (cset->range_starts);
3804 re_free (cset->range_ends);
3806 re_free (cset->char_classes);
3809 #endif /* RE_ENABLE_I18N */
3811 /* Functions for binary tree operation. */
3813 /* Create a tree node. */
3816 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817 re_token_type_t type)
3821 return create_token_tree (dfa, left, right, &t);
3825 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3826 const re_token_t *token)
3829 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3831 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3833 if (storage == NULL)
3835 storage->next = dfa->str_tree_storage;
3836 dfa->str_tree_storage = storage;
3837 dfa->str_tree_storage_idx = 0;
3839 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3841 tree->parent = NULL;
3843 tree->right = right;
3844 tree->token = *token;
3845 tree->token.duplicated = 0;
3846 tree->token.opt_subexp = 0;
3849 tree->node_idx = REG_MISSING;
3852 left->parent = tree;
3854 right->parent = tree;
3858 /* Mark the tree SRC as an optional subexpression.
3859 To be called from preorder or postorder. */
3861 static reg_errcode_t
3862 mark_opt_subexp (void *extra, bin_tree_t *node)
3864 Idx idx = (uintptr_t) extra;
3865 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3866 node->token.opt_subexp = 1;
3871 /* Free the allocated memory inside NODE. */
3874 free_token (re_token_t *node)
3876 #ifdef RE_ENABLE_I18N
3877 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3878 free_charset (node->opr.mbcset);
3880 #endif /* RE_ENABLE_I18N */
3881 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3882 re_free (node->opr.sbcset);
3885 /* Worker function for tree walking. Free the allocated memory inside NODE
3886 and its children. */
3888 static reg_errcode_t
3889 free_tree (void *extra, bin_tree_t *node)
3891 free_token (&node->token);
3896 /* Duplicate the node SRC, and return new node. This is a preorder
3897 visit similar to the one implemented by the generic visitor, but
3898 we need more infrastructure to maintain two parallel trees --- so,
3899 it's easier to duplicate. */
3902 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3904 const bin_tree_t *node;
3905 bin_tree_t *dup_root;
3906 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3908 for (node = root; ; )
3910 /* Create a new tree and link it back to the current parent. */
3911 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3914 (*p_new)->parent = dup_node;
3915 (*p_new)->token.duplicated = 1;
3918 /* Go to the left node, or up and to the right. */
3922 p_new = &dup_node->left;
3926 const bin_tree_t *prev = NULL;
3927 while (node->right == prev || node->right == NULL)
3930 node = node->parent;
3931 dup_node = dup_node->parent;
3936 p_new = &dup_node->right;