1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 static void optimize_utf8 (re_dfa_t *dfa);
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
37 static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_token_t *token, int token_len,
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
98 #else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 /* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
127 static const char __re_error_msgid[] =
129 #define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 static const size_t __re_error_msgid_idx[] =
202 /* Entry points for GNU code. */
204 /* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
213 re_compile_pattern (pattern, length, bufp)
216 struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240 weak_alias (__re_compile_pattern, re_compile_pattern)
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
259 re_set_syntax (syntax)
262 reg_syntax_t ret = re_syntax_options;
264 re_syntax_options = syntax;
268 weak_alias (__re_set_syntax, re_set_syntax)
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
275 re_dfa_t *dfa = bufp->buffer;
276 char *fastmap = bufp->fastmap;
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[tolower (ch)] = 1;
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 re_dfa_t *dfa = bufp->buffer;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
317 if (type == CHARACTER)
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 unsigned char buf[MB_LEN_MAX];
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 re_set_fastmap (fastmap, false, buf[0]);
343 else if (type == SIMPLE_BRACKET)
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 /* See if we have to try all bytes which start multiple collation
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
375 re_set_fastmap (fastmap, icase, i);
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
386 || cset->nequiv_classes
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
527 weak_alias (__regcomp, regcomp)
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode, preg, errbuf, errbuf_size)
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
540 #else /* size_t might promote */
542 regerror (int errcode, const regex_t *_Restrict_ preg,
543 char *_Restrict_ errbuf, size_t errbuf_size)
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 msg_size = strlen (msg) + 1; /* Includes the null. */
562 if (BE (errbuf_size != 0, 1))
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
570 memcpy (errbuf, msg, cpy_size);
576 weak_alias (__regerror, regerror)
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset_t utf8_sb_map =
587 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
610 free_dfa_content (re_dfa_t *dfa)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
638 re_dfastate_t *state = entry->array[j];
641 re_free (entry->array);
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
648 re_free (dfa->subexp_map);
650 re_free (dfa->re_str);
657 /* Free dynamically allocated space used by PREG. */
663 re_dfa_t *dfa = preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
672 re_free (preg->translate);
673 preg->translate = NULL;
676 weak_alias (__regfree, regfree)
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
682 #if defined _REGEX_RE_COMP || defined _LIBC
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
707 if (re_comp_buf.buffer)
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
716 if (re_comp_buf.fastmap == NULL)
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740 libc_freeres_fn (free_mem)
742 __regfree (&re_comp_buf);
746 #endif /* _REGEX_RE_COMP */
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
756 reg_errcode_t err = REG_NOERROR;
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
769 /* Initialize the dfa. */
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
780 preg->allocated = sizeof (re_dfa_t);
783 preg->used = sizeof (re_dfa_t);
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
788 free_dfa_content (dfa);
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
799 __libc_lock_init (dfa->lock);
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
814 /* Parse the regular expression, and build a structure tree. */
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
838 if (BE (err != REG_NOERROR, 0))
840 free_dfa_content (dfa);
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 __re_size_t table_size;
856 const char *codeset_name;
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 size_t max_i18n_object_size = 0;
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
870 memset (dfa, '\0', sizeof (re_dfa_t));
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
893 dfa->mb_cur_max = MB_CUR_MAX;
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
901 codeset_name = nl_langinfo (CODESET);
902 if (strcasecmp (codeset_name, "UTF-8") == 0
903 || strcasecmp (codeset_name, "UTF8") == 0)
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
911 #ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 if (BE (dfa->sb_char == NULL, 0))
924 /* Set the bits corresponding to single byte chars. */
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
928 wint_t wch = __btowc (ch);
930 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
932 if (isascii (ch) && wch != ch)
933 dfa->map_notascii = 1;
940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
945 /* Initialize WORD_CHAR table, which indicate which character is
946 "word". In this case "word" means that it is the word construction
947 character used by some operators like "\<", "\>", etc. */
951 init_word_char (re_dfa_t *dfa)
953 dfa->word_ops_used = 1;
957 if (BE (dfa->map_notascii == 0, 1))
959 bitset_word_t bits0 = 0x00000000;
960 bitset_word_t bits1 = 0x03ff0000;
961 bitset_word_t bits2 = 0x87fffffe;
962 bitset_word_t bits3 = 0x07fffffe;
963 if (BITSET_WORD_BITS == 64)
965 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
966 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
969 else if (BITSET_WORD_BITS == 32)
971 dfa->word_char[0] = bits0;
972 dfa->word_char[1] = bits1;
973 dfa->word_char[2] = bits2;
974 dfa->word_char[3] = bits3;
981 if (BE (dfa->is_utf8, 1))
983 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
989 for (; i < BITSET_WORDS; ++i)
990 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
991 if (isalnum (ch) || ch == '_')
992 dfa->word_char[i] |= (bitset_word_t) 1 << j;
995 /* Free the work area which are only used while compiling. */
998 free_workarea_compile (regex_t *preg)
1000 re_dfa_t *dfa = preg->buffer;
1001 bin_tree_storage_t *storage, *next;
1002 for (storage = dfa->str_tree_storage; storage; storage = next)
1004 next = storage->next;
1007 dfa->str_tree_storage = NULL;
1008 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1009 dfa->str_tree = NULL;
1010 re_free (dfa->org_indices);
1011 dfa->org_indices = NULL;
1014 /* Create initial states for all contexts. */
1016 static reg_errcode_t
1017 create_initial_state (re_dfa_t *dfa)
1021 re_node_set init_nodes;
1023 /* Initial states have the epsilon closure of the node which is
1024 the first node of the regular expression. */
1025 first = dfa->str_tree->first->node_idx;
1026 dfa->init_node = first;
1027 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1028 if (BE (err != REG_NOERROR, 0))
1031 /* The back-references which are in initial states can epsilon transit,
1032 since in this case all of the subexpressions can be null.
1033 Then we add epsilon closures of the nodes which are the next nodes of
1034 the back-references. */
1035 if (dfa->nbackref > 0)
1036 for (i = 0; i < init_nodes.nelem; ++i)
1038 Idx node_idx = init_nodes.elems[i];
1039 re_token_type_t type = dfa->nodes[node_idx].type;
1042 if (type != OP_BACK_REF)
1044 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1046 re_token_t *clexp_node;
1047 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1048 if (clexp_node->type == OP_CLOSE_SUBEXP
1049 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1052 if (clexp_idx == init_nodes.nelem)
1055 if (type == OP_BACK_REF)
1057 Idx dest_idx = dfa->edests[node_idx].elems[0];
1058 if (!re_node_set_contains (&init_nodes, dest_idx))
1060 reg_errcode_t merge_err
1061 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1062 if (merge_err != REG_NOERROR)
1069 /* It must be the first time to invoke acquire_state. */
1070 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1071 /* We don't check ERR here, since the initial state must not be NULL. */
1072 if (BE (dfa->init_state == NULL, 0))
1074 if (dfa->init_state->has_constraint)
1076 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1078 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1080 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1084 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1085 || dfa->init_state_begbuf == NULL, 0))
1089 dfa->init_state_word = dfa->init_state_nl
1090 = dfa->init_state_begbuf = dfa->init_state;
1092 re_node_set_free (&init_nodes);
1096 #ifdef RE_ENABLE_I18N
1097 /* If it is possible to do searching in single byte encoding instead of UTF-8
1098 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1099 DFA nodes where needed. */
1102 optimize_utf8 (re_dfa_t *dfa)
1106 bool mb_chars = false;
1107 bool has_period = false;
1109 for (node = 0; node < dfa->nodes_len; ++node)
1110 switch (dfa->nodes[node].type)
1113 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1117 switch (dfa->nodes[node].opr.ctx_type)
1125 /* Word anchors etc. cannot be handled. It's okay to test
1126 opr.ctx_type since constraints (for all DFA nodes) are
1127 created by ORing one or more opr.ctx_type values. */
1137 case OP_DUP_ASTERISK:
1138 case OP_OPEN_SUBEXP:
1139 case OP_CLOSE_SUBEXP:
1141 case COMPLEX_BRACKET:
1143 case SIMPLE_BRACKET:
1144 /* Just double check. */
1146 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1148 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1149 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1151 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1161 if (mb_chars || has_period)
1162 for (node = 0; node < dfa->nodes_len; ++node)
1164 if (dfa->nodes[node].type == CHARACTER
1165 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1166 dfa->nodes[node].mb_partial = 0;
1167 else if (dfa->nodes[node].type == OP_PERIOD)
1168 dfa->nodes[node].type = OP_UTF8_PERIOD;
1171 /* The search can be in single byte locale. */
1172 dfa->mb_cur_max = 1;
1174 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1178 /* Analyze the structure tree, and calculate "first", "next", "edest",
1179 "eclosure", and "inveclosure". */
1181 static reg_errcode_t
1182 analyze (regex_t *preg)
1184 re_dfa_t *dfa = preg->buffer;
1187 /* Allocate arrays. */
1188 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1189 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1190 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1191 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1192 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1193 || dfa->eclosures == NULL, 0))
1196 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1197 if (dfa->subexp_map != NULL)
1200 for (i = 0; i < preg->re_nsub; i++)
1201 dfa->subexp_map[i] = i;
1202 preorder (dfa->str_tree, optimize_subexps, dfa);
1203 for (i = 0; i < preg->re_nsub; i++)
1204 if (dfa->subexp_map[i] != i)
1206 if (i == preg->re_nsub)
1208 free (dfa->subexp_map);
1209 dfa->subexp_map = NULL;
1213 ret = postorder (dfa->str_tree, lower_subexps, preg);
1214 if (BE (ret != REG_NOERROR, 0))
1216 ret = postorder (dfa->str_tree, calc_first, dfa);
1217 if (BE (ret != REG_NOERROR, 0))
1219 preorder (dfa->str_tree, calc_next, dfa);
1220 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1221 if (BE (ret != REG_NOERROR, 0))
1223 ret = calc_eclosure (dfa);
1224 if (BE (ret != REG_NOERROR, 0))
1227 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1228 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1229 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1232 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1233 if (BE (dfa->inveclosures == NULL, 0))
1235 ret = calc_inveclosure (dfa);
1241 /* Our parse trees are very unbalanced, so we cannot use a stack to
1242 implement parse tree visits. Instead, we use parent pointers and
1243 some hairy code in these two functions. */
1244 static reg_errcode_t
1245 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 bin_tree_t *node, *prev;
1250 for (node = root; ; )
1252 /* Descend down the tree, preferably to the left (or to the right
1253 if that's the only child). */
1254 while (node->left || node->right)
1262 reg_errcode_t err = fn (extra, node);
1263 if (BE (err != REG_NOERROR, 0))
1265 if (node->parent == NULL)
1268 node = node->parent;
1270 /* Go up while we have a node that is reached from the right. */
1271 while (node->right == prev || node->right == NULL);
1276 static reg_errcode_t
1277 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1282 for (node = root; ; )
1284 reg_errcode_t err = fn (extra, node);
1285 if (BE (err != REG_NOERROR, 0))
1288 /* Go to the left node, or up and to the right. */
1293 bin_tree_t *prev = NULL;
1294 while (node->right == prev || node->right == NULL)
1297 node = node->parent;
1306 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1307 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1308 backreferences as well. Requires a preorder visit. */
1309 static reg_errcode_t
1310 optimize_subexps (void *extra, bin_tree_t *node)
1312 re_dfa_t *dfa = (re_dfa_t *) extra;
1314 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1316 int idx = node->token.opr.idx;
1317 node->token.opr.idx = dfa->subexp_map[idx];
1318 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1321 else if (node->token.type == SUBEXP
1322 && node->left && node->left->token.type == SUBEXP)
1324 Idx other_idx = node->left->token.opr.idx;
1326 node->left = node->left->left;
1328 node->left->parent = node;
1330 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1331 if (other_idx < BITSET_WORD_BITS)
1332 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1338 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1339 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1340 static reg_errcode_t
1341 lower_subexps (void *extra, bin_tree_t *node)
1343 regex_t *preg = (regex_t *) extra;
1344 reg_errcode_t err = REG_NOERROR;
1346 if (node->left && node->left->token.type == SUBEXP)
1348 node->left = lower_subexp (&err, preg, node->left);
1350 node->left->parent = node;
1352 if (node->right && node->right->token.type == SUBEXP)
1354 node->right = lower_subexp (&err, preg, node->right);
1356 node->right->parent = node;
1363 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1365 re_dfa_t *dfa = preg->buffer;
1366 bin_tree_t *body = node->left;
1367 bin_tree_t *op, *cls, *tree1, *tree;
1370 /* We do not optimize empty subexpressions, because otherwise we may
1371 have bad CONCAT nodes with NULL children. This is obviously not
1372 very common, so we do not lose much. An example that triggers
1373 this case is the sed "script" /\(\)/x. */
1374 && node->left != NULL
1375 && (node->token.opr.idx >= BITSET_WORD_BITS
1376 || !(dfa->used_bkref_map
1377 & ((bitset_word_t) 1 << node->token.opr.idx))))
1380 /* Convert the SUBEXP node to the concatenation of an
1381 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1382 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1383 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1384 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1385 tree = create_tree (dfa, op, tree1, CONCAT);
1386 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1392 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1393 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1398 nodes. Requires a postorder visit. */
1399 static reg_errcode_t
1400 calc_first (void *extra, bin_tree_t *node)
1402 re_dfa_t *dfa = (re_dfa_t *) extra;
1403 if (node->token.type == CONCAT)
1405 node->first = node->left->first;
1406 node->node_idx = node->left->node_idx;
1411 node->node_idx = re_dfa_add_node (dfa, node->token);
1412 if (BE (node->node_idx == REG_MISSING, 0))
1414 if (node->token.type == ANCHOR)
1415 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1420 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1421 static reg_errcode_t
1422 calc_next (void *extra, bin_tree_t *node)
1424 switch (node->token.type)
1426 case OP_DUP_ASTERISK:
1427 node->left->next = node;
1430 node->left->next = node->right->first;
1431 node->right->next = node->next;
1435 node->left->next = node->next;
1437 node->right->next = node->next;
1443 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1444 static reg_errcode_t
1445 link_nfa_nodes (void *extra, bin_tree_t *node)
1447 re_dfa_t *dfa = (re_dfa_t *) extra;
1448 Idx idx = node->node_idx;
1449 reg_errcode_t err = REG_NOERROR;
1451 switch (node->token.type)
1457 assert (node->next == NULL);
1460 case OP_DUP_ASTERISK:
1464 dfa->has_plural_match = 1;
1465 if (node->left != NULL)
1466 left = node->left->first->node_idx;
1468 left = node->next->node_idx;
1469 if (node->right != NULL)
1470 right = node->right->first->node_idx;
1472 right = node->next->node_idx;
1473 assert (REG_VALID_INDEX (left));
1474 assert (REG_VALID_INDEX (right));
1475 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1480 case OP_OPEN_SUBEXP:
1481 case OP_CLOSE_SUBEXP:
1482 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486 dfa->nexts[idx] = node->next->node_idx;
1487 if (node->token.type == OP_BACK_REF)
1488 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492 assert (!IS_EPSILON_NODE (node->token.type));
1493 dfa->nexts[idx] = node->next->node_idx;
1500 /* Duplicate the epsilon closure of the node ROOT_NODE.
1501 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1502 to their own constraint. */
1504 static reg_errcode_t
1506 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1507 Idx root_node, unsigned int init_constraint)
1509 Idx org_node, clone_node;
1511 unsigned int constraint = init_constraint;
1512 for (org_node = top_org_node, clone_node = top_clone_node;;)
1514 Idx org_dest, clone_dest;
1515 if (dfa->nodes[org_node].type == OP_BACK_REF)
1517 /* If the back reference epsilon-transit, its destination must
1518 also have the constraint. Then duplicate the epsilon closure
1519 of the destination of the back reference, and store it in
1520 edests of the back reference. */
1521 org_dest = dfa->nexts[org_node];
1522 re_node_set_empty (dfa->edests + clone_node);
1523 clone_dest = duplicate_node (dfa, org_dest, constraint);
1524 if (BE (clone_dest == REG_MISSING, 0))
1526 dfa->nexts[clone_node] = dfa->nexts[org_node];
1527 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 else if (dfa->edests[org_node].nelem == 0)
1533 /* In case of the node can't epsilon-transit, don't duplicate the
1534 destination and store the original destination as the
1535 destination of the node. */
1536 dfa->nexts[clone_node] = dfa->nexts[org_node];
1539 else if (dfa->edests[org_node].nelem == 1)
1541 /* In case of the node can epsilon-transit, and it has only one
1543 org_dest = dfa->edests[org_node].elems[0];
1544 re_node_set_empty (dfa->edests + clone_node);
1545 /* If the node is root_node itself, it means the epsilon closure
1546 has a loop. Then tie it to the destination of the root_node. */
1547 if (org_node == root_node && clone_node != org_node)
1549 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1554 /* In case the node has another constraint, append it. */
1555 constraint |= dfa->nodes[org_node].constraint;
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (BE (clone_dest == REG_MISSING, 0))
1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 else /* dfa->edests[org_node].nelem == 2 */
1565 /* In case of the node can epsilon-transit, and it has two
1566 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1567 org_dest = dfa->edests[org_node].elems[0];
1568 re_node_set_empty (dfa->edests + clone_node);
1569 /* Search for a duplicated node which satisfies the constraint. */
1570 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1571 if (clone_dest == REG_MISSING)
1573 /* There is no such duplicated node, create a new one. */
1575 clone_dest = duplicate_node (dfa, org_dest, constraint);
1576 if (BE (clone_dest == REG_MISSING, 0))
1578 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1582 root_node, constraint);
1583 if (BE (err != REG_NOERROR, 0))
1588 /* There is a duplicated node which satisfies the constraint,
1589 use it to avoid infinite loop. */
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1595 org_dest = dfa->edests[org_node].elems[1];
1596 clone_dest = duplicate_node (dfa, org_dest, constraint);
1597 if (BE (clone_dest == REG_MISSING, 0))
1599 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 org_node = org_dest;
1604 clone_node = clone_dest;
1609 /* Search for a node which is duplicated from the node ORG_NODE, and
1610 satisfies the constraint CONSTRAINT. */
1613 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1614 unsigned int constraint)
1617 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1619 if (org_node == dfa->org_indices[idx]
1620 && constraint == dfa->nodes[idx].constraint)
1621 return idx; /* Found. */
1623 return REG_MISSING; /* Not found. */
1626 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1627 Return the index of the new node, or REG_MISSING if insufficient storage is
1631 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1633 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1634 if (BE (dup_idx != REG_MISSING, 1))
1636 dfa->nodes[dup_idx].constraint = constraint;
1637 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1638 dfa->nodes[dup_idx].duplicated = 1;
1640 /* Store the index of the original node. */
1641 dfa->org_indices[dup_idx] = org_idx;
1646 static reg_errcode_t
1647 calc_inveclosure (re_dfa_t *dfa)
1651 for (idx = 0; idx < dfa->nodes_len; ++idx)
1652 re_node_set_init_empty (dfa->inveclosures + idx);
1654 for (src = 0; src < dfa->nodes_len; ++src)
1656 Idx *elems = dfa->eclosures[src].elems;
1657 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1659 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668 /* Calculate "eclosure" for all the node in DFA. */
1670 static reg_errcode_t
1671 calc_eclosure (re_dfa_t *dfa)
1676 assert (dfa->nodes_len > 0);
1679 /* For each nodes, calculate epsilon closure. */
1680 for (node_idx = 0; ; ++node_idx)
1683 re_node_set eclosure_elem;
1684 if (node_idx == dfa->nodes_len)
1693 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1696 /* If we have already calculated, skip it. */
1697 if (dfa->eclosures[node_idx].nelem != 0)
1699 /* Calculate epsilon closure of 'node_idx'. */
1700 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1701 if (BE (err != REG_NOERROR, 0))
1704 if (dfa->eclosures[node_idx].nelem == 0)
1707 re_node_set_free (&eclosure_elem);
1713 /* Calculate epsilon closure of NODE. */
1715 static reg_errcode_t
1716 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1720 re_node_set eclosure;
1722 bool incomplete = false;
1723 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1724 if (BE (err != REG_NOERROR, 0))
1727 /* This indicates that we are calculating this node now.
1728 We reference this value to avoid infinite loop. */
1729 dfa->eclosures[node].nelem = REG_MISSING;
1731 /* If the current node has constraints, duplicate all nodes
1732 since they must inherit the constraints. */
1733 if (dfa->nodes[node].constraint
1734 && dfa->edests[node].nelem
1735 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1737 err = duplicate_node_closure (dfa, node, node, node,
1738 dfa->nodes[node].constraint);
1739 if (BE (err != REG_NOERROR, 0))
1743 /* Expand each epsilon destination nodes. */
1744 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1745 for (i = 0; i < dfa->edests[node].nelem; ++i)
1747 re_node_set eclosure_elem;
1748 Idx edest = dfa->edests[node].elems[i];
1749 /* If calculating the epsilon closure of 'edest' is in progress,
1750 return intermediate result. */
1751 if (dfa->eclosures[edest].nelem == REG_MISSING)
1756 /* If we haven't calculated the epsilon closure of 'edest' yet,
1757 calculate now. Otherwise use calculated epsilon closure. */
1758 if (dfa->eclosures[edest].nelem == 0)
1760 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1761 if (BE (err != REG_NOERROR, 0))
1765 eclosure_elem = dfa->eclosures[edest];
1766 /* Merge the epsilon closure of 'edest'. */
1767 err = re_node_set_merge (&eclosure, &eclosure_elem);
1768 if (BE (err != REG_NOERROR, 0))
1770 /* If the epsilon closure of 'edest' is incomplete,
1771 the epsilon closure of this node is also incomplete. */
1772 if (dfa->eclosures[edest].nelem == 0)
1775 re_node_set_free (&eclosure_elem);
1779 /* An epsilon closure includes itself. */
1780 ok = re_node_set_insert (&eclosure, node);
1783 if (incomplete && !root)
1784 dfa->eclosures[node].nelem = 0;
1786 dfa->eclosures[node] = eclosure;
1787 *new_set = eclosure;
1791 /* Functions for token which are used in the parser. */
1793 /* Fetch a token from INPUT.
1794 We must not use this function inside bracket expressions. */
1798 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1800 re_string_skip_bytes (input, peek_token (result, input, syntax));
1803 /* Peek a token from INPUT, and return the length of the token.
1804 We must not use this function inside bracket expressions. */
1808 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1812 if (re_string_eoi (input))
1814 token->type = END_OF_RE;
1818 c = re_string_peek_byte (input, 0);
1821 token->word_char = 0;
1822 #ifdef RE_ENABLE_I18N
1823 token->mb_partial = 0;
1824 if (input->mb_cur_max > 1 &&
1825 !re_string_first_byte (input, re_string_cur_idx (input)))
1827 token->type = CHARACTER;
1828 token->mb_partial = 1;
1835 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1837 token->type = BACK_SLASH;
1841 c2 = re_string_peek_byte_case (input, 1);
1843 token->type = CHARACTER;
1844 #ifdef RE_ENABLE_I18N
1845 if (input->mb_cur_max > 1)
1847 wint_t wc = re_string_wchar_at (input,
1848 re_string_cur_idx (input) + 1);
1849 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1853 token->word_char = IS_WORD_CHAR (c2) != 0;
1858 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1859 token->type = OP_ALT;
1861 case '1': case '2': case '3': case '4': case '5':
1862 case '6': case '7': case '8': case '9':
1863 if (!(syntax & RE_NO_BK_REFS))
1865 token->type = OP_BACK_REF;
1866 token->opr.idx = c2 - '1';
1870 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = WORD_FIRST;
1877 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = WORD_LAST;
1884 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = WORD_DELIM;
1891 if (!(syntax & RE_NO_GNU_OPS))
1893 token->type = ANCHOR;
1894 token->opr.ctx_type = NOT_WORD_DELIM;
1898 if (!(syntax & RE_NO_GNU_OPS))
1899 token->type = OP_WORD;
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_NOTWORD;
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_SPACE;
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = OP_NOTSPACE;
1914 if (!(syntax & RE_NO_GNU_OPS))
1916 token->type = ANCHOR;
1917 token->opr.ctx_type = BUF_FIRST;
1921 if (!(syntax & RE_NO_GNU_OPS))
1923 token->type = ANCHOR;
1924 token->opr.ctx_type = BUF_LAST;
1928 if (!(syntax & RE_NO_BK_PARENS))
1929 token->type = OP_OPEN_SUBEXP;
1932 if (!(syntax & RE_NO_BK_PARENS))
1933 token->type = OP_CLOSE_SUBEXP;
1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937 token->type = OP_DUP_PLUS;
1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941 token->type = OP_DUP_QUESTION;
1944 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945 token->type = OP_OPEN_DUP_NUM;
1948 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949 token->type = OP_CLOSE_DUP_NUM;
1957 token->type = CHARACTER;
1958 #ifdef RE_ENABLE_I18N
1959 if (input->mb_cur_max > 1)
1961 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1962 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1966 token->word_char = IS_WORD_CHAR (token->opr.c);
1971 if (syntax & RE_NEWLINE_ALT)
1972 token->type = OP_ALT;
1975 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1976 token->type = OP_ALT;
1979 token->type = OP_DUP_ASTERISK;
1982 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983 token->type = OP_DUP_PLUS;
1986 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987 token->type = OP_DUP_QUESTION;
1990 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991 token->type = OP_OPEN_DUP_NUM;
1994 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995 token->type = OP_CLOSE_DUP_NUM;
1998 if (syntax & RE_NO_BK_PARENS)
1999 token->type = OP_OPEN_SUBEXP;
2002 if (syntax & RE_NO_BK_PARENS)
2003 token->type = OP_CLOSE_SUBEXP;
2006 token->type = OP_OPEN_BRACKET;
2009 token->type = OP_PERIOD;
2012 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2013 re_string_cur_idx (input) != 0)
2015 char prev = re_string_peek_byte (input, -1);
2016 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2019 token->type = ANCHOR;
2020 token->opr.ctx_type = LINE_FIRST;
2023 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2024 re_string_cur_idx (input) + 1 != re_string_length (input))
2027 re_string_skip_bytes (input, 1);
2028 peek_token (&next, input, syntax);
2029 re_string_skip_bytes (input, -1);
2030 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2033 token->type = ANCHOR;
2034 token->opr.ctx_type = LINE_LAST;
2042 /* Peek a token from INPUT, and return the length of the token.
2043 We must not use this function out of bracket expressions. */
2047 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2050 if (re_string_eoi (input))
2052 token->type = END_OF_RE;
2055 c = re_string_peek_byte (input, 0);
2058 #ifdef RE_ENABLE_I18N
2059 if (input->mb_cur_max > 1 &&
2060 !re_string_first_byte (input, re_string_cur_idx (input)))
2062 token->type = CHARACTER;
2065 #endif /* RE_ENABLE_I18N */
2067 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2068 && re_string_cur_idx (input) + 1 < re_string_length (input))
2070 /* In this case, '\' escape a character. */
2072 re_string_skip_bytes (input, 1);
2073 c2 = re_string_peek_byte (input, 0);
2075 token->type = CHARACTER;
2078 if (c == '[') /* '[' is a special char in a bracket exps. */
2082 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2083 c2 = re_string_peek_byte (input, 1);
2091 token->type = OP_OPEN_COLL_ELEM;
2094 token->type = OP_OPEN_EQUIV_CLASS;
2097 if (syntax & RE_CHAR_CLASSES)
2099 token->type = OP_OPEN_CHAR_CLASS;
2102 /* else fall through. */
2104 token->type = CHARACTER;
2114 token->type = OP_CHARSET_RANGE;
2117 token->type = OP_CLOSE_BRACKET;
2120 token->type = OP_NON_MATCH_LIST;
2123 token->type = CHARACTER;
2128 /* Functions for parser. */
2130 /* Entry point of the parser.
2131 Parse the regular expression REGEXP and return the structure tree.
2132 If an error occurs, ERR is set by error code, and return NULL.
2133 This function build the following tree, from regular expression <reg_exp>:
2139 CAT means concatenation.
2140 EOR means end of regular expression. */
2143 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2146 re_dfa_t *dfa = preg->buffer;
2147 bin_tree_t *tree, *eor, *root;
2148 re_token_t current_token;
2149 dfa->syntax = syntax;
2150 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2151 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2152 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2154 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2156 root = create_tree (dfa, tree, eor, CONCAT);
2159 if (BE (eor == NULL || root == NULL, 0))
2167 /* This function build the following tree, from regular expression
2168 <branch1>|<branch2>:
2174 ALT means alternative, which represents the operator '|'. */
2177 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2180 re_dfa_t *dfa = preg->buffer;
2181 bin_tree_t *tree, *branch = NULL;
2182 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2183 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186 while (token->type == OP_ALT)
2188 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2189 if (token->type != OP_ALT && token->type != END_OF_RE
2190 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2192 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2193 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2198 tree = create_tree (dfa, tree, branch, OP_ALT);
2199 if (BE (tree == NULL, 0))
2208 /* This function build the following tree, from regular expression
2215 CAT means concatenation. */
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2221 bin_tree_t *tree, *expr;
2222 re_dfa_t *dfa = preg->buffer;
2223 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227 while (token->type != OP_ALT && token->type != END_OF_RE
2228 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2230 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2234 postorder (tree, free_tree, NULL);
2237 if (tree != NULL && expr != NULL)
2239 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240 if (newtree == NULL)
2242 postorder (expr, free_tree, NULL);
2243 postorder (tree, free_tree, NULL);
2249 else if (tree == NULL)
2251 /* Otherwise expr == NULL, we don't need to create new tree. */
2256 /* This function build the following tree, from regular expression a*:
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2266 re_dfa_t *dfa = preg->buffer;
2268 switch (token->type)
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (BE (tree == NULL, 0))
2277 #ifdef RE_ENABLE_I18N
2278 if (dfa->mb_cur_max > 1)
2280 while (!re_string_eoi (regexp)
2281 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2283 bin_tree_t *mbc_remain;
2284 fetch_token (token, regexp, syntax);
2285 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287 if (BE (mbc_remain == NULL || tree == NULL, 0))
2296 case OP_OPEN_SUBEXP:
2297 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2298 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2301 case OP_OPEN_BRACKET:
2302 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2312 dfa->used_bkref_map |= 1 << token->opr.idx;
2313 tree = create_token_tree (dfa, NULL, NULL, token);
2314 if (BE (tree == NULL, 0))
2320 dfa->has_mb_node = 1;
2322 case OP_OPEN_DUP_NUM:
2323 if (syntax & RE_CONTEXT_INVALID_DUP)
2329 case OP_DUP_ASTERISK:
2331 case OP_DUP_QUESTION:
2332 if (syntax & RE_CONTEXT_INVALID_OPS)
2337 else if (syntax & RE_CONTEXT_INDEP_OPS)
2339 fetch_token (token, regexp, syntax);
2340 return parse_expression (regexp, preg, token, syntax, nest, err);
2342 /* else fall through */
2343 case OP_CLOSE_SUBEXP:
2344 if ((token->type == OP_CLOSE_SUBEXP) &&
2345 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350 /* else fall through */
2351 case OP_CLOSE_DUP_NUM:
2352 /* We treat it as a normal character. */
2354 /* Then we can these characters as normal characters. */
2355 token->type = CHARACTER;
2356 /* mb_partial and word_char bits should be initialized already
2358 tree = create_token_tree (dfa, NULL, NULL, token);
2359 if (BE (tree == NULL, 0))
2366 if ((token->opr.ctx_type
2367 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2368 && dfa->word_ops_used == 0)
2369 init_word_char (dfa);
2370 if (token->opr.ctx_type == WORD_DELIM
2371 || token->opr.ctx_type == NOT_WORD_DELIM)
2373 bin_tree_t *tree_first, *tree_last;
2374 if (token->opr.ctx_type == WORD_DELIM)
2376 token->opr.ctx_type = WORD_FIRST;
2377 tree_first = create_token_tree (dfa, NULL, NULL, token);
2378 token->opr.ctx_type = WORD_LAST;
2382 token->opr.ctx_type = INSIDE_WORD;
2383 tree_first = create_token_tree (dfa, NULL, NULL, token);
2384 token->opr.ctx_type = INSIDE_NOTWORD;
2386 tree_last = create_token_tree (dfa, NULL, NULL, token);
2387 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2388 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2396 tree = create_token_tree (dfa, NULL, NULL, token);
2397 if (BE (tree == NULL, 0))
2403 /* We must return here, since ANCHORs can't be followed
2404 by repetition operators.
2405 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2406 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2407 fetch_token (token, regexp, syntax);
2410 tree = create_token_tree (dfa, NULL, NULL, token);
2411 if (BE (tree == NULL, 0))
2416 if (dfa->mb_cur_max > 1)
2417 dfa->has_mb_node = 1;
2421 tree = build_charclass_op (dfa, regexp->trans,
2422 (const unsigned char *) "alnum",
2423 (const unsigned char *) "_",
2424 token->type == OP_NOTWORD, err);
2425 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430 tree = build_charclass_op (dfa, regexp->trans,
2431 (const unsigned char *) "space",
2432 (const unsigned char *) "",
2433 token->type == OP_NOTSPACE, err);
2434 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2444 /* Must not happen? */
2450 fetch_token (token, regexp, syntax);
2452 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2453 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2455 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2456 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2458 /* In BRE consecutive duplications are not allowed. */
2459 if ((syntax & RE_CONTEXT_INVALID_DUP)
2460 && (token->type == OP_DUP_ASTERISK
2461 || token->type == OP_OPEN_DUP_NUM))
2471 /* This function build the following tree, from regular expression
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2482 re_dfa_t *dfa = preg->buffer;
2485 cur_nsub = preg->re_nsub++;
2487 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2489 /* The subexpression may be a null string. */
2490 if (token->type == OP_CLOSE_SUBEXP)
2494 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2498 postorder (tree, free_tree, NULL);
2501 if (BE (*err != REG_NOERROR, 0))
2505 if (cur_nsub <= '9' - '1')
2506 dfa->completed_bkref_map |= 1 << cur_nsub;
2508 tree = create_tree (dfa, tree, NULL, SUBEXP);
2509 if (BE (tree == NULL, 0))
2514 tree->token.opr.idx = cur_nsub;
2518 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2521 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2522 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2524 bin_tree_t *tree = NULL, *old_tree = NULL;
2525 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2526 re_token_t start_token = *token;
2528 if (token->type == OP_OPEN_DUP_NUM)
2531 start = fetch_number (regexp, token, syntax);
2532 if (start == REG_MISSING)
2534 if (token->type == CHARACTER && token->opr.c == ',')
2535 start = 0; /* We treat "{,m}" as "{0,m}". */
2538 *err = REG_BADBR; /* <re>{} is invalid. */
2542 if (BE (start != REG_ERROR, 1))
2544 /* We treat "{n}" as "{n,n}". */
2545 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2546 : ((token->type == CHARACTER && token->opr.c == ',')
2547 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2549 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2551 /* Invalid sequence. */
2552 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2554 if (token->type == END_OF_RE)
2562 /* If the syntax bit is set, rollback. */
2563 re_string_set_index (regexp, start_idx);
2564 *token = start_token;
2565 token->type = CHARACTER;
2566 /* mb_partial and word_char bits should be already initialized by
2571 if (BE ((end != REG_MISSING && start > end)
2572 || token->type != OP_CLOSE_DUP_NUM, 0))
2574 /* First number greater than second. */
2579 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2587 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2591 fetch_token (token, regexp, syntax);
2593 if (BE (elem == NULL, 0))
2595 if (BE (start == 0 && end == 0, 0))
2597 postorder (elem, free_tree, NULL);
2601 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2602 if (BE (start > 0, 0))
2605 for (i = 2; i <= start; ++i)
2607 elem = duplicate_tree (elem, dfa);
2608 tree = create_tree (dfa, tree, elem, CONCAT);
2609 if (BE (elem == NULL || tree == NULL, 0))
2610 goto parse_dup_op_espace;
2616 /* Duplicate ELEM before it is marked optional. */
2617 elem = duplicate_tree (elem, dfa);
2623 if (elem->token.type == SUBEXP)
2625 uintptr_t subidx = elem->token.opr.idx;
2626 postorder (elem, mark_opt_subexp, (void *) subidx);
2629 tree = create_tree (dfa, elem, NULL,
2630 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2631 if (BE (tree == NULL, 0))
2632 goto parse_dup_op_espace;
2634 /* From gnulib's "intprops.h":
2635 True if the arithmetic type T is signed. */
2636 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2638 /* This loop is actually executed only when end != REG_MISSING,
2639 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2640 already created the start+1-th copy. */
2641 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2642 for (i = start + 2; i <= end; ++i)
2644 elem = duplicate_tree (elem, dfa);
2645 tree = create_tree (dfa, tree, elem, CONCAT);
2646 if (BE (elem == NULL || tree == NULL, 0))
2647 goto parse_dup_op_espace;
2649 tree = create_tree (dfa, tree, NULL, OP_ALT);
2650 if (BE (tree == NULL, 0))
2651 goto parse_dup_op_espace;
2655 tree = create_tree (dfa, old_tree, tree, CONCAT);
2659 parse_dup_op_espace:
2664 /* Size of the names for collating symbol/equivalence_class/character_class.
2665 I'm not sure, but maybe enough. */
2666 #define BRACKET_NAME_BUF_SIZE 32
2669 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2670 Build the range expression which starts from START_ELEM, and ends
2671 at END_ELEM. The result are written to MBCSET and SBCSET.
2672 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2673 mbcset->range_ends, is a pointer argument since we may
2676 static reg_errcode_t
2678 # ifdef RE_ENABLE_I18N
2679 build_range_exp (const reg_syntax_t syntax,
2681 re_charset_t *mbcset,
2683 const bracket_elem_t *start_elem,
2684 const bracket_elem_t *end_elem)
2685 # else /* not RE_ENABLE_I18N */
2686 build_range_exp (const reg_syntax_t syntax,
2688 const bracket_elem_t *start_elem,
2689 const bracket_elem_t *end_elem)
2690 # endif /* not RE_ENABLE_I18N */
2692 unsigned int start_ch, end_ch;
2693 /* Equivalence Classes and Character Classes can't be a range start/end. */
2694 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2695 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2699 /* We can handle no multi character collating elements without libc
2701 if (BE ((start_elem->type == COLL_SYM
2702 && strlen ((char *) start_elem->opr.name) > 1)
2703 || (end_elem->type == COLL_SYM
2704 && strlen ((char *) end_elem->opr.name) > 1), 0))
2705 return REG_ECOLLATE;
2707 # ifdef RE_ENABLE_I18N
2712 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2714 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2715 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2717 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2718 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2720 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2721 ? __btowc (start_ch) : start_elem->opr.wch);
2722 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2723 ? __btowc (end_ch) : end_elem->opr.wch);
2724 if (start_wc == WEOF || end_wc == WEOF)
2725 return REG_ECOLLATE;
2726 cmp_buf[0] = start_wc;
2727 cmp_buf[4] = end_wc;
2729 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2730 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733 /* Got valid collation sequence values, add them as a new entry.
2734 However, for !_LIBC we have no collation elements: if the
2735 character set is single byte, the single byte character set
2736 that we build below suffices. parse_bracket_exp passes
2737 no MBCSET if dfa->mb_cur_max == 1. */
2740 /* Check the space of the arrays. */
2741 if (BE (*range_alloc == mbcset->nranges, 0))
2743 /* There is not enough space, need realloc. */
2744 wchar_t *new_array_start, *new_array_end;
2747 /* +1 in case of mbcset->nranges is 0. */
2748 new_nranges = 2 * mbcset->nranges + 1;
2749 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2750 are NULL if *range_alloc == 0. */
2751 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2753 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2759 mbcset->range_starts = new_array_start;
2760 mbcset->range_ends = new_array_end;
2761 *range_alloc = new_nranges;
2764 mbcset->range_starts[mbcset->nranges] = start_wc;
2765 mbcset->range_ends[mbcset->nranges++] = end_wc;
2768 /* Build the table for single byte characters. */
2769 for (wc = 0; wc < SBC_MAX; ++wc)
2772 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2773 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2774 bitset_set (sbcset, wc);
2777 # else /* not RE_ENABLE_I18N */
2780 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2781 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2784 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786 if (start_ch > end_ch)
2788 /* Build the table for single byte characters. */
2789 for (ch = 0; ch < SBC_MAX; ++ch)
2790 if (start_ch <= ch && ch <= end_ch)
2791 bitset_set (sbcset, ch);
2793 # endif /* not RE_ENABLE_I18N */
2796 #endif /* not _LIBC */
2799 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2800 Build the collating element which is represented by NAME.
2801 The result are written to MBCSET and SBCSET.
2802 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2803 pointer argument since we may update it. */
2805 static reg_errcode_t
2807 # ifdef RE_ENABLE_I18N
2808 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2809 Idx *coll_sym_alloc, const unsigned char *name)
2810 # else /* not RE_ENABLE_I18N */
2811 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2812 # endif /* not RE_ENABLE_I18N */
2814 size_t name_len = strlen ((const char *) name);
2815 if (BE (name_len != 1, 0))
2816 return REG_ECOLLATE;
2819 bitset_set (sbcset, name[0]);
2823 #endif /* not _LIBC */
2825 /* This function parse bracket expression like "[abc]", "[a-c]",
2829 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2830 reg_syntax_t syntax, reg_errcode_t *err)
2833 const unsigned char *collseqmb;
2834 const char *collseqwc;
2837 const int32_t *symb_table;
2838 const unsigned char *extra;
2840 /* Local function for parse_bracket_exp used in _LIBC environment.
2841 Seek the collating symbol entry corresponding to NAME.
2842 Return the index of the symbol in the SYMB_TABLE. */
2845 __attribute ((always_inline))
2846 seek_collating_symbol_entry (name, name_len)
2847 const unsigned char *name;
2850 int32_t hash = elem_hash ((const char *) name, name_len);
2851 int32_t elem = hash % table_size;
2852 if (symb_table[2 * elem] != 0)
2854 int32_t second = hash % (table_size - 2) + 1;
2858 /* First compare the hashing value. */
2859 if (symb_table[2 * elem] == hash
2860 /* Compare the length of the name. */
2861 && name_len == extra[symb_table[2 * elem + 1]]
2862 /* Compare the name. */
2863 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2866 /* Yep, this is the entry. */
2873 while (symb_table[2 * elem] != 0);
2878 /* Local function for parse_bracket_exp used in _LIBC environment.
2879 Look up the collation sequence value of BR_ELEM.
2880 Return the value if succeeded, UINT_MAX otherwise. */
2882 auto inline unsigned int
2883 __attribute ((always_inline))
2884 lookup_collation_sequence_value (br_elem)
2885 bracket_elem_t *br_elem;
2887 if (br_elem->type == SB_CHAR)
2890 if (MB_CUR_MAX == 1)
2893 return collseqmb[br_elem->opr.ch];
2896 wint_t wc = __btowc (br_elem->opr.ch);
2897 return __collseq_table_lookup (collseqwc, wc);
2900 else if (br_elem->type == MB_CHAR)
2903 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2905 else if (br_elem->type == COLL_SYM)
2907 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2911 elem = seek_collating_symbol_entry (br_elem->opr.name,
2913 if (symb_table[2 * elem] != 0)
2915 /* We found the entry. */
2916 idx = symb_table[2 * elem + 1];
2917 /* Skip the name of collating element name. */
2918 idx += 1 + extra[idx];
2919 /* Skip the byte sequence of the collating element. */
2920 idx += 1 + extra[idx];
2921 /* Adjust for the alignment. */
2922 idx = (idx + 3) & ~3;
2923 /* Skip the multibyte collation sequence value. */
2924 idx += sizeof (unsigned int);
2925 /* Skip the wide char sequence of the collating element. */
2926 idx += sizeof (unsigned int) *
2927 (1 + *(unsigned int *) (extra + idx));
2928 /* Return the collation sequence value. */
2929 return *(unsigned int *) (extra + idx);
2931 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2933 /* No valid character. Match it as a single byte
2935 return collseqmb[br_elem->opr.name[0]];
2938 else if (sym_name_len == 1)
2939 return collseqmb[br_elem->opr.name[0]];
2944 /* Local function for parse_bracket_exp used in _LIBC environment.
2945 Build the range expression which starts from START_ELEM, and ends
2946 at END_ELEM. The result are written to MBCSET and SBCSET.
2947 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2948 mbcset->range_ends, is a pointer argument since we may
2951 auto inline reg_errcode_t
2952 __attribute ((always_inline))
2953 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2954 re_charset_t *mbcset;
2957 bracket_elem_t *start_elem, *end_elem;
2960 uint32_t start_collseq;
2961 uint32_t end_collseq;
2963 /* Equivalence Classes and Character Classes can't be a range
2965 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2966 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2970 start_collseq = lookup_collation_sequence_value (start_elem);
2971 end_collseq = lookup_collation_sequence_value (end_elem);
2972 /* Check start/end collation sequence values. */
2973 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2974 return REG_ECOLLATE;
2975 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978 /* Got valid collation sequence values, add them as a new entry.
2979 However, if we have no collation elements, and the character set
2980 is single byte, the single byte character set that we
2981 build below suffices. */
2982 if (nrules > 0 || dfa->mb_cur_max > 1)
2984 /* Check the space of the arrays. */
2985 if (BE (*range_alloc == mbcset->nranges, 0))
2987 /* There is not enough space, need realloc. */
2988 uint32_t *new_array_start;
2989 uint32_t *new_array_end;
2992 /* +1 in case of mbcset->nranges is 0. */
2993 new_nranges = 2 * mbcset->nranges + 1;
2994 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2996 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002 mbcset->range_starts = new_array_start;
3003 mbcset->range_ends = new_array_end;
3004 *range_alloc = new_nranges;
3007 mbcset->range_starts[mbcset->nranges] = start_collseq;
3008 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011 /* Build the table for single byte characters. */
3012 for (ch = 0; ch < SBC_MAX; ch++)
3014 uint32_t ch_collseq;
3016 if (MB_CUR_MAX == 1)
3019 ch_collseq = collseqmb[ch];
3021 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3022 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3023 bitset_set (sbcset, ch);
3028 /* Local function for parse_bracket_exp used in _LIBC environment.
3029 Build the collating element which is represented by NAME.
3030 The result are written to MBCSET and SBCSET.
3031 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3032 pointer argument since we may update it. */
3034 auto inline reg_errcode_t
3035 __attribute ((always_inline))
3036 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3037 re_charset_t *mbcset;
3038 Idx *coll_sym_alloc;
3040 const unsigned char *name;
3043 size_t name_len = strlen ((const char *) name);
3046 elem = seek_collating_symbol_entry (name, name_len);
3047 if (symb_table[2 * elem] != 0)
3049 /* We found the entry. */
3050 idx = symb_table[2 * elem + 1];
3051 /* Skip the name of collating element name. */
3052 idx += 1 + extra[idx];
3054 else if (symb_table[2 * elem] == 0 && name_len == 1)
3056 /* No valid character, treat it as a normal
3058 bitset_set (sbcset, name[0]);
3062 return REG_ECOLLATE;
3064 /* Got valid collation sequence, add it as a new entry. */
3065 /* Check the space of the arrays. */
3066 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3068 /* Not enough, realloc it. */
3069 /* +1 in case of mbcset->ncoll_syms is 0. */
3070 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3071 /* Use realloc since mbcset->coll_syms is NULL
3073 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3074 new_coll_sym_alloc);
3075 if (BE (new_coll_syms == NULL, 0))
3077 mbcset->coll_syms = new_coll_syms;
3078 *coll_sym_alloc = new_coll_sym_alloc;
3080 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085 if (BE (name_len != 1, 0))
3086 return REG_ECOLLATE;
3089 bitset_set (sbcset, name[0]);
3096 re_token_t br_token;
3097 re_bitset_ptr_t sbcset;
3098 #ifdef RE_ENABLE_I18N
3099 re_charset_t *mbcset;
3100 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3101 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3102 #endif /* not RE_ENABLE_I18N */
3103 bool non_match = false;
3104 bin_tree_t *work_tree;
3106 bool first_round = true;
3108 collseqmb = (const unsigned char *)
3109 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3110 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3116 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3117 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3118 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3119 _NL_COLLATE_SYMB_TABLEMB);
3120 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3121 _NL_COLLATE_SYMB_EXTRAMB);
3124 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3125 #ifdef RE_ENABLE_I18N
3126 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3127 #endif /* RE_ENABLE_I18N */
3128 #ifdef RE_ENABLE_I18N
3129 if (BE (sbcset == NULL || mbcset == NULL, 0))
3131 if (BE (sbcset == NULL, 0))
3132 #endif /* RE_ENABLE_I18N */
3135 #ifdef RE_ENABLE_I18N
3142 token_len = peek_token_bracket (token, regexp, syntax);
3143 if (BE (token->type == END_OF_RE, 0))
3146 goto parse_bracket_exp_free_return;
3148 if (token->type == OP_NON_MATCH_LIST)
3150 #ifdef RE_ENABLE_I18N
3151 mbcset->non_match = 1;
3152 #endif /* not RE_ENABLE_I18N */
3154 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3155 bitset_set (sbcset, '\n');
3156 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3157 token_len = peek_token_bracket (token, regexp, syntax);
3158 if (BE (token->type == END_OF_RE, 0))
3161 goto parse_bracket_exp_free_return;
3165 /* We treat the first ']' as a normal character. */
3166 if (token->type == OP_CLOSE_BRACKET)
3167 token->type = CHARACTER;
3171 bracket_elem_t start_elem, end_elem;
3172 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3173 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3176 bool is_range_exp = false;
3179 start_elem.opr.name = start_name_buf;
3180 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3181 syntax, first_round);
3182 if (BE (ret != REG_NOERROR, 0))
3185 goto parse_bracket_exp_free_return;
3187 first_round = false;
3189 /* Get information about the next token. We need it in any case. */
3190 token_len = peek_token_bracket (token, regexp, syntax);
3192 /* Do not check for ranges if we know they are not allowed. */
3193 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3195 if (BE (token->type == END_OF_RE, 0))
3198 goto parse_bracket_exp_free_return;
3200 if (token->type == OP_CHARSET_RANGE)
3202 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3203 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3204 if (BE (token2.type == END_OF_RE, 0))
3207 goto parse_bracket_exp_free_return;
3209 if (token2.type == OP_CLOSE_BRACKET)
3211 /* We treat the last '-' as a normal character. */
3212 re_string_skip_bytes (regexp, -token_len);
3213 token->type = CHARACTER;
3216 is_range_exp = true;
3220 if (is_range_exp == true)
3222 end_elem.opr.name = end_name_buf;
3223 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3225 if (BE (ret != REG_NOERROR, 0))
3228 goto parse_bracket_exp_free_return;
3231 token_len = peek_token_bracket (token, regexp, syntax);
3234 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3235 &start_elem, &end_elem);
3237 # ifdef RE_ENABLE_I18N
3238 *err = build_range_exp (syntax, sbcset,
3239 dfa->mb_cur_max > 1 ? mbcset : NULL,
3240 &range_alloc, &start_elem, &end_elem);
3242 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3244 #endif /* RE_ENABLE_I18N */
3245 if (BE (*err != REG_NOERROR, 0))
3246 goto parse_bracket_exp_free_return;
3250 switch (start_elem.type)
3253 bitset_set (sbcset, start_elem.opr.ch);
3255 #ifdef RE_ENABLE_I18N
3257 /* Check whether the array has enough space. */
3258 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3260 wchar_t *new_mbchars;
3261 /* Not enough, realloc it. */
3262 /* +1 in case of mbcset->nmbchars is 0. */
3263 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3264 /* Use realloc since array is NULL if *alloc == 0. */
3265 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3267 if (BE (new_mbchars == NULL, 0))
3268 goto parse_bracket_exp_espace;
3269 mbcset->mbchars = new_mbchars;
3271 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3273 #endif /* RE_ENABLE_I18N */
3275 *err = build_equiv_class (sbcset,
3276 #ifdef RE_ENABLE_I18N
3277 mbcset, &equiv_class_alloc,
3278 #endif /* RE_ENABLE_I18N */
3279 start_elem.opr.name);
3280 if (BE (*err != REG_NOERROR, 0))
3281 goto parse_bracket_exp_free_return;
3284 *err = build_collating_symbol (sbcset,
3285 #ifdef RE_ENABLE_I18N
3286 mbcset, &coll_sym_alloc,
3287 #endif /* RE_ENABLE_I18N */
3288 start_elem.opr.name);
3289 if (BE (*err != REG_NOERROR, 0))
3290 goto parse_bracket_exp_free_return;
3293 *err = build_charclass (regexp->trans, sbcset,
3294 #ifdef RE_ENABLE_I18N
3295 mbcset, &char_class_alloc,
3296 #endif /* RE_ENABLE_I18N */
3297 start_elem.opr.name, syntax);
3298 if (BE (*err != REG_NOERROR, 0))
3299 goto parse_bracket_exp_free_return;
3306 if (BE (token->type == END_OF_RE, 0))
3309 goto parse_bracket_exp_free_return;
3311 if (token->type == OP_CLOSE_BRACKET)
3315 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3317 /* If it is non-matching list. */
3319 bitset_not (sbcset);
3321 #ifdef RE_ENABLE_I18N
3322 /* Ensure only single byte characters are set. */
3323 if (dfa->mb_cur_max > 1)
3324 bitset_mask (sbcset, dfa->sb_char);
3326 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3327 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3328 || mbcset->non_match)))
3330 bin_tree_t *mbc_tree;
3332 /* Build a tree for complex bracket. */
3333 dfa->has_mb_node = 1;
3334 br_token.type = COMPLEX_BRACKET;
3335 br_token.opr.mbcset = mbcset;
3336 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337 if (BE (mbc_tree == NULL, 0))
3338 goto parse_bracket_exp_espace;
3339 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3340 if (sbcset[sbc_idx])
3342 /* If there are no bits set in sbcset, there is no point
3343 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3344 if (sbc_idx < BITSET_WORDS)
3346 /* Build a tree for simple bracket. */
3347 br_token.type = SIMPLE_BRACKET;
3348 br_token.opr.sbcset = sbcset;
3349 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3350 if (BE (work_tree == NULL, 0))
3351 goto parse_bracket_exp_espace;
3353 /* Then join them by ALT node. */
3354 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3355 if (BE (work_tree == NULL, 0))
3356 goto parse_bracket_exp_espace;
3361 work_tree = mbc_tree;
3365 #endif /* not RE_ENABLE_I18N */
3367 #ifdef RE_ENABLE_I18N
3368 free_charset (mbcset);
3370 /* Build a tree for simple bracket. */
3371 br_token.type = SIMPLE_BRACKET;
3372 br_token.opr.sbcset = sbcset;
3373 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3374 if (BE (work_tree == NULL, 0))
3375 goto parse_bracket_exp_espace;
3379 parse_bracket_exp_espace:
3381 parse_bracket_exp_free_return:
3383 #ifdef RE_ENABLE_I18N
3384 free_charset (mbcset);
3385 #endif /* RE_ENABLE_I18N */
3389 /* Parse an element in the bracket expression. */
3391 static reg_errcode_t
3392 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3393 re_token_t *token, int token_len, re_dfa_t *dfa,
3394 reg_syntax_t syntax, bool accept_hyphen)
3396 #ifdef RE_ENABLE_I18N
3398 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3399 if (cur_char_size > 1)
3401 elem->type = MB_CHAR;
3402 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3403 re_string_skip_bytes (regexp, cur_char_size);
3406 #endif /* RE_ENABLE_I18N */
3407 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3408 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3409 || token->type == OP_OPEN_EQUIV_CLASS)
3410 return parse_bracket_symbol (elem, regexp, token);
3411 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3413 /* A '-' must only appear as anything but a range indicator before
3414 the closing bracket. Everything else is an error. */
3416 (void) peek_token_bracket (&token2, regexp, syntax);
3417 if (token2.type != OP_CLOSE_BRACKET)
3418 /* The actual error value is not standardized since this whole
3419 case is undefined. But ERANGE makes good sense. */
3422 elem->type = SB_CHAR;
3423 elem->opr.ch = token->opr.c;
3427 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3428 such as [:<character_class>:], [.<collating_element>.], and
3429 [=<equivalent_class>=]. */
3431 static reg_errcode_t
3432 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435 unsigned char ch, delim = token->opr.c;
3437 if (re_string_eoi(regexp))
3441 if (i >= BRACKET_NAME_BUF_SIZE)
3443 if (token->type == OP_OPEN_CHAR_CLASS)
3444 ch = re_string_fetch_byte_case (regexp);
3446 ch = re_string_fetch_byte (regexp);
3447 if (re_string_eoi(regexp))
3449 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3451 elem->opr.name[i] = ch;
3453 re_string_skip_bytes (regexp, 1);
3454 elem->opr.name[i] = '\0';
3455 switch (token->type)
3457 case OP_OPEN_COLL_ELEM:
3458 elem->type = COLL_SYM;
3460 case OP_OPEN_EQUIV_CLASS:
3461 elem->type = EQUIV_CLASS;
3463 case OP_OPEN_CHAR_CLASS:
3464 elem->type = CHAR_CLASS;
3472 /* Helper function for parse_bracket_exp.
3473 Build the equivalence class which is represented by NAME.
3474 The result are written to MBCSET and SBCSET.
3475 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3476 is a pointer argument since we may update it. */
3478 static reg_errcode_t
3479 #ifdef RE_ENABLE_I18N
3480 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3481 Idx *equiv_class_alloc, const unsigned char *name)
3482 #else /* not RE_ENABLE_I18N */
3483 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3484 #endif /* not RE_ENABLE_I18N */
3487 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490 const int32_t *table, *indirect;
3491 const unsigned char *weights, *extra, *cp;
3492 unsigned char char_buf[2];
3496 /* This #include defines a local function! */
3497 # include <locale/weight.h>
3498 /* Calculate the index for equivalence class. */
3500 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3501 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502 _NL_COLLATE_WEIGHTMB);
3503 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_EXTRAMB);
3505 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3506 _NL_COLLATE_INDIRECTMB);
3507 idx1 = findidx (&cp, -1);
3508 if (BE (idx1 == 0 || *cp != '\0', 0))
3509 /* This isn't a valid character. */
3510 return REG_ECOLLATE;
3512 /* Build single byte matching table for this equivalence class. */
3513 len = weights[idx1 & 0xffffff];
3514 for (ch = 0; ch < SBC_MAX; ++ch)
3518 idx2 = findidx (&cp, 1);
3523 /* This isn't a valid character. */
3525 /* Compare only if the length matches and the collation rule
3526 index is the same. */
3527 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3531 while (cnt <= len &&
3532 weights[(idx1 & 0xffffff) + 1 + cnt]
3533 == weights[(idx2 & 0xffffff) + 1 + cnt])
3537 bitset_set (sbcset, ch);
3540 /* Check whether the array has enough space. */
3541 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3543 /* Not enough, realloc it. */
3544 /* +1 in case of mbcset->nequiv_classes is 0. */
3545 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3546 /* Use realloc since the array is NULL if *alloc == 0. */
3547 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3549 new_equiv_class_alloc);
3550 if (BE (new_equiv_classes == NULL, 0))
3552 mbcset->equiv_classes = new_equiv_classes;
3553 *equiv_class_alloc = new_equiv_class_alloc;
3555 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3560 if (BE (strlen ((const char *) name) != 1, 0))
3561 return REG_ECOLLATE;
3562 bitset_set (sbcset, *name);
3567 /* Helper function for parse_bracket_exp.
3568 Build the character class which is represented by NAME.
3569 The result are written to MBCSET and SBCSET.
3570 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3571 is a pointer argument since we may update it. */
3573 static reg_errcode_t
3574 #ifdef RE_ENABLE_I18N
3575 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3576 re_charset_t *mbcset, Idx *char_class_alloc,
3577 const unsigned char *class_name, reg_syntax_t syntax)
3578 #else /* not RE_ENABLE_I18N */
3579 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3580 const unsigned char *class_name, reg_syntax_t syntax)
3581 #endif /* not RE_ENABLE_I18N */
3584 const char *name = (const char *) class_name;
3586 /* In case of REG_ICASE "upper" and "lower" match the both of
3587 upper and lower cases. */
3588 if ((syntax & RE_ICASE)
3589 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3592 #ifdef RE_ENABLE_I18N
3593 /* Check the space of the arrays. */
3594 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3596 /* Not enough, realloc it. */
3597 /* +1 in case of mbcset->nchar_classes is 0. */
3598 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3599 /* Use realloc since array is NULL if *alloc == 0. */
3600 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3601 new_char_class_alloc);
3602 if (BE (new_char_classes == NULL, 0))
3604 mbcset->char_classes = new_char_classes;
3605 *char_class_alloc = new_char_class_alloc;
3607 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3608 #endif /* RE_ENABLE_I18N */
3610 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3612 if (BE (trans != NULL, 0)) \
3614 for (i = 0; i < SBC_MAX; ++i) \
3615 if (ctype_func (i)) \
3616 bitset_set (sbcset, trans[i]); \
3620 for (i = 0; i < SBC_MAX; ++i) \
3621 if (ctype_func (i)) \
3622 bitset_set (sbcset, i); \
3626 if (strcmp (name, "alnum") == 0)
3627 BUILD_CHARCLASS_LOOP (isalnum);
3628 else if (strcmp (name, "cntrl") == 0)
3629 BUILD_CHARCLASS_LOOP (iscntrl);
3630 else if (strcmp (name, "lower") == 0)
3631 BUILD_CHARCLASS_LOOP (islower);
3632 else if (strcmp (name, "space") == 0)
3633 BUILD_CHARCLASS_LOOP (isspace);
3634 else if (strcmp (name, "alpha") == 0)
3635 BUILD_CHARCLASS_LOOP (isalpha);
3636 else if (strcmp (name, "digit") == 0)
3637 BUILD_CHARCLASS_LOOP (isdigit);
3638 else if (strcmp (name, "print") == 0)
3639 BUILD_CHARCLASS_LOOP (isprint);
3640 else if (strcmp (name, "upper") == 0)
3641 BUILD_CHARCLASS_LOOP (isupper);
3642 else if (strcmp (name, "blank") == 0)
3643 BUILD_CHARCLASS_LOOP (isblank);
3644 else if (strcmp (name, "graph") == 0)
3645 BUILD_CHARCLASS_LOOP (isgraph);
3646 else if (strcmp (name, "punct") == 0)
3647 BUILD_CHARCLASS_LOOP (ispunct);
3648 else if (strcmp (name, "xdigit") == 0)
3649 BUILD_CHARCLASS_LOOP (isxdigit);
3657 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3658 const unsigned char *class_name,
3659 const unsigned char *extra, bool non_match,
3662 re_bitset_ptr_t sbcset;
3663 #ifdef RE_ENABLE_I18N
3664 re_charset_t *mbcset;
3666 #endif /* not RE_ENABLE_I18N */
3668 re_token_t br_token;
3671 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3672 #ifdef RE_ENABLE_I18N
3673 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3674 #endif /* RE_ENABLE_I18N */
3676 #ifdef RE_ENABLE_I18N
3677 if (BE (sbcset == NULL || mbcset == NULL, 0))
3678 #else /* not RE_ENABLE_I18N */
3679 if (BE (sbcset == NULL, 0))
3680 #endif /* not RE_ENABLE_I18N */
3688 #ifdef RE_ENABLE_I18N
3689 mbcset->non_match = 1;
3690 #endif /* not RE_ENABLE_I18N */
3693 /* We don't care the syntax in this case. */
3694 ret = build_charclass (trans, sbcset,
3695 #ifdef RE_ENABLE_I18N
3697 #endif /* RE_ENABLE_I18N */
3700 if (BE (ret != REG_NOERROR, 0))
3703 #ifdef RE_ENABLE_I18N
3704 free_charset (mbcset);
3705 #endif /* RE_ENABLE_I18N */
3709 /* \w match '_' also. */
3710 for (; *extra; extra++)
3711 bitset_set (sbcset, *extra);
3713 /* If it is non-matching list. */
3715 bitset_not (sbcset);
3717 #ifdef RE_ENABLE_I18N
3718 /* Ensure only single byte characters are set. */
3719 if (dfa->mb_cur_max > 1)
3720 bitset_mask (sbcset, dfa->sb_char);
3723 /* Build a tree for simple bracket. */
3724 br_token.type = SIMPLE_BRACKET;
3725 br_token.opr.sbcset = sbcset;
3726 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3727 if (BE (tree == NULL, 0))
3728 goto build_word_op_espace;
3730 #ifdef RE_ENABLE_I18N
3731 if (dfa->mb_cur_max > 1)
3733 bin_tree_t *mbc_tree;
3734 /* Build a tree for complex bracket. */
3735 br_token.type = COMPLEX_BRACKET;
3736 br_token.opr.mbcset = mbcset;
3737 dfa->has_mb_node = 1;
3738 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3739 if (BE (mbc_tree == NULL, 0))
3740 goto build_word_op_espace;
3741 /* Then join them by ALT node. */
3742 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3743 if (BE (mbc_tree != NULL, 1))
3748 free_charset (mbcset);
3751 #else /* not RE_ENABLE_I18N */
3753 #endif /* not RE_ENABLE_I18N */
3755 build_word_op_espace:
3757 #ifdef RE_ENABLE_I18N
3758 free_charset (mbcset);
3759 #endif /* RE_ENABLE_I18N */
3764 /* This is intended for the expressions like "a{1,3}".
3765 Fetch a number from 'input', and return the number.
3766 Return REG_MISSING if the number field is empty like "{,1}".
3767 Return RE_DUP_MAX + 1 if the number field is too large.
3768 Return REG_ERROR if an error occurred. */
3771 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3773 Idx num = REG_MISSING;
3777 fetch_token (token, input, syntax);
3779 if (BE (token->type == END_OF_RE, 0))
3781 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3783 num = ((token->type != CHARACTER || c < '0' || '9' < c
3784 || num == REG_ERROR)
3786 : num == REG_MISSING
3788 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3793 #ifdef RE_ENABLE_I18N
3795 free_charset (re_charset_t *cset)
3797 re_free (cset->mbchars);
3799 re_free (cset->coll_syms);
3800 re_free (cset->equiv_classes);
3801 re_free (cset->range_starts);
3802 re_free (cset->range_ends);
3804 re_free (cset->char_classes);
3807 #endif /* RE_ENABLE_I18N */
3809 /* Functions for binary tree operation. */
3811 /* Create a tree node. */
3814 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815 re_token_type_t type)
3819 return create_token_tree (dfa, left, right, &t);
3823 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824 const re_token_t *token)
3827 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3829 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3831 if (storage == NULL)
3833 storage->next = dfa->str_tree_storage;
3834 dfa->str_tree_storage = storage;
3835 dfa->str_tree_storage_idx = 0;
3837 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3839 tree->parent = NULL;
3841 tree->right = right;
3842 tree->token = *token;
3843 tree->token.duplicated = 0;
3844 tree->token.opt_subexp = 0;
3847 tree->node_idx = REG_MISSING;
3850 left->parent = tree;
3852 right->parent = tree;
3856 /* Mark the tree SRC as an optional subexpression.
3857 To be called from preorder or postorder. */
3859 static reg_errcode_t
3860 mark_opt_subexp (void *extra, bin_tree_t *node)
3862 Idx idx = (uintptr_t) extra;
3863 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3864 node->token.opt_subexp = 1;
3869 /* Free the allocated memory inside NODE. */
3872 free_token (re_token_t *node)
3874 #ifdef RE_ENABLE_I18N
3875 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3876 free_charset (node->opr.mbcset);
3878 #endif /* RE_ENABLE_I18N */
3879 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3880 re_free (node->opr.sbcset);
3883 /* Worker function for tree walking. Free the allocated memory inside NODE
3884 and its children. */
3886 static reg_errcode_t
3887 free_tree (void *extra, bin_tree_t *node)
3889 free_token (&node->token);
3894 /* Duplicate the node SRC, and return new node. This is a preorder
3895 visit similar to the one implemented by the generic visitor, but
3896 we need more infrastructure to maintain two parallel trees --- so,
3897 it's easier to duplicate. */
3900 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3902 const bin_tree_t *node;
3903 bin_tree_t *dup_root;
3904 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3906 for (node = root; ; )
3908 /* Create a new tree and link it back to the current parent. */
3909 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3912 (*p_new)->parent = dup_node;
3913 (*p_new)->token.duplicated = 1;
3916 /* Go to the left node, or up and to the right. */
3920 p_new = &dup_node->left;
3924 const bin_tree_t *prev = NULL;
3925 while (node->right == prev || node->right == NULL)
3928 node = node->parent;
3929 dup_node = dup_node->parent;
3934 p_new = &dup_node->right;