1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
3 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
215 re_compile_pattern (pattern, length, bufp)
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 weak_alias (__re_compile_pattern, re_compile_pattern)
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
261 re_set_syntax (syntax)
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
270 weak_alias (__re_set_syntax, re_set_syntax)
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 re_set_fastmap (fastmap, false, buf[0]);
345 else if (type == SIMPLE_BRACKET)
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
361 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
363 || cset->nranges || cset->nchar_classes)
366 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
368 /* In this case we want to catch the bytes which are
369 the first byte of any collation elements.
370 e.g. In da_DK, we want to catch 'a' since "aa"
371 is a valid collation element, and don't catch
372 'b' since 'b' is the only collation element
373 which starts from 'b'. */
374 const int32_t *table = (const int32_t *)
375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 for (i = 0; i < SBC_MAX; ++i)
378 re_set_fastmap (fastmap, icase, i);
381 if (dfa->mb_cur_max > 1)
382 for (i = 0; i < SBC_MAX; ++i)
383 if (__btowc (i) == WEOF)
384 re_set_fastmap (fastmap, icase, i);
385 # endif /* not _LIBC */
387 for (i = 0; i < cset->nmbchars; ++i)
391 memset (&state, '\0', sizeof (state));
392 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
393 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
394 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
396 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
398 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
402 #endif /* RE_ENABLE_I18N */
403 else if (type == OP_PERIOD
404 #ifdef RE_ENABLE_I18N
405 || type == OP_UTF8_PERIOD
406 #endif /* RE_ENABLE_I18N */
407 || type == END_OF_RE)
409 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
410 if (type == END_OF_RE)
411 bufp->can_be_null = 1;
417 /* Entry point for POSIX code. */
418 /* regcomp takes a regular expression as a string and compiles it.
420 PREG is a regex_t *. We do not expect any fields to be initialized,
421 since POSIX says we shouldn't. Thus, we set
423 `buffer' to the compiled pattern;
424 `used' to the length of the compiled pattern;
425 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
426 REG_EXTENDED bit in CFLAGS is set; otherwise, to
427 RE_SYNTAX_POSIX_BASIC;
428 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
429 `fastmap' to an allocated space for the fastmap;
430 `fastmap_accurate' to zero;
431 `re_nsub' to the number of subexpressions in PATTERN.
433 PATTERN is the address of the pattern string.
435 CFLAGS is a series of bits which affect compilation.
437 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
438 use POSIX basic syntax.
440 If REG_NEWLINE is set, then . and [^...] don't match newline.
441 Also, regexec will try a match beginning after every newline.
443 If REG_ICASE is set, then we considers upper- and lowercase
444 versions of letters to be equivalent when matching.
446 If REG_NOSUB is set, then when PREG is passed to regexec, that
447 routine will report only success or failure, and nothing about the
450 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
451 the return codes and their meanings.) */
454 regcomp (preg, pattern, cflags)
455 regex_t *_Restrict_ preg;
456 const char *_Restrict_ pattern;
460 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
461 : RE_SYNTAX_POSIX_BASIC);
467 /* Try to allocate space for the fastmap. */
468 preg->fastmap = re_malloc (char, SBC_MAX);
469 if (BE (preg->fastmap == NULL, 0))
472 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
474 /* If REG_NEWLINE is set, newlines are treated differently. */
475 if (cflags & REG_NEWLINE)
476 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
477 syntax &= ~RE_DOT_NEWLINE;
478 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
479 /* It also changes the matching behavior. */
480 preg->newline_anchor = 1;
483 preg->newline_anchor = 0;
484 preg->no_sub = !!(cflags & REG_NOSUB);
485 preg->translate = NULL;
487 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
489 /* POSIX doesn't distinguish between an unmatched open-group and an
490 unmatched close-group: both are REG_EPAREN. */
491 if (ret == REG_ERPAREN)
494 /* We have already checked preg->fastmap != NULL. */
495 if (BE (ret == REG_NOERROR, 1))
496 /* Compute the fastmap now, since regexec cannot modify the pattern
497 buffer. This function never fails in this implementation. */
498 (void) re_compile_fastmap (preg);
501 /* Some error occurred while compiling the expression. */
502 re_free (preg->fastmap);
503 preg->fastmap = NULL;
509 weak_alias (__regcomp, regcomp)
512 /* Returns a message corresponding to an error code, ERRCODE, returned
513 from either regcomp or regexec. We don't use PREG here. */
517 regerror (errcode, preg, errbuf, errbuf_size)
519 const regex_t *_Restrict_ preg;
520 char *_Restrict_ errbuf;
522 #else /* size_t might promote */
524 regerror (int errcode, const regex_t *_Restrict_ preg,
525 char *_Restrict_ errbuf, size_t errbuf_size)
532 || errcode >= (int) (sizeof (__re_error_msgid_idx)
533 / sizeof (__re_error_msgid_idx[0])), 0))
534 /* Only error codes returned by the rest of the code should be passed
535 to this routine. If we are given anything else, or if other regex
536 code generates an invalid error code, then the program has a bug.
537 Dump core so we can fix it. */
540 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 msg_size = strlen (msg) + 1; /* Includes the null. */
544 if (BE (errbuf_size != 0, 1))
546 size_t cpy_size = msg_size;
547 if (BE (msg_size > errbuf_size, 0))
549 cpy_size = errbuf_size - 1;
550 errbuf[cpy_size] = '\0';
552 memcpy (errbuf, msg, cpy_size);
558 weak_alias (__regerror, regerror)
562 #ifdef RE_ENABLE_I18N
563 /* This static array is used for the map to single-byte characters when
564 UTF-8 is used. Otherwise we would allocate memory just to initialize
565 it the same all the time. UTF-8 is the preferred encoding so this is
566 a worthwhile optimization. */
567 static const bitset_t utf8_sb_map =
569 /* Set the first 128 bits. */
570 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
571 # error "bitset_word_t is narrower than 32 bits"
572 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
573 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
574 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
575 BITSET_WORD_MAX, BITSET_WORD_MAX,
576 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
580 >> (SBC_MAX % BITSET_WORD_BITS == 0
582 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588 free_dfa_content (re_dfa_t *dfa)
593 for (i = 0; i < dfa->nodes_len; ++i)
594 free_token (dfa->nodes + i);
595 re_free (dfa->nexts);
596 for (i = 0; i < dfa->nodes_len; ++i)
598 if (dfa->eclosures != NULL)
599 re_node_set_free (dfa->eclosures + i);
600 if (dfa->inveclosures != NULL)
601 re_node_set_free (dfa->inveclosures + i);
602 if (dfa->edests != NULL)
603 re_node_set_free (dfa->edests + i);
605 re_free (dfa->edests);
606 re_free (dfa->eclosures);
607 re_free (dfa->inveclosures);
608 re_free (dfa->nodes);
610 if (dfa->state_table)
611 for (i = 0; i <= dfa->state_hash_mask; ++i)
613 struct re_state_table_entry *entry = dfa->state_table + i;
614 for (j = 0; j < entry->num; ++j)
616 re_dfastate_t *state = entry->array[j];
619 re_free (entry->array);
621 re_free (dfa->state_table);
622 #ifdef RE_ENABLE_I18N
623 if (dfa->sb_char != utf8_sb_map)
624 re_free (dfa->sb_char);
626 re_free (dfa->subexp_map);
628 re_free (dfa->re_str);
635 /* Free dynamically allocated space used by PREG. */
641 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642 if (BE (dfa != NULL, 1))
643 free_dfa_content (dfa);
647 re_free (preg->fastmap);
648 preg->fastmap = NULL;
650 re_free (preg->translate);
651 preg->translate = NULL;
654 weak_alias (__regfree, regfree)
657 /* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
660 #if defined _REGEX_RE_COMP || defined _LIBC
662 /* BSD has one and only one pattern buffer. */
663 static struct re_pattern_buffer re_comp_buf;
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
680 if (!re_comp_buf.buffer)
681 return gettext ("No previous regular expression");
685 if (re_comp_buf.buffer)
687 fastmap = re_comp_buf.fastmap;
688 re_comp_buf.fastmap = NULL;
689 __regfree (&re_comp_buf);
690 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691 re_comp_buf.fastmap = fastmap;
694 if (re_comp_buf.fastmap == NULL)
696 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697 if (re_comp_buf.fastmap == NULL)
698 return (char *) gettext (__re_error_msgid
699 + __re_error_msgid_idx[(int) REG_ESPACE]);
702 /* Since `re_exec' always passes NULL for the `regs' argument, we
703 don't need to initialize the pattern buffer fields which affect it. */
705 /* Match anchors at newlines. */
706 re_comp_buf.newline_anchor = 1;
708 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
718 libc_freeres_fn (free_mem)
720 __regfree (&re_comp_buf);
724 #endif /* _REGEX_RE_COMP */
726 /* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
734 reg_errcode_t err = REG_NOERROR;
738 /* Initialize the pattern buffer. */
739 preg->fastmap_accurate = 0;
740 preg->syntax = syntax;
741 preg->not_bol = preg->not_eol = 0;
744 preg->can_be_null = 0;
745 preg->regs_allocated = REGS_UNALLOCATED;
747 /* Initialize the dfa. */
748 dfa = (re_dfa_t *) preg->buffer;
749 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
758 preg->allocated = sizeof (re_dfa_t);
759 preg->buffer = (unsigned char *) dfa;
761 preg->used = sizeof (re_dfa_t);
763 err = init_dfa (dfa, length);
764 if (BE (err != REG_NOERROR, 0))
766 free_dfa_content (dfa);
772 /* Note: length+1 will not overflow since it is checked in init_dfa. */
773 dfa->re_str = re_malloc (char, length + 1);
774 strncpy (dfa->re_str, pattern, length + 1);
777 __libc_lock_init (dfa->lock);
779 err = re_string_construct (®exp, pattern, length, preg->translate,
780 (syntax & RE_ICASE) != 0, dfa);
781 if (BE (err != REG_NOERROR, 0))
783 re_compile_internal_free_return:
784 free_workarea_compile (preg);
785 re_string_destruct (®exp);
786 free_dfa_content (dfa);
792 /* Parse the regular expression, and build a structure tree. */
794 dfa->str_tree = parse (®exp, preg, syntax, &err);
795 if (BE (dfa->str_tree == NULL, 0))
796 goto re_compile_internal_free_return;
798 /* Analyze the tree and create the nfa. */
799 err = analyze (preg);
800 if (BE (err != REG_NOERROR, 0))
801 goto re_compile_internal_free_return;
803 #ifdef RE_ENABLE_I18N
804 /* If possible, do searching in single byte encoding to speed things up. */
805 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
809 /* Then create the initial state of the dfa. */
810 err = create_initial_state (dfa);
812 /* Release work areas. */
813 free_workarea_compile (preg);
814 re_string_destruct (®exp);
816 if (BE (err != REG_NOERROR, 0))
818 free_dfa_content (dfa);
826 /* Initialize DFA. We use the length of the regular expression PAT_LEN
827 as the initial length of some arrays. */
830 init_dfa (re_dfa_t *dfa, size_t pat_len)
832 __re_size_t table_size;
833 #ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
836 size_t max_i18n_object_size = 0;
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
845 memset (dfa, '\0', sizeof (re_dfa_t));
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
854 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
857 dfa->nodes_alloc = pat_len + 1;
858 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860 /* table_size = 2 ^ ceil(log pat_len) */
861 for (table_size = 1; ; table_size <<= 1)
862 if (table_size > pat_len)
865 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
866 dfa->state_hash_mask = table_size - 1;
868 dfa->mb_cur_max = MB_CUR_MAX;
870 if (dfa->mb_cur_max == 6
871 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 if (strcmp (locale_charset (), "UTF-8") == 0)
879 /* We check exhaustively in the loop below if this charset is a
880 superset of ASCII. */
881 dfa->map_notascii = 0;
884 #ifdef RE_ENABLE_I18N
885 if (dfa->mb_cur_max > 1)
888 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
894 if (BE (dfa->sb_char == NULL, 0))
897 /* Set the bits corresponding to single byte chars. */
898 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
899 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
901 wint_t wch = __btowc (ch);
903 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
905 if (isascii (ch) && wch != ch)
906 dfa->map_notascii = 1;
913 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918 /* Initialize WORD_CHAR table, which indicate which character is
919 "word". In this case "word" means that it is the word construction
920 character used by some operators like "\<", "\>", etc. */
924 init_word_char (re_dfa_t *dfa)
927 dfa->word_ops_used = 1;
928 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
929 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
930 if (isalnum (ch) || ch == '_')
931 dfa->word_char[i] |= (bitset_word_t) 1 << j;
934 /* Free the work area which are only used while compiling. */
937 free_workarea_compile (regex_t *preg)
939 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
940 bin_tree_storage_t *storage, *next;
941 for (storage = dfa->str_tree_storage; storage; storage = next)
943 next = storage->next;
946 dfa->str_tree_storage = NULL;
947 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
948 dfa->str_tree = NULL;
949 re_free (dfa->org_indices);
950 dfa->org_indices = NULL;
953 /* Create initial states for all contexts. */
956 create_initial_state (re_dfa_t *dfa)
960 re_node_set init_nodes;
962 /* Initial states have the epsilon closure of the node which is
963 the first node of the regular expression. */
964 first = dfa->str_tree->first->node_idx;
965 dfa->init_node = first;
966 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
967 if (BE (err != REG_NOERROR, 0))
970 /* The back-references which are in initial states can epsilon transit,
971 since in this case all of the subexpressions can be null.
972 Then we add epsilon closures of the nodes which are the next nodes of
973 the back-references. */
974 if (dfa->nbackref > 0)
975 for (i = 0; i < init_nodes.nelem; ++i)
977 Idx node_idx = init_nodes.elems[i];
978 re_token_type_t type = dfa->nodes[node_idx].type;
981 if (type != OP_BACK_REF)
983 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
985 re_token_t *clexp_node;
986 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
987 if (clexp_node->type == OP_CLOSE_SUBEXP
988 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
991 if (clexp_idx == init_nodes.nelem)
994 if (type == OP_BACK_REF)
996 Idx dest_idx = dfa->edests[node_idx].elems[0];
997 if (!re_node_set_contains (&init_nodes, dest_idx))
999 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1005 /* It must be the first time to invoke acquire_state. */
1006 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1007 /* We don't check ERR here, since the initial state must not be NULL. */
1008 if (BE (dfa->init_state == NULL, 0))
1010 if (dfa->init_state->has_constraint)
1012 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1014 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1016 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1020 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1021 || dfa->init_state_begbuf == NULL, 0))
1025 dfa->init_state_word = dfa->init_state_nl
1026 = dfa->init_state_begbuf = dfa->init_state;
1028 re_node_set_free (&init_nodes);
1032 #ifdef RE_ENABLE_I18N
1033 /* If it is possible to do searching in single byte encoding instead of UTF-8
1034 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1035 DFA nodes where needed. */
1038 optimize_utf8 (re_dfa_t *dfa)
1042 bool mb_chars = false;
1043 bool has_period = false;
1045 for (node = 0; node < dfa->nodes_len; ++node)
1046 switch (dfa->nodes[node].type)
1049 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1053 switch (dfa->nodes[node].opr.ctx_type)
1061 /* Word anchors etc. cannot be handled. It's okay to test
1062 opr.ctx_type since constraints (for all DFA nodes) are
1063 created by ORing one or more opr.ctx_type values. */
1073 case OP_DUP_ASTERISK:
1074 case OP_OPEN_SUBEXP:
1075 case OP_CLOSE_SUBEXP:
1077 case COMPLEX_BRACKET:
1079 case SIMPLE_BRACKET:
1080 /* Just double check. */
1082 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1084 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1085 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1087 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1097 if (mb_chars || has_period)
1098 for (node = 0; node < dfa->nodes_len; ++node)
1100 if (dfa->nodes[node].type == CHARACTER
1101 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1102 dfa->nodes[node].mb_partial = 0;
1103 else if (dfa->nodes[node].type == OP_PERIOD)
1104 dfa->nodes[node].type = OP_UTF8_PERIOD;
1107 /* The search can be in single byte locale. */
1108 dfa->mb_cur_max = 1;
1110 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1114 /* Analyze the structure tree, and calculate "first", "next", "edest",
1115 "eclosure", and "inveclosure". */
1117 static reg_errcode_t
1118 analyze (regex_t *preg)
1120 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1123 /* Allocate arrays. */
1124 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1125 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1126 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1127 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1128 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1129 || dfa->eclosures == NULL, 0))
1132 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1133 if (dfa->subexp_map != NULL)
1136 for (i = 0; i < preg->re_nsub; i++)
1137 dfa->subexp_map[i] = i;
1138 preorder (dfa->str_tree, optimize_subexps, dfa);
1139 for (i = 0; i < preg->re_nsub; i++)
1140 if (dfa->subexp_map[i] != i)
1142 if (i == preg->re_nsub)
1144 free (dfa->subexp_map);
1145 dfa->subexp_map = NULL;
1149 ret = postorder (dfa->str_tree, lower_subexps, preg);
1150 if (BE (ret != REG_NOERROR, 0))
1152 ret = postorder (dfa->str_tree, calc_first, dfa);
1153 if (BE (ret != REG_NOERROR, 0))
1155 preorder (dfa->str_tree, calc_next, dfa);
1156 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1157 if (BE (ret != REG_NOERROR, 0))
1159 ret = calc_eclosure (dfa);
1160 if (BE (ret != REG_NOERROR, 0))
1163 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1164 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1165 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1168 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1169 if (BE (dfa->inveclosures == NULL, 0))
1171 ret = calc_inveclosure (dfa);
1177 /* Our parse trees are very unbalanced, so we cannot use a stack to
1178 implement parse tree visits. Instead, we use parent pointers and
1179 some hairy code in these two functions. */
1180 static reg_errcode_t
1181 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1184 bin_tree_t *node, *prev;
1186 for (node = root; ; )
1188 /* Descend down the tree, preferably to the left (or to the right
1189 if that's the only child). */
1190 while (node->left || node->right)
1198 reg_errcode_t err = fn (extra, node);
1199 if (BE (err != REG_NOERROR, 0))
1201 if (node->parent == NULL)
1204 node = node->parent;
1206 /* Go up while we have a node that is reached from the right. */
1207 while (node->right == prev || node->right == NULL);
1212 static reg_errcode_t
1213 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1218 for (node = root; ; )
1220 reg_errcode_t err = fn (extra, node);
1221 if (BE (err != REG_NOERROR, 0))
1224 /* Go to the left node, or up and to the right. */
1229 bin_tree_t *prev = NULL;
1230 while (node->right == prev || node->right == NULL)
1233 node = node->parent;
1242 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1243 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1244 backreferences as well. Requires a preorder visit. */
1245 static reg_errcode_t
1246 optimize_subexps (void *extra, bin_tree_t *node)
1248 re_dfa_t *dfa = (re_dfa_t *) extra;
1250 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1252 int idx = node->token.opr.idx;
1253 node->token.opr.idx = dfa->subexp_map[idx];
1254 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1257 else if (node->token.type == SUBEXP
1258 && node->left && node->left->token.type == SUBEXP)
1260 Idx other_idx = node->left->token.opr.idx;
1262 node->left = node->left->left;
1264 node->left->parent = node;
1266 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1267 if (other_idx < BITSET_WORD_BITS)
1268 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1274 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1275 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1276 static reg_errcode_t
1277 lower_subexps (void *extra, bin_tree_t *node)
1279 regex_t *preg = (regex_t *) extra;
1280 reg_errcode_t err = REG_NOERROR;
1282 if (node->left && node->left->token.type == SUBEXP)
1284 node->left = lower_subexp (&err, preg, node->left);
1286 node->left->parent = node;
1288 if (node->right && node->right->token.type == SUBEXP)
1290 node->right = lower_subexp (&err, preg, node->right);
1292 node->right->parent = node;
1299 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1301 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1302 bin_tree_t *body = node->left;
1303 bin_tree_t *op, *cls, *tree1, *tree;
1306 /* We do not optimize empty subexpressions, because otherwise we may
1307 have bad CONCAT nodes with NULL children. This is obviously not
1308 very common, so we do not lose much. An example that triggers
1309 this case is the sed "script" /\(\)/x. */
1310 && node->left != NULL
1311 && (node->token.opr.idx >= BITSET_WORD_BITS
1312 || !(dfa->used_bkref_map
1313 & ((bitset_word_t) 1 << node->token.opr.idx))))
1316 /* Convert the SUBEXP node to the concatenation of an
1317 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1318 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1319 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1320 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1321 tree = create_tree (dfa, op, tree1, CONCAT);
1322 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1328 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1329 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1333 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1334 nodes. Requires a postorder visit. */
1335 static reg_errcode_t
1336 calc_first (void *extra, bin_tree_t *node)
1338 re_dfa_t *dfa = (re_dfa_t *) extra;
1339 if (node->token.type == CONCAT)
1341 node->first = node->left->first;
1342 node->node_idx = node->left->node_idx;
1347 node->node_idx = re_dfa_add_node (dfa, node->token);
1348 if (BE (node->node_idx == REG_MISSING, 0))
1350 if (node->token.type == ANCHOR)
1351 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1356 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1357 static reg_errcode_t
1358 calc_next (void *extra, bin_tree_t *node)
1360 switch (node->token.type)
1362 case OP_DUP_ASTERISK:
1363 node->left->next = node;
1366 node->left->next = node->right->first;
1367 node->right->next = node->next;
1371 node->left->next = node->next;
1373 node->right->next = node->next;
1379 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1380 static reg_errcode_t
1381 link_nfa_nodes (void *extra, bin_tree_t *node)
1383 re_dfa_t *dfa = (re_dfa_t *) extra;
1384 Idx idx = node->node_idx;
1385 reg_errcode_t err = REG_NOERROR;
1387 switch (node->token.type)
1393 assert (node->next == NULL);
1396 case OP_DUP_ASTERISK:
1400 dfa->has_plural_match = 1;
1401 if (node->left != NULL)
1402 left = node->left->first->node_idx;
1404 left = node->next->node_idx;
1405 if (node->right != NULL)
1406 right = node->right->first->node_idx;
1408 right = node->next->node_idx;
1409 assert (REG_VALID_INDEX (left));
1410 assert (REG_VALID_INDEX (right));
1411 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1416 case OP_OPEN_SUBEXP:
1417 case OP_CLOSE_SUBEXP:
1418 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1422 dfa->nexts[idx] = node->next->node_idx;
1423 if (node->token.type == OP_BACK_REF)
1424 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1428 assert (!IS_EPSILON_NODE (node->token.type));
1429 dfa->nexts[idx] = node->next->node_idx;
1436 /* Duplicate the epsilon closure of the node ROOT_NODE.
1437 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1438 to their own constraint. */
1440 static reg_errcode_t
1442 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1443 Idx root_node, unsigned int init_constraint)
1445 Idx org_node, clone_node;
1447 unsigned int constraint = init_constraint;
1448 for (org_node = top_org_node, clone_node = top_clone_node;;)
1450 Idx org_dest, clone_dest;
1451 if (dfa->nodes[org_node].type == OP_BACK_REF)
1453 /* If the back reference epsilon-transit, its destination must
1454 also have the constraint. Then duplicate the epsilon closure
1455 of the destination of the back reference, and store it in
1456 edests of the back reference. */
1457 org_dest = dfa->nexts[org_node];
1458 re_node_set_empty (dfa->edests + clone_node);
1459 clone_dest = duplicate_node (dfa, org_dest, constraint);
1460 if (BE (clone_dest == REG_MISSING, 0))
1462 dfa->nexts[clone_node] = dfa->nexts[org_node];
1463 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1467 else if (dfa->edests[org_node].nelem == 0)
1469 /* In case of the node can't epsilon-transit, don't duplicate the
1470 destination and store the original destination as the
1471 destination of the node. */
1472 dfa->nexts[clone_node] = dfa->nexts[org_node];
1475 else if (dfa->edests[org_node].nelem == 1)
1477 /* In case of the node can epsilon-transit, and it has only one
1479 org_dest = dfa->edests[org_node].elems[0];
1480 re_node_set_empty (dfa->edests + clone_node);
1481 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1482 /* If the node is root_node itself, it means the epsilon closure
1483 has a loop. Then tie it to the destination of the root_node. */
1484 if (org_node == root_node && clone_node != org_node)
1486 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1491 /* In case the node has another constraint, append it. */
1492 constraint |= dfa->nodes[org_node].constraint;
1493 clone_dest = duplicate_node (dfa, org_dest, constraint);
1494 if (BE (clone_dest == REG_MISSING, 0))
1496 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500 else /* dfa->edests[org_node].nelem == 2 */
1502 /* In case of the node can epsilon-transit, and it has two
1503 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1504 org_dest = dfa->edests[org_node].elems[0];
1505 re_node_set_empty (dfa->edests + clone_node);
1506 /* Search for a duplicated node which satisfies the constraint. */
1507 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1508 if (clone_dest == REG_MISSING)
1510 /* There is no such duplicated node, create a new one. */
1512 clone_dest = duplicate_node (dfa, org_dest, constraint);
1513 if (BE (clone_dest == REG_MISSING, 0))
1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1518 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1519 root_node, constraint);
1520 if (BE (err != REG_NOERROR, 0))
1525 /* There is a duplicated node which satisfy the constraint,
1526 use it to avoid infinite loop. */
1527 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1532 org_dest = dfa->edests[org_node].elems[1];
1533 clone_dest = duplicate_node (dfa, org_dest, constraint);
1534 if (BE (clone_dest == REG_MISSING, 0))
1536 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540 org_node = org_dest;
1541 clone_node = clone_dest;
1546 /* Search for a node which is duplicated from the node ORG_NODE, and
1547 satisfies the constraint CONSTRAINT. */
1550 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1551 unsigned int constraint)
1554 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1556 if (org_node == dfa->org_indices[idx]
1557 && constraint == dfa->nodes[idx].constraint)
1558 return idx; /* Found. */
1560 return REG_MISSING; /* Not found. */
1563 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1564 Return the index of the new node, or REG_MISSING if insufficient storage is
1568 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1570 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1571 if (BE (dup_idx != REG_MISSING, 1))
1573 dfa->nodes[dup_idx].constraint = constraint;
1574 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1575 dfa->nodes[dup_idx].duplicated = 1;
1577 /* Store the index of the original node. */
1578 dfa->org_indices[dup_idx] = org_idx;
1583 static reg_errcode_t
1584 calc_inveclosure (re_dfa_t *dfa)
1588 for (idx = 0; idx < dfa->nodes_len; ++idx)
1589 re_node_set_init_empty (dfa->inveclosures + idx);
1591 for (src = 0; src < dfa->nodes_len; ++src)
1593 Idx *elems = dfa->eclosures[src].elems;
1594 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1596 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1605 /* Calculate "eclosure" for all the node in DFA. */
1607 static reg_errcode_t
1608 calc_eclosure (re_dfa_t *dfa)
1613 assert (dfa->nodes_len > 0);
1616 /* For each nodes, calculate epsilon closure. */
1617 for (node_idx = 0; ; ++node_idx)
1620 re_node_set eclosure_elem;
1621 if (node_idx == dfa->nodes_len)
1630 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1633 /* If we have already calculated, skip it. */
1634 if (dfa->eclosures[node_idx].nelem != 0)
1636 /* Calculate epsilon closure of `node_idx'. */
1637 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1638 if (BE (err != REG_NOERROR, 0))
1641 if (dfa->eclosures[node_idx].nelem == 0)
1644 re_node_set_free (&eclosure_elem);
1650 /* Calculate epsilon closure of NODE. */
1652 static reg_errcode_t
1653 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1659 re_node_set eclosure;
1661 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662 if (BE (err != REG_NOERROR, 0))
1665 /* This indicates that we are calculating this node now.
1666 We reference this value to avoid infinite loop. */
1667 dfa->eclosures[node].nelem = REG_MISSING;
1669 /* If the current node has constraints, duplicate all nodes
1670 since they must inherit the constraints. */
1671 if (dfa->nodes[node].constraint
1672 && dfa->edests[node].nelem
1673 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1675 err = duplicate_node_closure (dfa, node, node, node,
1676 dfa->nodes[node].constraint);
1677 if (BE (err != REG_NOERROR, 0))
1681 /* Expand each epsilon destination nodes. */
1682 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1683 for (i = 0; i < dfa->edests[node].nelem; ++i)
1685 re_node_set eclosure_elem;
1686 Idx edest = dfa->edests[node].elems[i];
1687 /* If calculating the epsilon closure of `edest' is in progress,
1688 return intermediate result. */
1689 if (dfa->eclosures[edest].nelem == REG_MISSING)
1694 /* If we haven't calculated the epsilon closure of `edest' yet,
1695 calculate now. Otherwise use calculated epsilon closure. */
1696 if (dfa->eclosures[edest].nelem == 0)
1698 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1699 if (BE (err != REG_NOERROR, 0))
1703 eclosure_elem = dfa->eclosures[edest];
1704 /* Merge the epsilon closure of `edest'. */
1705 re_node_set_merge (&eclosure, &eclosure_elem);
1706 /* If the epsilon closure of `edest' is incomplete,
1707 the epsilon closure of this node is also incomplete. */
1708 if (dfa->eclosures[edest].nelem == 0)
1711 re_node_set_free (&eclosure_elem);
1715 /* Epsilon closures include itself. */
1716 ok = re_node_set_insert (&eclosure, node);
1719 if (incomplete && !root)
1720 dfa->eclosures[node].nelem = 0;
1722 dfa->eclosures[node] = eclosure;
1723 *new_set = eclosure;
1727 /* Functions for token which are used in the parser. */
1729 /* Fetch a token from INPUT.
1730 We must not use this function inside bracket expressions. */
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 re_string_skip_bytes (input, peek_token (result, input, syntax));
1739 /* Peek a token from INPUT, and return the length of the token.
1740 We must not use this function inside bracket expressions. */
1744 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1748 if (re_string_eoi (input))
1750 token->type = END_OF_RE;
1754 c = re_string_peek_byte (input, 0);
1757 token->word_char = 0;
1758 #ifdef RE_ENABLE_I18N
1759 token->mb_partial = 0;
1760 if (input->mb_cur_max > 1 &&
1761 !re_string_first_byte (input, re_string_cur_idx (input)))
1763 token->type = CHARACTER;
1764 token->mb_partial = 1;
1771 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773 token->type = BACK_SLASH;
1777 c2 = re_string_peek_byte_case (input, 1);
1779 token->type = CHARACTER;
1780 #ifdef RE_ENABLE_I18N
1781 if (input->mb_cur_max > 1)
1783 wint_t wc = re_string_wchar_at (input,
1784 re_string_cur_idx (input) + 1);
1785 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1789 token->word_char = IS_WORD_CHAR (c2) != 0;
1794 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1795 token->type = OP_ALT;
1797 case '1': case '2': case '3': case '4': case '5':
1798 case '6': case '7': case '8': case '9':
1799 if (!(syntax & RE_NO_BK_REFS))
1801 token->type = OP_BACK_REF;
1802 token->opr.idx = c2 - '1';
1806 if (!(syntax & RE_NO_GNU_OPS))
1808 token->type = ANCHOR;
1809 token->opr.ctx_type = WORD_FIRST;
1813 if (!(syntax & RE_NO_GNU_OPS))
1815 token->type = ANCHOR;
1816 token->opr.ctx_type = WORD_LAST;
1820 if (!(syntax & RE_NO_GNU_OPS))
1822 token->type = ANCHOR;
1823 token->opr.ctx_type = WORD_DELIM;
1827 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = ANCHOR;
1830 token->opr.ctx_type = NOT_WORD_DELIM;
1834 if (!(syntax & RE_NO_GNU_OPS))
1835 token->type = OP_WORD;
1838 if (!(syntax & RE_NO_GNU_OPS))
1839 token->type = OP_NOTWORD;
1842 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = OP_SPACE;
1846 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = OP_NOTSPACE;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = BUF_FIRST;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = BUF_LAST;
1864 if (!(syntax & RE_NO_BK_PARENS))
1865 token->type = OP_OPEN_SUBEXP;
1868 if (!(syntax & RE_NO_BK_PARENS))
1869 token->type = OP_CLOSE_SUBEXP;
1872 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1873 token->type = OP_DUP_PLUS;
1876 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1877 token->type = OP_DUP_QUESTION;
1880 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1881 token->type = OP_OPEN_DUP_NUM;
1884 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1885 token->type = OP_CLOSE_DUP_NUM;
1893 token->type = CHARACTER;
1894 #ifdef RE_ENABLE_I18N
1895 if (input->mb_cur_max > 1)
1897 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1898 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1902 token->word_char = IS_WORD_CHAR (token->opr.c);
1907 if (syntax & RE_NEWLINE_ALT)
1908 token->type = OP_ALT;
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1912 token->type = OP_ALT;
1915 token->type = OP_DUP_ASTERISK;
1918 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1919 token->type = OP_DUP_PLUS;
1922 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1923 token->type = OP_DUP_QUESTION;
1926 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1927 token->type = OP_OPEN_DUP_NUM;
1930 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1931 token->type = OP_CLOSE_DUP_NUM;
1934 if (syntax & RE_NO_BK_PARENS)
1935 token->type = OP_OPEN_SUBEXP;
1938 if (syntax & RE_NO_BK_PARENS)
1939 token->type = OP_CLOSE_SUBEXP;
1942 token->type = OP_OPEN_BRACKET;
1945 token->type = OP_PERIOD;
1948 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1949 re_string_cur_idx (input) != 0)
1951 char prev = re_string_peek_byte (input, -1);
1952 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1955 token->type = ANCHOR;
1956 token->opr.ctx_type = LINE_FIRST;
1959 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1960 re_string_cur_idx (input) + 1 != re_string_length (input))
1963 re_string_skip_bytes (input, 1);
1964 peek_token (&next, input, syntax);
1965 re_string_skip_bytes (input, -1);
1966 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1969 token->type = ANCHOR;
1970 token->opr.ctx_type = LINE_LAST;
1978 /* Peek a token from INPUT, and return the length of the token.
1979 We must not use this function out of bracket expressions. */
1983 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1986 if (re_string_eoi (input))
1988 token->type = END_OF_RE;
1991 c = re_string_peek_byte (input, 0);
1994 #ifdef RE_ENABLE_I18N
1995 if (input->mb_cur_max > 1 &&
1996 !re_string_first_byte (input, re_string_cur_idx (input)))
1998 token->type = CHARACTER;
2001 #endif /* RE_ENABLE_I18N */
2003 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2004 && re_string_cur_idx (input) + 1 < re_string_length (input))
2006 /* In this case, '\' escape a character. */
2008 re_string_skip_bytes (input, 1);
2009 c2 = re_string_peek_byte (input, 0);
2011 token->type = CHARACTER;
2014 if (c == '[') /* '[' is a special char in a bracket exps. */
2018 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2019 c2 = re_string_peek_byte (input, 1);
2027 token->type = OP_OPEN_COLL_ELEM;
2030 token->type = OP_OPEN_EQUIV_CLASS;
2033 if (syntax & RE_CHAR_CLASSES)
2035 token->type = OP_OPEN_CHAR_CLASS;
2038 /* else fall through. */
2040 token->type = CHARACTER;
2050 token->type = OP_CHARSET_RANGE;
2053 token->type = OP_CLOSE_BRACKET;
2056 token->type = OP_NON_MATCH_LIST;
2059 token->type = CHARACTER;
2064 /* Functions for parser. */
2066 /* Entry point of the parser.
2067 Parse the regular expression REGEXP and return the structure tree.
2068 If an error is occured, ERR is set by error code, and return NULL.
2069 This function build the following tree, from regular expression <reg_exp>:
2075 CAT means concatenation.
2076 EOR means end of regular expression. */
2079 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2083 bin_tree_t *tree, *eor, *root;
2084 re_token_t current_token;
2085 dfa->syntax = syntax;
2086 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2087 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2088 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092 root = create_tree (dfa, tree, eor, CONCAT);
2095 if (BE (eor == NULL || root == NULL, 0))
2103 /* This function build the following tree, from regular expression
2104 <branch1>|<branch2>:
2110 ALT means alternative, which represents the operator `|'. */
2113 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2114 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2117 bin_tree_t *tree, *branch = NULL;
2118 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2119 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2122 while (token->type == OP_ALT)
2124 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125 if (token->type != OP_ALT && token->type != END_OF_RE
2126 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2129 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2134 tree = create_tree (dfa, tree, branch, OP_ALT);
2135 if (BE (tree == NULL, 0))
2144 /* This function build the following tree, from regular expression
2151 CAT means concatenation. */
2154 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2155 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157 bin_tree_t *tree, *expr;
2158 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2159 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2160 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2163 while (token->type != OP_ALT && token->type != END_OF_RE
2164 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2167 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2171 if (tree != NULL && expr != NULL)
2173 tree = create_tree (dfa, tree, expr, CONCAT);
2180 else if (tree == NULL)
2182 /* Otherwise expr == NULL, we don't need to create new tree. */
2187 /* This function build the following tree, from regular expression a*:
2194 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2195 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199 switch (token->type)
2202 tree = create_token_tree (dfa, NULL, NULL, token);
2203 if (BE (tree == NULL, 0))
2208 #ifdef RE_ENABLE_I18N
2209 if (dfa->mb_cur_max > 1)
2211 while (!re_string_eoi (regexp)
2212 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214 bin_tree_t *mbc_remain;
2215 fetch_token (token, regexp, syntax);
2216 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2217 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2218 if (BE (mbc_remain == NULL || tree == NULL, 0))
2227 case OP_OPEN_SUBEXP:
2228 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2229 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2232 case OP_OPEN_BRACKET:
2233 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2234 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2238 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2243 dfa->used_bkref_map |= 1 << token->opr.idx;
2244 tree = create_token_tree (dfa, NULL, NULL, token);
2245 if (BE (tree == NULL, 0))
2251 dfa->has_mb_node = 1;
2253 case OP_OPEN_DUP_NUM:
2254 if (syntax & RE_CONTEXT_INVALID_DUP)
2260 case OP_DUP_ASTERISK:
2262 case OP_DUP_QUESTION:
2263 if (syntax & RE_CONTEXT_INVALID_OPS)
2268 else if (syntax & RE_CONTEXT_INDEP_OPS)
2270 fetch_token (token, regexp, syntax);
2271 return parse_expression (regexp, preg, token, syntax, nest, err);
2273 /* else fall through */
2274 case OP_CLOSE_SUBEXP:
2275 if ((token->type == OP_CLOSE_SUBEXP) &&
2276 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2281 /* else fall through */
2282 case OP_CLOSE_DUP_NUM:
2283 /* We treat it as a normal character. */
2285 /* Then we can these characters as normal characters. */
2286 token->type = CHARACTER;
2287 /* mb_partial and word_char bits should be initialized already
2289 tree = create_token_tree (dfa, NULL, NULL, token);
2290 if (BE (tree == NULL, 0))
2297 if ((token->opr.ctx_type
2298 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2299 && dfa->word_ops_used == 0)
2300 init_word_char (dfa);
2301 if (token->opr.ctx_type == WORD_DELIM
2302 || token->opr.ctx_type == NOT_WORD_DELIM)
2304 bin_tree_t *tree_first, *tree_last;
2305 if (token->opr.ctx_type == WORD_DELIM)
2307 token->opr.ctx_type = WORD_FIRST;
2308 tree_first = create_token_tree (dfa, NULL, NULL, token);
2309 token->opr.ctx_type = WORD_LAST;
2313 token->opr.ctx_type = INSIDE_WORD;
2314 tree_first = create_token_tree (dfa, NULL, NULL, token);
2315 token->opr.ctx_type = INSIDE_NOTWORD;
2317 tree_last = create_token_tree (dfa, NULL, NULL, token);
2318 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2319 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2327 tree = create_token_tree (dfa, NULL, NULL, token);
2328 if (BE (tree == NULL, 0))
2334 /* We must return here, since ANCHORs can't be followed
2335 by repetition operators.
2336 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2337 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2338 fetch_token (token, regexp, syntax);
2341 tree = create_token_tree (dfa, NULL, NULL, token);
2342 if (BE (tree == NULL, 0))
2347 if (dfa->mb_cur_max > 1)
2348 dfa->has_mb_node = 1;
2352 tree = build_charclass_op (dfa, regexp->trans,
2353 (const unsigned char *) "alnum",
2354 (const unsigned char *) "_",
2355 token->type == OP_NOTWORD, err);
2356 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2361 tree = build_charclass_op (dfa, regexp->trans,
2362 (const unsigned char *) "space",
2363 (const unsigned char *) "",
2364 token->type == OP_NOTSPACE, err);
2365 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2375 /* Must not happen? */
2381 fetch_token (token, regexp, syntax);
2383 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2384 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2387 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 /* In BRE consecutive duplications are not allowed. */
2390 if ((syntax & RE_CONTEXT_INVALID_DUP)
2391 && (token->type == OP_DUP_ASTERISK
2392 || token->type == OP_OPEN_DUP_NUM))
2402 /* This function build the following tree, from regular expression
2410 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2411 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2416 cur_nsub = preg->re_nsub++;
2418 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2420 /* The subexpression may be a null string. */
2421 if (token->type == OP_CLOSE_SUBEXP)
2425 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2426 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428 if (BE (*err != REG_NOERROR, 0))
2432 if (cur_nsub <= '9' - '1')
2433 dfa->completed_bkref_map |= 1 << cur_nsub;
2435 tree = create_tree (dfa, tree, NULL, SUBEXP);
2436 if (BE (tree == NULL, 0))
2441 tree->token.opr.idx = cur_nsub;
2445 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2448 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2449 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 bin_tree_t *tree = NULL, *old_tree = NULL;
2452 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2453 re_token_t start_token = *token;
2455 if (token->type == OP_OPEN_DUP_NUM)
2458 start = fetch_number (regexp, token, syntax);
2459 if (start == REG_MISSING)
2461 if (token->type == CHARACTER && token->opr.c == ',')
2462 start = 0; /* We treat "{,m}" as "{0,m}". */
2465 *err = REG_BADBR; /* <re>{} is invalid. */
2469 if (BE (start != REG_ERROR, 1))
2471 /* We treat "{n}" as "{n,n}". */
2472 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2473 : ((token->type == CHARACTER && token->opr.c == ',')
2474 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478 /* Invalid sequence. */
2479 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481 if (token->type == END_OF_RE)
2489 /* If the syntax bit is set, rollback. */
2490 re_string_set_index (regexp, start_idx);
2491 *token = start_token;
2492 token->type = CHARACTER;
2493 /* mb_partial and word_char bits should be already initialized by
2498 if (BE (end != REG_MISSING && start > end, 0))
2500 /* First number greater than second. */
2507 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2508 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2511 fetch_token (token, regexp, syntax);
2513 if (BE (elem == NULL, 0))
2515 if (BE (start == 0 && end == 0, 0))
2517 postorder (elem, free_tree, NULL);
2521 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2522 if (BE (start > 0, 0))
2525 for (i = 2; i <= start; ++i)
2527 elem = duplicate_tree (elem, dfa);
2528 tree = create_tree (dfa, tree, elem, CONCAT);
2529 if (BE (elem == NULL || tree == NULL, 0))
2530 goto parse_dup_op_espace;
2536 /* Duplicate ELEM before it is marked optional. */
2537 elem = duplicate_tree (elem, dfa);
2543 if (elem->token.type == SUBEXP)
2544 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546 tree = create_tree (dfa, elem, NULL,
2547 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2548 if (BE (tree == NULL, 0))
2549 goto parse_dup_op_espace;
2551 /* This loop is actually executed only when end != REG_MISSING,
2552 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2553 already created the start+1-th copy. */
2554 if ((Idx) -1 < 0 || end != REG_MISSING)
2555 for (i = start + 2; i <= end; ++i)
2557 elem = duplicate_tree (elem, dfa);
2558 tree = create_tree (dfa, tree, elem, CONCAT);
2559 if (BE (elem == NULL || tree == NULL, 0))
2560 goto parse_dup_op_espace;
2562 tree = create_tree (dfa, tree, NULL, OP_ALT);
2563 if (BE (tree == NULL, 0))
2564 goto parse_dup_op_espace;
2568 tree = create_tree (dfa, old_tree, tree, CONCAT);
2572 parse_dup_op_espace:
2577 /* Size of the names for collating symbol/equivalence_class/character_class.
2578 I'm not sure, but maybe enough. */
2579 #define BRACKET_NAME_BUF_SIZE 32
2582 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2583 Build the range expression which starts from START_ELEM, and ends
2584 at END_ELEM. The result are written to MBCSET and SBCSET.
2585 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2586 mbcset->range_ends, is a pointer argument sinse we may
2589 static reg_errcode_t
2591 # ifdef RE_ENABLE_I18N
2592 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2593 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2594 # else /* not RE_ENABLE_I18N */
2595 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2596 bracket_elem_t *end_elem)
2597 # endif /* not RE_ENABLE_I18N */
2599 unsigned int start_ch, end_ch;
2600 /* Equivalence Classes and Character Classes can't be a range start/end. */
2601 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2602 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2606 /* We can handle no multi character collating elements without libc
2608 if (BE ((start_elem->type == COLL_SYM
2609 && strlen ((char *) start_elem->opr.name) > 1)
2610 || (end_elem->type == COLL_SYM
2611 && strlen ((char *) end_elem->opr.name) > 1), 0))
2612 return REG_ECOLLATE;
2614 # ifdef RE_ENABLE_I18N
2619 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2621 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2622 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2625 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2628 ? __btowc (start_ch) : start_elem->opr.wch);
2629 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2630 ? __btowc (end_ch) : end_elem->opr.wch);
2631 if (start_wc == WEOF || end_wc == WEOF)
2632 return REG_ECOLLATE;
2633 cmp_buf[0] = start_wc;
2634 cmp_buf[4] = end_wc;
2635 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2638 /* Got valid collation sequence values, add them as a new entry.
2639 However, for !_LIBC we have no collation elements: if the
2640 character set is single byte, the single byte character set
2641 that we build below suffices. parse_bracket_exp passes
2642 no MBCSET if dfa->mb_cur_max == 1. */
2645 /* Check the space of the arrays. */
2646 if (BE (*range_alloc == mbcset->nranges, 0))
2648 /* There is not enough space, need realloc. */
2649 wchar_t *new_array_start, *new_array_end;
2652 /* +1 in case of mbcset->nranges is 0. */
2653 new_nranges = 2 * mbcset->nranges + 1;
2654 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2655 are NULL if *range_alloc == 0. */
2656 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2661 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2664 mbcset->range_starts = new_array_start;
2665 mbcset->range_ends = new_array_end;
2666 *range_alloc = new_nranges;
2669 mbcset->range_starts[mbcset->nranges] = start_wc;
2670 mbcset->range_ends[mbcset->nranges++] = end_wc;
2673 /* Build the table for single byte characters. */
2674 for (wc = 0; wc < SBC_MAX; ++wc)
2677 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2678 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2679 bitset_set (sbcset, wc);
2682 # else /* not RE_ENABLE_I18N */
2685 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2686 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2689 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691 if (start_ch > end_ch)
2693 /* Build the table for single byte characters. */
2694 for (ch = 0; ch < SBC_MAX; ++ch)
2695 if (start_ch <= ch && ch <= end_ch)
2696 bitset_set (sbcset, ch);
2698 # endif /* not RE_ENABLE_I18N */
2701 #endif /* not _LIBC */
2704 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2705 Build the collating element which is represented by NAME.
2706 The result are written to MBCSET and SBCSET.
2707 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2708 pointer argument since we may update it. */
2710 static reg_errcode_t
2712 build_collating_symbol (bitset_t sbcset,
2713 # ifdef RE_ENABLE_I18N
2714 re_charset_t *mbcset, Idx *coll_sym_alloc,
2716 const unsigned char *name)
2718 size_t name_len = strlen ((const char *) name);
2719 if (BE (name_len != 1, 0))
2720 return REG_ECOLLATE;
2723 bitset_set (sbcset, name[0]);
2727 #endif /* not _LIBC */
2729 /* This function parse bracket expression like "[abc]", "[a-c]",
2733 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2734 reg_syntax_t syntax, reg_errcode_t *err)
2737 const unsigned char *collseqmb;
2738 const char *collseqwc;
2741 const int32_t *symb_table;
2742 const unsigned char *extra;
2744 /* Local function for parse_bracket_exp used in _LIBC environement.
2745 Seek the collating symbol entry correspondings to NAME.
2746 Return the index of the symbol in the SYMB_TABLE. */
2749 __attribute ((always_inline))
2750 seek_collating_symbol_entry (name, name_len)
2751 const unsigned char *name;
2754 int32_t hash = elem_hash ((const char *) name, name_len);
2755 int32_t elem = hash % table_size;
2756 if (symb_table[2 * elem] != 0)
2758 int32_t second = hash % (table_size - 2) + 1;
2762 /* First compare the hashing value. */
2763 if (symb_table[2 * elem] == hash
2764 /* Compare the length of the name. */
2765 && name_len == extra[symb_table[2 * elem + 1]]
2766 /* Compare the name. */
2767 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2770 /* Yep, this is the entry. */
2777 while (symb_table[2 * elem] != 0);
2782 /* Local function for parse_bracket_exp used in _LIBC environement.
2783 Look up the collation sequence value of BR_ELEM.
2784 Return the value if succeeded, UINT_MAX otherwise. */
2786 auto inline unsigned int
2787 __attribute ((always_inline))
2788 lookup_collation_sequence_value (br_elem)
2789 bracket_elem_t *br_elem;
2791 if (br_elem->type == SB_CHAR)
2794 if (MB_CUR_MAX == 1)
2797 return collseqmb[br_elem->opr.ch];
2800 wint_t wc = __btowc (br_elem->opr.ch);
2801 return __collseq_table_lookup (collseqwc, wc);
2804 else if (br_elem->type == MB_CHAR)
2806 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808 else if (br_elem->type == COLL_SYM)
2810 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2814 elem = seek_collating_symbol_entry (br_elem->opr.name,
2816 if (symb_table[2 * elem] != 0)
2818 /* We found the entry. */
2819 idx = symb_table[2 * elem + 1];
2820 /* Skip the name of collating element name. */
2821 idx += 1 + extra[idx];
2822 /* Skip the byte sequence of the collating element. */
2823 idx += 1 + extra[idx];
2824 /* Adjust for the alignment. */
2825 idx = (idx + 3) & ~3;
2826 /* Skip the multibyte collation sequence value. */
2827 idx += sizeof (unsigned int);
2828 /* Skip the wide char sequence of the collating element. */
2829 idx += sizeof (unsigned int) *
2830 (1 + *(unsigned int *) (extra + idx));
2831 /* Return the collation sequence value. */
2832 return *(unsigned int *) (extra + idx);
2834 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836 /* No valid character. Match it as a single byte
2838 return collseqmb[br_elem->opr.name[0]];
2841 else if (sym_name_len == 1)
2842 return collseqmb[br_elem->opr.name[0]];
2847 /* Local function for parse_bracket_exp used in _LIBC environement.
2848 Build the range expression which starts from START_ELEM, and ends
2849 at END_ELEM. The result are written to MBCSET and SBCSET.
2850 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2851 mbcset->range_ends, is a pointer argument sinse we may
2854 auto inline reg_errcode_t
2855 __attribute ((always_inline))
2856 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2857 re_charset_t *mbcset;
2860 bracket_elem_t *start_elem, *end_elem;
2863 uint32_t start_collseq;
2864 uint32_t end_collseq;
2866 /* Equivalence Classes and Character Classes can't be a range
2868 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2869 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2873 start_collseq = lookup_collation_sequence_value (start_elem);
2874 end_collseq = lookup_collation_sequence_value (end_elem);
2875 /* Check start/end collation sequence values. */
2876 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2877 return REG_ECOLLATE;
2878 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2881 /* Got valid collation sequence values, add them as a new entry.
2882 However, if we have no collation elements, and the character set
2883 is single byte, the single byte character set that we
2884 build below suffices. */
2885 if (nrules > 0 || dfa->mb_cur_max > 1)
2887 /* Check the space of the arrays. */
2888 if (BE (*range_alloc == mbcset->nranges, 0))
2890 /* There is not enough space, need realloc. */
2891 uint32_t *new_array_start;
2892 uint32_t *new_array_end;
2895 /* +1 in case of mbcset->nranges is 0. */
2896 new_nranges = 2 * mbcset->nranges + 1;
2897 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2902 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2905 mbcset->range_starts = new_array_start;
2906 mbcset->range_ends = new_array_end;
2907 *range_alloc = new_nranges;
2910 mbcset->range_starts[mbcset->nranges] = start_collseq;
2911 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2914 /* Build the table for single byte characters. */
2915 for (ch = 0; ch < SBC_MAX; ch++)
2917 uint32_t ch_collseq;
2919 if (MB_CUR_MAX == 1)
2922 ch_collseq = collseqmb[ch];
2924 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2925 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2926 bitset_set (sbcset, ch);
2931 /* Local function for parse_bracket_exp used in _LIBC environement.
2932 Build the collating element which is represented by NAME.
2933 The result are written to MBCSET and SBCSET.
2934 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2935 pointer argument sinse we may update it. */
2937 auto inline reg_errcode_t
2938 __attribute ((always_inline))
2939 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2940 re_charset_t *mbcset;
2941 Idx *coll_sym_alloc;
2943 const unsigned char *name;
2946 size_t name_len = strlen ((const char *) name);
2949 elem = seek_collating_symbol_entry (name, name_len);
2950 if (symb_table[2 * elem] != 0)
2952 /* We found the entry. */
2953 idx = symb_table[2 * elem + 1];
2954 /* Skip the name of collating element name. */
2955 idx += 1 + extra[idx];
2957 else if (symb_table[2 * elem] == 0 && name_len == 1)
2959 /* No valid character, treat it as a normal
2961 bitset_set (sbcset, name[0]);
2965 return REG_ECOLLATE;
2967 /* Got valid collation sequence, add it as a new entry. */
2968 /* Check the space of the arrays. */
2969 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971 /* Not enough, realloc it. */
2972 /* +1 in case of mbcset->ncoll_syms is 0. */
2973 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2974 /* Use realloc since mbcset->coll_syms is NULL
2976 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2977 new_coll_sym_alloc);
2978 if (BE (new_coll_syms == NULL, 0))
2980 mbcset->coll_syms = new_coll_syms;
2981 *coll_sym_alloc = new_coll_sym_alloc;
2983 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2988 if (BE (name_len != 1, 0))
2989 return REG_ECOLLATE;
2992 bitset_set (sbcset, name[0]);
2999 re_token_t br_token;
3000 re_bitset_ptr_t sbcset;
3001 #ifdef RE_ENABLE_I18N
3002 re_charset_t *mbcset;
3003 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3004 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3005 #endif /* not RE_ENABLE_I18N */
3006 bool non_match = false;
3007 bin_tree_t *work_tree;
3009 bool first_round = true;
3011 collseqmb = (const unsigned char *)
3012 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3013 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3019 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3020 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3021 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3022 _NL_COLLATE_SYMB_TABLEMB);
3023 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3024 _NL_COLLATE_SYMB_EXTRAMB);
3027 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3028 #ifdef RE_ENABLE_I18N
3029 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3030 #endif /* RE_ENABLE_I18N */
3031 #ifdef RE_ENABLE_I18N
3032 if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 if (BE (sbcset == NULL, 0))
3035 #endif /* RE_ENABLE_I18N */
3041 token_len = peek_token_bracket (token, regexp, syntax);
3042 if (BE (token->type == END_OF_RE, 0))
3045 goto parse_bracket_exp_free_return;
3047 if (token->type == OP_NON_MATCH_LIST)
3049 #ifdef RE_ENABLE_I18N
3050 mbcset->non_match = 1;
3051 #endif /* not RE_ENABLE_I18N */
3053 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3054 bitset_set (sbcset, '\n');
3055 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3056 token_len = peek_token_bracket (token, regexp, syntax);
3057 if (BE (token->type == END_OF_RE, 0))
3060 goto parse_bracket_exp_free_return;
3064 /* We treat the first ']' as a normal character. */
3065 if (token->type == OP_CLOSE_BRACKET)
3066 token->type = CHARACTER;
3070 bracket_elem_t start_elem, end_elem;
3071 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3072 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3075 bool is_range_exp = false;
3078 start_elem.opr.name = start_name_buf;
3079 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3080 syntax, first_round);
3081 if (BE (ret != REG_NOERROR, 0))
3084 goto parse_bracket_exp_free_return;
3086 first_round = false;
3088 /* Get information about the next token. We need it in any case. */
3089 token_len = peek_token_bracket (token, regexp, syntax);
3091 /* Do not check for ranges if we know they are not allowed. */
3092 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094 if (BE (token->type == END_OF_RE, 0))
3097 goto parse_bracket_exp_free_return;
3099 if (token->type == OP_CHARSET_RANGE)
3101 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3102 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3103 if (BE (token2.type == END_OF_RE, 0))
3106 goto parse_bracket_exp_free_return;
3108 if (token2.type == OP_CLOSE_BRACKET)
3110 /* We treat the last '-' as a normal character. */
3111 re_string_skip_bytes (regexp, -token_len);
3112 token->type = CHARACTER;
3115 is_range_exp = true;
3119 if (is_range_exp == true)
3121 end_elem.opr.name = end_name_buf;
3122 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124 if (BE (ret != REG_NOERROR, 0))
3127 goto parse_bracket_exp_free_return;
3130 token_len = peek_token_bracket (token, regexp, syntax);
3133 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3134 &start_elem, &end_elem);
3136 # ifdef RE_ENABLE_I18N
3137 *err = build_range_exp (sbcset,
3138 dfa->mb_cur_max > 1 ? mbcset : NULL,
3139 &range_alloc, &start_elem, &end_elem);
3141 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143 #endif /* RE_ENABLE_I18N */
3144 if (BE (*err != REG_NOERROR, 0))
3145 goto parse_bracket_exp_free_return;
3149 switch (start_elem.type)
3152 bitset_set (sbcset, start_elem.opr.ch);
3154 #ifdef RE_ENABLE_I18N
3156 /* Check whether the array has enough space. */
3157 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159 wchar_t *new_mbchars;
3160 /* Not enough, realloc it. */
3161 /* +1 in case of mbcset->nmbchars is 0. */
3162 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3163 /* Use realloc since array is NULL if *alloc == 0. */
3164 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166 if (BE (new_mbchars == NULL, 0))
3167 goto parse_bracket_exp_espace;
3168 mbcset->mbchars = new_mbchars;
3170 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172 #endif /* RE_ENABLE_I18N */
3174 *err = build_equiv_class (sbcset,
3175 #ifdef RE_ENABLE_I18N
3176 mbcset, &equiv_class_alloc,
3177 #endif /* RE_ENABLE_I18N */
3178 start_elem.opr.name);
3179 if (BE (*err != REG_NOERROR, 0))
3180 goto parse_bracket_exp_free_return;
3183 *err = build_collating_symbol (sbcset,
3184 #ifdef RE_ENABLE_I18N
3185 mbcset, &coll_sym_alloc,
3186 #endif /* RE_ENABLE_I18N */
3187 start_elem.opr.name);
3188 if (BE (*err != REG_NOERROR, 0))
3189 goto parse_bracket_exp_free_return;
3192 *err = build_charclass (regexp->trans, sbcset,
3193 #ifdef RE_ENABLE_I18N
3194 mbcset, &char_class_alloc,
3195 #endif /* RE_ENABLE_I18N */
3196 start_elem.opr.name, syntax);
3197 if (BE (*err != REG_NOERROR, 0))
3198 goto parse_bracket_exp_free_return;
3205 if (BE (token->type == END_OF_RE, 0))
3208 goto parse_bracket_exp_free_return;
3210 if (token->type == OP_CLOSE_BRACKET)
3214 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3216 /* If it is non-matching list. */
3218 bitset_not (sbcset);
3220 #ifdef RE_ENABLE_I18N
3221 /* Ensure only single byte characters are set. */
3222 if (dfa->mb_cur_max > 1)
3223 bitset_mask (sbcset, dfa->sb_char);
3225 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3226 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3227 || mbcset->non_match)))
3229 bin_tree_t *mbc_tree;
3231 /* Build a tree for complex bracket. */
3232 dfa->has_mb_node = 1;
3233 br_token.type = COMPLEX_BRACKET;
3234 br_token.opr.mbcset = mbcset;
3235 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236 if (BE (mbc_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3238 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3239 if (sbcset[sbc_idx])
3241 /* If there are no bits set in sbcset, there is no point
3242 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3243 if (sbc_idx < BITSET_WORDS)
3245 /* Build a tree for simple bracket. */
3246 br_token.type = SIMPLE_BRACKET;
3247 br_token.opr.sbcset = sbcset;
3248 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3249 if (BE (work_tree == NULL, 0))
3250 goto parse_bracket_exp_espace;
3252 /* Then join them by ALT node. */
3253 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3254 if (BE (work_tree == NULL, 0))
3255 goto parse_bracket_exp_espace;
3260 work_tree = mbc_tree;
3264 #endif /* not RE_ENABLE_I18N */
3266 #ifdef RE_ENABLE_I18N
3267 free_charset (mbcset);
3269 /* Build a tree for simple bracket. */
3270 br_token.type = SIMPLE_BRACKET;
3271 br_token.opr.sbcset = sbcset;
3272 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3273 if (BE (work_tree == NULL, 0))
3274 goto parse_bracket_exp_espace;
3278 parse_bracket_exp_espace:
3280 parse_bracket_exp_free_return:
3282 #ifdef RE_ENABLE_I18N
3283 free_charset (mbcset);
3284 #endif /* RE_ENABLE_I18N */
3288 /* Parse an element in the bracket expression. */
3290 static reg_errcode_t
3291 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3292 re_token_t *token, int token_len, re_dfa_t *dfa,
3293 reg_syntax_t syntax, bool accept_hyphen)
3295 #ifdef RE_ENABLE_I18N
3297 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3298 if (cur_char_size > 1)
3300 elem->type = MB_CHAR;
3301 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3302 re_string_skip_bytes (regexp, cur_char_size);
3305 #endif /* RE_ENABLE_I18N */
3306 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3307 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3308 || token->type == OP_OPEN_EQUIV_CLASS)
3309 return parse_bracket_symbol (elem, regexp, token);
3310 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312 /* A '-' must only appear as anything but a range indicator before
3313 the closing bracket. Everything else is an error. */
3315 (void) peek_token_bracket (&token2, regexp, syntax);
3316 if (token2.type != OP_CLOSE_BRACKET)
3317 /* The actual error value is not standardized since this whole
3318 case is undefined. But ERANGE makes good sense. */
3321 elem->type = SB_CHAR;
3322 elem->opr.ch = token->opr.c;
3326 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3327 such as [:<character_class>:], [.<collating_element>.], and
3328 [=<equivalent_class>=]. */
3330 static reg_errcode_t
3331 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3334 unsigned char ch, delim = token->opr.c;
3336 if (re_string_eoi(regexp))
3340 if (i >= BRACKET_NAME_BUF_SIZE)
3342 if (token->type == OP_OPEN_CHAR_CLASS)
3343 ch = re_string_fetch_byte_case (regexp);
3345 ch = re_string_fetch_byte (regexp);
3346 if (re_string_eoi(regexp))
3348 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350 elem->opr.name[i] = ch;
3352 re_string_skip_bytes (regexp, 1);
3353 elem->opr.name[i] = '\0';
3354 switch (token->type)
3356 case OP_OPEN_COLL_ELEM:
3357 elem->type = COLL_SYM;
3359 case OP_OPEN_EQUIV_CLASS:
3360 elem->type = EQUIV_CLASS;
3362 case OP_OPEN_CHAR_CLASS:
3363 elem->type = CHAR_CLASS;
3371 /* Helper function for parse_bracket_exp.
3372 Build the equivalence class which is represented by NAME.
3373 The result are written to MBCSET and SBCSET.
3374 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3375 is a pointer argument sinse we may update it. */
3377 static reg_errcode_t
3378 #ifdef RE_ENABLE_I18N
3379 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3380 Idx *equiv_class_alloc, const unsigned char *name)
3381 #else /* not RE_ENABLE_I18N */
3382 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3383 #endif /* not RE_ENABLE_I18N */
3386 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3389 const int32_t *table, *indirect;
3390 const unsigned char *weights, *extra, *cp;
3391 unsigned char char_buf[2];
3395 /* This #include defines a local function! */
3396 # include <locale/weight.h>
3397 /* Calculate the index for equivalence class. */
3399 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3400 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3401 _NL_COLLATE_WEIGHTMB);
3402 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3403 _NL_COLLATE_EXTRAMB);
3404 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3405 _NL_COLLATE_INDIRECTMB);
3406 idx1 = findidx (&cp);
3407 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3408 /* This isn't a valid character. */
3409 return REG_ECOLLATE;
3411 /* Build single byte matcing table for this equivalence class. */
3412 char_buf[1] = (unsigned char) '\0';
3413 len = weights[idx1];
3414 for (ch = 0; ch < SBC_MAX; ++ch)
3418 idx2 = findidx (&cp);
3423 /* This isn't a valid character. */
3425 if (len == weights[idx2])
3428 while (cnt <= len &&
3429 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3433 bitset_set (sbcset, ch);
3436 /* Check whether the array has enough space. */
3437 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439 /* Not enough, realloc it. */
3440 /* +1 in case of mbcset->nequiv_classes is 0. */
3441 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3442 /* Use realloc since the array is NULL if *alloc == 0. */
3443 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445 new_equiv_class_alloc);
3446 if (BE (new_equiv_classes == NULL, 0))
3448 mbcset->equiv_classes = new_equiv_classes;
3449 *equiv_class_alloc = new_equiv_class_alloc;
3451 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3456 if (BE (strlen ((const char *) name) != 1, 0))
3457 return REG_ECOLLATE;
3458 bitset_set (sbcset, *name);
3463 /* Helper function for parse_bracket_exp.
3464 Build the character class which is represented by NAME.
3465 The result are written to MBCSET and SBCSET.
3466 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3467 is a pointer argument sinse we may update it. */
3469 static reg_errcode_t
3470 #ifdef RE_ENABLE_I18N
3471 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3472 re_charset_t *mbcset, Idx *char_class_alloc,
3473 const unsigned char *class_name, reg_syntax_t syntax)
3474 #else /* not RE_ENABLE_I18N */
3475 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3476 const unsigned char *class_name, reg_syntax_t syntax)
3477 #endif /* not RE_ENABLE_I18N */
3480 const char *name = (const char *) class_name;
3482 /* In case of REG_ICASE "upper" and "lower" match the both of
3483 upper and lower cases. */
3484 if ((syntax & RE_ICASE)
3485 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3488 #ifdef RE_ENABLE_I18N
3489 /* Check the space of the arrays. */
3490 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492 /* Not enough, realloc it. */
3493 /* +1 in case of mbcset->nchar_classes is 0. */
3494 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3495 /* Use realloc since array is NULL if *alloc == 0. */
3496 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3497 new_char_class_alloc);
3498 if (BE (new_char_classes == NULL, 0))
3500 mbcset->char_classes = new_char_classes;
3501 *char_class_alloc = new_char_class_alloc;
3503 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3504 #endif /* RE_ENABLE_I18N */
3506 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3508 if (BE (trans != NULL, 0)) \
3510 for (i = 0; i < SBC_MAX; ++i) \
3511 if (ctype_func (i)) \
3512 bitset_set (sbcset, trans[i]); \
3516 for (i = 0; i < SBC_MAX; ++i) \
3517 if (ctype_func (i)) \
3518 bitset_set (sbcset, i); \
3522 if (strcmp (name, "alnum") == 0)
3523 BUILD_CHARCLASS_LOOP (isalnum);
3524 else if (strcmp (name, "cntrl") == 0)
3525 BUILD_CHARCLASS_LOOP (iscntrl);
3526 else if (strcmp (name, "lower") == 0)
3527 BUILD_CHARCLASS_LOOP (islower);
3528 else if (strcmp (name, "space") == 0)
3529 BUILD_CHARCLASS_LOOP (isspace);
3530 else if (strcmp (name, "alpha") == 0)
3531 BUILD_CHARCLASS_LOOP (isalpha);
3532 else if (strcmp (name, "digit") == 0)
3533 BUILD_CHARCLASS_LOOP (isdigit);
3534 else if (strcmp (name, "print") == 0)
3535 BUILD_CHARCLASS_LOOP (isprint);
3536 else if (strcmp (name, "upper") == 0)
3537 BUILD_CHARCLASS_LOOP (isupper);
3538 else if (strcmp (name, "blank") == 0)
3539 BUILD_CHARCLASS_LOOP (isblank);
3540 else if (strcmp (name, "graph") == 0)
3541 BUILD_CHARCLASS_LOOP (isgraph);
3542 else if (strcmp (name, "punct") == 0)
3543 BUILD_CHARCLASS_LOOP (ispunct);
3544 else if (strcmp (name, "xdigit") == 0)
3545 BUILD_CHARCLASS_LOOP (isxdigit);
3553 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3554 const unsigned char *class_name,
3555 const unsigned char *extra, bool non_match,
3558 re_bitset_ptr_t sbcset;
3559 #ifdef RE_ENABLE_I18N
3560 re_charset_t *mbcset;
3562 #endif /* not RE_ENABLE_I18N */
3564 re_token_t br_token;
3567 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3568 #ifdef RE_ENABLE_I18N
3569 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3570 #endif /* RE_ENABLE_I18N */
3572 #ifdef RE_ENABLE_I18N
3573 if (BE (sbcset == NULL || mbcset == NULL, 0))
3574 #else /* not RE_ENABLE_I18N */
3575 if (BE (sbcset == NULL, 0))
3576 #endif /* not RE_ENABLE_I18N */
3584 #ifdef RE_ENABLE_I18N
3585 mbcset->non_match = 1;
3586 #endif /* not RE_ENABLE_I18N */
3589 /* We don't care the syntax in this case. */
3590 ret = build_charclass (trans, sbcset,
3591 #ifdef RE_ENABLE_I18N
3593 #endif /* RE_ENABLE_I18N */
3596 if (BE (ret != REG_NOERROR, 0))
3599 #ifdef RE_ENABLE_I18N
3600 free_charset (mbcset);
3601 #endif /* RE_ENABLE_I18N */
3605 /* \w match '_' also. */
3606 for (; *extra; extra++)
3607 bitset_set (sbcset, *extra);
3609 /* If it is non-matching list. */
3611 bitset_not (sbcset);
3613 #ifdef RE_ENABLE_I18N
3614 /* Ensure only single byte characters are set. */
3615 if (dfa->mb_cur_max > 1)
3616 bitset_mask (sbcset, dfa->sb_char);
3619 /* Build a tree for simple bracket. */
3620 br_token.type = SIMPLE_BRACKET;
3621 br_token.opr.sbcset = sbcset;
3622 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3623 if (BE (tree == NULL, 0))
3624 goto build_word_op_espace;
3626 #ifdef RE_ENABLE_I18N
3627 if (dfa->mb_cur_max > 1)
3629 bin_tree_t *mbc_tree;
3630 /* Build a tree for complex bracket. */
3631 br_token.type = COMPLEX_BRACKET;
3632 br_token.opr.mbcset = mbcset;
3633 dfa->has_mb_node = 1;
3634 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3635 if (BE (mbc_tree == NULL, 0))
3636 goto build_word_op_espace;
3637 /* Then join them by ALT node. */
3638 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3639 if (BE (mbc_tree != NULL, 1))
3644 free_charset (mbcset);
3647 #else /* not RE_ENABLE_I18N */
3649 #endif /* not RE_ENABLE_I18N */
3651 build_word_op_espace:
3653 #ifdef RE_ENABLE_I18N
3654 free_charset (mbcset);
3655 #endif /* RE_ENABLE_I18N */
3660 /* This is intended for the expressions like "a{1,3}".
3661 Fetch a number from `input', and return the number.
3662 Return REG_MISSING if the number field is empty like "{,1}".
3663 Return REG_ERROR if an error occurred. */
3666 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668 Idx num = REG_MISSING;
3672 fetch_token (token, input, syntax);
3674 if (BE (token->type == END_OF_RE, 0))
3676 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678 num = ((token->type != CHARACTER || c < '0' || '9' < c
3679 || num == REG_ERROR)
3681 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3682 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3687 #ifdef RE_ENABLE_I18N
3689 free_charset (re_charset_t *cset)
3691 re_free (cset->mbchars);
3693 re_free (cset->coll_syms);
3694 re_free (cset->equiv_classes);
3695 re_free (cset->range_starts);
3696 re_free (cset->range_ends);
3698 re_free (cset->char_classes);
3701 #endif /* RE_ENABLE_I18N */
3703 /* Functions for binary tree operation. */
3705 /* Create a tree node. */
3708 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3709 re_token_type_t type)
3713 return create_token_tree (dfa, left, right, &t);
3717 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3718 const re_token_t *token)
3721 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725 if (storage == NULL)
3727 storage->next = dfa->str_tree_storage;
3728 dfa->str_tree_storage = storage;
3729 dfa->str_tree_storage_idx = 0;
3731 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733 tree->parent = NULL;
3735 tree->right = right;
3736 tree->token = *token;
3737 tree->token.duplicated = 0;
3738 tree->token.opt_subexp = 0;
3741 tree->node_idx = REG_MISSING;
3744 left->parent = tree;
3746 right->parent = tree;
3750 /* Mark the tree SRC as an optional subexpression.
3751 To be called from preorder or postorder. */
3753 static reg_errcode_t
3754 mark_opt_subexp (void *extra, bin_tree_t *node)
3756 Idx idx = (Idx) (long) extra;
3757 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3758 node->token.opt_subexp = 1;
3763 /* Free the allocated memory inside NODE. */
3766 free_token (re_token_t *node)
3768 #ifdef RE_ENABLE_I18N
3769 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3770 free_charset (node->opr.mbcset);
3772 #endif /* RE_ENABLE_I18N */
3773 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3774 re_free (node->opr.sbcset);
3777 /* Worker function for tree walking. Free the allocated memory inside NODE
3778 and its children. */
3780 static reg_errcode_t
3781 free_tree (void *extra, bin_tree_t *node)
3783 free_token (&node->token);
3788 /* Duplicate the node SRC, and return new node. This is a preorder
3789 visit similar to the one implemented by the generic visitor, but
3790 we need more infrastructure to maintain two parallel trees --- so,
3791 it's easier to duplicate. */
3794 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796 const bin_tree_t *node;
3797 bin_tree_t *dup_root;
3798 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3800 for (node = root; ; )
3802 /* Create a new tree and link it back to the current parent. */
3803 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3806 (*p_new)->parent = dup_node;
3807 (*p_new)->token.duplicated = 1;
3810 /* Go to the left node, or up and to the right. */
3814 p_new = &dup_node->left;
3818 const bin_tree_t *prev = NULL;
3819 while (node->right == prev || node->right == NULL)
3822 node = node->parent;
3823 dup_node = dup_node->parent;
3828 p_new = &dup_node->right;