1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 /* See if we have to try all bytes which start multiple collation
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
376 re_set_fastmap (fastmap, icase, i);
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
387 || cset->nequiv_classes
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
528 weak_alias (__regcomp, regcomp)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
536 regerror (errcode, preg, errbuf, errbuf_size)
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
541 #else /* size_t might promote */
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
561 msg_size = strlen (msg) + 1; /* Includes the null. */
563 if (BE (errbuf_size != 0, 1))
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
571 memcpy (errbuf, msg, cpy_size);
577 weak_alias (__regerror, regerror)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
588 /* Set the first 128 bits. */
589 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
590 # error "bitset_word_t is narrower than 32 bits"
591 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
592 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
593 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599 >> (SBC_MAX % BITSET_WORD_BITS == 0
601 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
607 free_dfa_content (re_dfa_t *dfa)
612 for (i = 0; i < dfa->nodes_len; ++i)
613 free_token (dfa->nodes + i);
614 re_free (dfa->nexts);
615 for (i = 0; i < dfa->nodes_len; ++i)
617 if (dfa->eclosures != NULL)
618 re_node_set_free (dfa->eclosures + i);
619 if (dfa->inveclosures != NULL)
620 re_node_set_free (dfa->inveclosures + i);
621 if (dfa->edests != NULL)
622 re_node_set_free (dfa->edests + i);
624 re_free (dfa->edests);
625 re_free (dfa->eclosures);
626 re_free (dfa->inveclosures);
627 re_free (dfa->nodes);
629 if (dfa->state_table)
630 for (i = 0; i <= dfa->state_hash_mask; ++i)
632 struct re_state_table_entry *entry = dfa->state_table + i;
633 for (j = 0; j < entry->num; ++j)
635 re_dfastate_t *state = entry->array[j];
638 re_free (entry->array);
640 re_free (dfa->state_table);
641 #ifdef RE_ENABLE_I18N
642 if (dfa->sb_char != utf8_sb_map)
643 re_free (dfa->sb_char);
645 re_free (dfa->subexp_map);
647 re_free (dfa->re_str);
654 /* Free dynamically allocated space used by PREG. */
660 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
661 if (BE (dfa != NULL, 1))
662 free_dfa_content (dfa);
666 re_free (preg->fastmap);
667 preg->fastmap = NULL;
669 re_free (preg->translate);
670 preg->translate = NULL;
673 weak_alias (__regfree, regfree)
676 /* Entry points compatible with 4.2 BSD regex library. We don't define
677 them unless specifically requested. */
679 #if defined _REGEX_RE_COMP || defined _LIBC
681 /* BSD has one and only one pattern buffer. */
682 static struct re_pattern_buffer re_comp_buf;
686 /* Make these definitions weak in libc, so POSIX programs can redefine
687 these names if they don't use our functions, and still use
688 regcomp/regexec above without link errors. */
699 if (!re_comp_buf.buffer)
700 return gettext ("No previous regular expression");
704 if (re_comp_buf.buffer)
706 fastmap = re_comp_buf.fastmap;
707 re_comp_buf.fastmap = NULL;
708 __regfree (&re_comp_buf);
709 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
710 re_comp_buf.fastmap = fastmap;
713 if (re_comp_buf.fastmap == NULL)
715 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
716 if (re_comp_buf.fastmap == NULL)
717 return (char *) gettext (__re_error_msgid
718 + __re_error_msgid_idx[(int) REG_ESPACE]);
721 /* Since `re_exec' always passes NULL for the `regs' argument, we
722 don't need to initialize the pattern buffer fields which affect it. */
724 /* Match anchors at newlines. */
725 re_comp_buf.newline_anchor = 1;
727 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
732 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
733 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 libc_freeres_fn (free_mem)
739 __regfree (&re_comp_buf);
743 #endif /* _REGEX_RE_COMP */
745 /* Internal entry point.
746 Compile the regular expression PATTERN, whose length is LENGTH.
747 SYNTAX indicate regular expression's syntax. */
750 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
753 reg_errcode_t err = REG_NOERROR;
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
780 preg->used = sizeof (re_dfa_t);
782 err = init_dfa (dfa, length);
783 if (BE (err != REG_NOERROR, 0))
785 free_dfa_content (dfa);
791 /* Note: length+1 will not overflow since it is checked in init_dfa. */
792 dfa->re_str = re_malloc (char, length + 1);
793 strncpy (dfa->re_str, pattern, length + 1);
796 __libc_lock_init (dfa->lock);
798 err = re_string_construct (®exp, pattern, length, preg->translate,
799 (syntax & RE_ICASE) != 0, dfa);
800 if (BE (err != REG_NOERROR, 0))
802 re_compile_internal_free_return:
803 free_workarea_compile (preg);
804 re_string_destruct (®exp);
805 free_dfa_content (dfa);
811 /* Parse the regular expression, and build a structure tree. */
813 dfa->str_tree = parse (®exp, preg, syntax, &err);
814 if (BE (dfa->str_tree == NULL, 0))
815 goto re_compile_internal_free_return;
817 /* Analyze the tree and create the nfa. */
818 err = analyze (preg);
819 if (BE (err != REG_NOERROR, 0))
820 goto re_compile_internal_free_return;
822 #ifdef RE_ENABLE_I18N
823 /* If possible, do searching in single byte encoding to speed things up. */
824 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828 /* Then create the initial state of the dfa. */
829 err = create_initial_state (dfa);
831 /* Release work areas. */
832 free_workarea_compile (preg);
833 re_string_destruct (®exp);
835 if (BE (err != REG_NOERROR, 0))
837 free_dfa_content (dfa);
845 /* Initialize DFA. We use the length of the regular expression PAT_LEN
846 as the initial length of some arrays. */
849 init_dfa (re_dfa_t *dfa, size_t pat_len)
851 __re_size_t table_size;
855 #ifdef RE_ENABLE_I18N
856 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
858 size_t max_i18n_object_size = 0;
860 size_t max_object_size =
861 MAX (sizeof (struct re_state_table_entry),
862 MAX (sizeof (re_token_t),
863 MAX (sizeof (re_node_set),
864 MAX (sizeof (regmatch_t),
865 max_i18n_object_size))));
867 memset (dfa, '\0', sizeof (re_dfa_t));
869 /* Force allocation of str_tree_storage the first time. */
870 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
872 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
873 calculation below, and for similar doubling calculations
874 elsewhere. And it's <= rather than <, because some of the
875 doubling calculations add 1 afterwards. */
876 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
879 dfa->nodes_alloc = pat_len + 1;
880 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
882 /* table_size = 2 ^ ceil(log pat_len) */
883 for (table_size = 1; ; table_size <<= 1)
884 if (table_size > pat_len)
887 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
888 dfa->state_hash_mask = table_size - 1;
890 dfa->mb_cur_max = MB_CUR_MAX;
892 if (dfa->mb_cur_max == 6
893 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
895 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
898 codeset_name = nl_langinfo (CODESET);
899 if (strcasecmp (codeset_name, "UTF-8") == 0
900 || strcasecmp (codeset_name, "UTF8") == 0)
903 /* We check exhaustively in the loop below if this charset is a
904 superset of ASCII. */
905 dfa->map_notascii = 0;
908 #ifdef RE_ENABLE_I18N
909 if (dfa->mb_cur_max > 1)
912 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
917 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
918 if (BE (dfa->sb_char == NULL, 0))
921 /* Set the bits corresponding to single byte chars. */
922 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
923 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
925 wint_t wch = __btowc (ch);
927 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
929 if (isascii (ch) && wch != ch)
930 dfa->map_notascii = 1;
937 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
942 /* Initialize WORD_CHAR table, which indicate which character is
943 "word". In this case "word" means that it is the word construction
944 character used by some operators like "\<", "\>", etc. */
948 init_word_char (re_dfa_t *dfa)
951 dfa->word_ops_used = 1;
952 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
953 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
954 if (isalnum (ch) || ch == '_')
955 dfa->word_char[i] |= (bitset_word_t) 1 << j;
958 /* Free the work area which are only used while compiling. */
961 free_workarea_compile (regex_t *preg)
963 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
964 bin_tree_storage_t *storage, *next;
965 for (storage = dfa->str_tree_storage; storage; storage = next)
967 next = storage->next;
970 dfa->str_tree_storage = NULL;
971 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
972 dfa->str_tree = NULL;
973 re_free (dfa->org_indices);
974 dfa->org_indices = NULL;
977 /* Create initial states for all contexts. */
980 create_initial_state (re_dfa_t *dfa)
984 re_node_set init_nodes;
986 /* Initial states have the epsilon closure of the node which is
987 the first node of the regular expression. */
988 first = dfa->str_tree->first->node_idx;
989 dfa->init_node = first;
990 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
991 if (BE (err != REG_NOERROR, 0))
994 /* The back-references which are in initial states can epsilon transit,
995 since in this case all of the subexpressions can be null.
996 Then we add epsilon closures of the nodes which are the next nodes of
997 the back-references. */
998 if (dfa->nbackref > 0)
999 for (i = 0; i < init_nodes.nelem; ++i)
1001 Idx node_idx = init_nodes.elems[i];
1002 re_token_type_t type = dfa->nodes[node_idx].type;
1005 if (type != OP_BACK_REF)
1007 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1009 re_token_t *clexp_node;
1010 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1011 if (clexp_node->type == OP_CLOSE_SUBEXP
1012 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015 if (clexp_idx == init_nodes.nelem)
1018 if (type == OP_BACK_REF)
1020 Idx dest_idx = dfa->edests[node_idx].elems[0];
1021 if (!re_node_set_contains (&init_nodes, dest_idx))
1023 reg_errcode_t merge_err
1024 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025 if (merge_err != REG_NOERROR)
1032 /* It must be the first time to invoke acquire_state. */
1033 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1034 /* We don't check ERR here, since the initial state must not be NULL. */
1035 if (BE (dfa->init_state == NULL, 0))
1037 if (dfa->init_state->has_constraint)
1039 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1041 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1043 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1048 || dfa->init_state_begbuf == NULL, 0))
1052 dfa->init_state_word = dfa->init_state_nl
1053 = dfa->init_state_begbuf = dfa->init_state;
1055 re_node_set_free (&init_nodes);
1059 #ifdef RE_ENABLE_I18N
1060 /* If it is possible to do searching in single byte encoding instead of UTF-8
1061 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062 DFA nodes where needed. */
1065 optimize_utf8 (re_dfa_t *dfa)
1069 bool mb_chars = false;
1070 bool has_period = false;
1072 for (node = 0; node < dfa->nodes_len; ++node)
1073 switch (dfa->nodes[node].type)
1076 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080 switch (dfa->nodes[node].opr.ctx_type)
1088 /* Word anchors etc. cannot be handled. It's okay to test
1089 opr.ctx_type since constraints (for all DFA nodes) are
1090 created by ORing one or more opr.ctx_type values. */
1100 case OP_DUP_ASTERISK:
1101 case OP_OPEN_SUBEXP:
1102 case OP_CLOSE_SUBEXP:
1104 case COMPLEX_BRACKET:
1106 case SIMPLE_BRACKET:
1107 /* Just double check. */
1109 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1111 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1112 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1114 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1124 if (mb_chars || has_period)
1125 for (node = 0; node < dfa->nodes_len; ++node)
1127 if (dfa->nodes[node].type == CHARACTER
1128 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1129 dfa->nodes[node].mb_partial = 0;
1130 else if (dfa->nodes[node].type == OP_PERIOD)
1131 dfa->nodes[node].type = OP_UTF8_PERIOD;
1134 /* The search can be in single byte locale. */
1135 dfa->mb_cur_max = 1;
1137 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 /* Analyze the structure tree, and calculate "first", "next", "edest",
1142 "eclosure", and "inveclosure". */
1144 static reg_errcode_t
1145 analyze (regex_t *preg)
1147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1150 /* Allocate arrays. */
1151 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1152 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1153 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1154 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1155 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1156 || dfa->eclosures == NULL, 0))
1159 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1160 if (dfa->subexp_map != NULL)
1163 for (i = 0; i < preg->re_nsub; i++)
1164 dfa->subexp_map[i] = i;
1165 preorder (dfa->str_tree, optimize_subexps, dfa);
1166 for (i = 0; i < preg->re_nsub; i++)
1167 if (dfa->subexp_map[i] != i)
1169 if (i == preg->re_nsub)
1171 free (dfa->subexp_map);
1172 dfa->subexp_map = NULL;
1176 ret = postorder (dfa->str_tree, lower_subexps, preg);
1177 if (BE (ret != REG_NOERROR, 0))
1179 ret = postorder (dfa->str_tree, calc_first, dfa);
1180 if (BE (ret != REG_NOERROR, 0))
1182 preorder (dfa->str_tree, calc_next, dfa);
1183 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1184 if (BE (ret != REG_NOERROR, 0))
1186 ret = calc_eclosure (dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1190 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1192 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1195 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1196 if (BE (dfa->inveclosures == NULL, 0))
1198 ret = calc_inveclosure (dfa);
1204 /* Our parse trees are very unbalanced, so we cannot use a stack to
1205 implement parse tree visits. Instead, we use parent pointers and
1206 some hairy code in these two functions. */
1207 static reg_errcode_t
1208 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211 bin_tree_t *node, *prev;
1213 for (node = root; ; )
1215 /* Descend down the tree, preferably to the left (or to the right
1216 if that's the only child). */
1217 while (node->left || node->right)
1225 reg_errcode_t err = fn (extra, node);
1226 if (BE (err != REG_NOERROR, 0))
1228 if (node->parent == NULL)
1231 node = node->parent;
1233 /* Go up while we have a node that is reached from the right. */
1234 while (node->right == prev || node->right == NULL);
1239 static reg_errcode_t
1240 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1245 for (node = root; ; )
1247 reg_errcode_t err = fn (extra, node);
1248 if (BE (err != REG_NOERROR, 0))
1251 /* Go to the left node, or up and to the right. */
1256 bin_tree_t *prev = NULL;
1257 while (node->right == prev || node->right == NULL)
1260 node = node->parent;
1269 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1271 backreferences as well. Requires a preorder visit. */
1272 static reg_errcode_t
1273 optimize_subexps (void *extra, bin_tree_t *node)
1275 re_dfa_t *dfa = (re_dfa_t *) extra;
1277 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279 int idx = node->token.opr.idx;
1280 node->token.opr.idx = dfa->subexp_map[idx];
1281 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284 else if (node->token.type == SUBEXP
1285 && node->left && node->left->token.type == SUBEXP)
1287 Idx other_idx = node->left->token.opr.idx;
1289 node->left = node->left->left;
1291 node->left->parent = node;
1293 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1294 if (other_idx < BITSET_WORD_BITS)
1295 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1301 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1303 static reg_errcode_t
1304 lower_subexps (void *extra, bin_tree_t *node)
1306 regex_t *preg = (regex_t *) extra;
1307 reg_errcode_t err = REG_NOERROR;
1309 if (node->left && node->left->token.type == SUBEXP)
1311 node->left = lower_subexp (&err, preg, node->left);
1313 node->left->parent = node;
1315 if (node->right && node->right->token.type == SUBEXP)
1317 node->right = lower_subexp (&err, preg, node->right);
1319 node->right->parent = node;
1326 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1329 bin_tree_t *body = node->left;
1330 bin_tree_t *op, *cls, *tree1, *tree;
1333 /* We do not optimize empty subexpressions, because otherwise we may
1334 have bad CONCAT nodes with NULL children. This is obviously not
1335 very common, so we do not lose much. An example that triggers
1336 this case is the sed "script" /\(\)/x. */
1337 && node->left != NULL
1338 && (node->token.opr.idx >= BITSET_WORD_BITS
1339 || !(dfa->used_bkref_map
1340 & ((bitset_word_t) 1 << node->token.opr.idx))))
1343 /* Convert the SUBEXP node to the concatenation of an
1344 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348 tree = create_tree (dfa, op, tree1, CONCAT);
1349 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1355 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361 nodes. Requires a postorder visit. */
1362 static reg_errcode_t
1363 calc_first (void *extra, bin_tree_t *node)
1365 re_dfa_t *dfa = (re_dfa_t *) extra;
1366 if (node->token.type == CONCAT)
1368 node->first = node->left->first;
1369 node->node_idx = node->left->node_idx;
1374 node->node_idx = re_dfa_add_node (dfa, node->token);
1375 if (BE (node->node_idx == REG_MISSING, 0))
1377 if (node->token.type == ANCHOR)
1378 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1383 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1384 static reg_errcode_t
1385 calc_next (void *extra, bin_tree_t *node)
1387 switch (node->token.type)
1389 case OP_DUP_ASTERISK:
1390 node->left->next = node;
1393 node->left->next = node->right->first;
1394 node->right->next = node->next;
1398 node->left->next = node->next;
1400 node->right->next = node->next;
1406 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1407 static reg_errcode_t
1408 link_nfa_nodes (void *extra, bin_tree_t *node)
1410 re_dfa_t *dfa = (re_dfa_t *) extra;
1411 Idx idx = node->node_idx;
1412 reg_errcode_t err = REG_NOERROR;
1414 switch (node->token.type)
1420 assert (node->next == NULL);
1423 case OP_DUP_ASTERISK:
1427 dfa->has_plural_match = 1;
1428 if (node->left != NULL)
1429 left = node->left->first->node_idx;
1431 left = node->next->node_idx;
1432 if (node->right != NULL)
1433 right = node->right->first->node_idx;
1435 right = node->next->node_idx;
1436 assert (REG_VALID_INDEX (left));
1437 assert (REG_VALID_INDEX (right));
1438 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1443 case OP_OPEN_SUBEXP:
1444 case OP_CLOSE_SUBEXP:
1445 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449 dfa->nexts[idx] = node->next->node_idx;
1450 if (node->token.type == OP_BACK_REF)
1451 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455 assert (!IS_EPSILON_NODE (node->token.type));
1456 dfa->nexts[idx] = node->next->node_idx;
1463 /* Duplicate the epsilon closure of the node ROOT_NODE.
1464 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465 to their own constraint. */
1467 static reg_errcode_t
1469 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1470 Idx root_node, unsigned int init_constraint)
1472 Idx org_node, clone_node;
1474 unsigned int constraint = init_constraint;
1475 for (org_node = top_org_node, clone_node = top_clone_node;;)
1477 Idx org_dest, clone_dest;
1478 if (dfa->nodes[org_node].type == OP_BACK_REF)
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest = dfa->nexts[org_node];
1485 re_node_set_empty (dfa->edests + clone_node);
1486 clone_dest = duplicate_node (dfa, org_dest, constraint);
1487 if (BE (clone_dest == REG_MISSING, 0))
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494 else if (dfa->edests[org_node].nelem == 0)
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 else if (dfa->edests[org_node].nelem == 1)
1504 /* In case of the node can epsilon-transit, and it has only one
1506 org_dest = dfa->edests[org_node].elems[0];
1507 re_node_set_empty (dfa->edests + clone_node);
1508 /* If the node is root_node itself, it means the epsilon closure
1509 has a loop. Then tie it to the destination of the root_node. */
1510 if (org_node == root_node && clone_node != org_node)
1512 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1517 /* In case the node has another constraint, append it. */
1518 constraint |= dfa->nodes[org_node].constraint;
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == REG_MISSING, 0))
1522 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526 else /* dfa->edests[org_node].nelem == 2 */
1528 /* In case of the node can epsilon-transit, and it has two
1529 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1530 org_dest = dfa->edests[org_node].elems[0];
1531 re_node_set_empty (dfa->edests + clone_node);
1532 /* Search for a duplicated node which satisfies the constraint. */
1533 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534 if (clone_dest == REG_MISSING)
1536 /* There is no such duplicated node, create a new one. */
1538 clone_dest = duplicate_node (dfa, org_dest, constraint);
1539 if (BE (clone_dest == REG_MISSING, 0))
1541 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545 root_node, constraint);
1546 if (BE (err != REG_NOERROR, 0))
1551 /* There is a duplicated node which satisfies the constraint,
1552 use it to avoid infinite loop. */
1553 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 org_dest = dfa->edests[org_node].elems[1];
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == REG_MISSING, 0))
1562 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 org_node = org_dest;
1567 clone_node = clone_dest;
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1576 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1577 unsigned int constraint)
1580 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1582 if (org_node == dfa->org_indices[idx]
1583 && constraint == dfa->nodes[idx].constraint)
1584 return idx; /* Found. */
1586 return REG_MISSING; /* Not found. */
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590 Return the index of the new node, or REG_MISSING if insufficient storage is
1594 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1596 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597 if (BE (dup_idx != REG_MISSING, 1))
1599 dfa->nodes[dup_idx].constraint = constraint;
1600 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601 dfa->nodes[dup_idx].duplicated = 1;
1603 /* Store the index of the original node. */
1604 dfa->org_indices[dup_idx] = org_idx;
1609 static reg_errcode_t
1610 calc_inveclosure (re_dfa_t *dfa)
1614 for (idx = 0; idx < dfa->nodes_len; ++idx)
1615 re_node_set_init_empty (dfa->inveclosures + idx);
1617 for (src = 0; src < dfa->nodes_len; ++src)
1619 Idx *elems = dfa->eclosures[src].elems;
1620 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631 /* Calculate "eclosure" for all the node in DFA. */
1633 static reg_errcode_t
1634 calc_eclosure (re_dfa_t *dfa)
1639 assert (dfa->nodes_len > 0);
1642 /* For each nodes, calculate epsilon closure. */
1643 for (node_idx = 0; ; ++node_idx)
1646 re_node_set eclosure_elem;
1647 if (node_idx == dfa->nodes_len)
1656 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659 /* If we have already calculated, skip it. */
1660 if (dfa->eclosures[node_idx].nelem != 0)
1662 /* Calculate epsilon closure of `node_idx'. */
1663 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1664 if (BE (err != REG_NOERROR, 0))
1667 if (dfa->eclosures[node_idx].nelem == 0)
1670 re_node_set_free (&eclosure_elem);
1676 /* Calculate epsilon closure of NODE. */
1678 static reg_errcode_t
1679 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 re_node_set eclosure;
1685 bool incomplete = false;
1686 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1687 if (BE (err != REG_NOERROR, 0))
1690 /* This indicates that we are calculating this node now.
1691 We reference this value to avoid infinite loop. */
1692 dfa->eclosures[node].nelem = REG_MISSING;
1694 /* If the current node has constraints, duplicate all nodes
1695 since they must inherit the constraints. */
1696 if (dfa->nodes[node].constraint
1697 && dfa->edests[node].nelem
1698 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1700 err = duplicate_node_closure (dfa, node, node, node,
1701 dfa->nodes[node].constraint);
1702 if (BE (err != REG_NOERROR, 0))
1706 /* Expand each epsilon destination nodes. */
1707 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1708 for (i = 0; i < dfa->edests[node].nelem; ++i)
1710 re_node_set eclosure_elem;
1711 Idx edest = dfa->edests[node].elems[i];
1712 /* If calculating the epsilon closure of `edest' is in progress,
1713 return intermediate result. */
1714 if (dfa->eclosures[edest].nelem == REG_MISSING)
1719 /* If we haven't calculated the epsilon closure of `edest' yet,
1720 calculate now. Otherwise use calculated epsilon closure. */
1721 if (dfa->eclosures[edest].nelem == 0)
1723 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1724 if (BE (err != REG_NOERROR, 0))
1728 eclosure_elem = dfa->eclosures[edest];
1729 /* Merge the epsilon closure of `edest'. */
1730 err = re_node_set_merge (&eclosure, &eclosure_elem);
1731 if (BE (err != REG_NOERROR, 0))
1733 /* If the epsilon closure of `edest' is incomplete,
1734 the epsilon closure of this node is also incomplete. */
1735 if (dfa->eclosures[edest].nelem == 0)
1738 re_node_set_free (&eclosure_elem);
1742 /* An epsilon closure includes itself. */
1743 ok = re_node_set_insert (&eclosure, node);
1746 if (incomplete && !root)
1747 dfa->eclosures[node].nelem = 0;
1749 dfa->eclosures[node] = eclosure;
1750 *new_set = eclosure;
1754 /* Functions for token which are used in the parser. */
1756 /* Fetch a token from INPUT.
1757 We must not use this function inside bracket expressions. */
1761 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1763 re_string_skip_bytes (input, peek_token (result, input, syntax));
1766 /* Peek a token from INPUT, and return the length of the token.
1767 We must not use this function inside bracket expressions. */
1771 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 if (re_string_eoi (input))
1777 token->type = END_OF_RE;
1781 c = re_string_peek_byte (input, 0);
1784 token->word_char = 0;
1785 #ifdef RE_ENABLE_I18N
1786 token->mb_partial = 0;
1787 if (input->mb_cur_max > 1 &&
1788 !re_string_first_byte (input, re_string_cur_idx (input)))
1790 token->type = CHARACTER;
1791 token->mb_partial = 1;
1798 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1800 token->type = BACK_SLASH;
1804 c2 = re_string_peek_byte_case (input, 1);
1806 token->type = CHARACTER;
1807 #ifdef RE_ENABLE_I18N
1808 if (input->mb_cur_max > 1)
1810 wint_t wc = re_string_wchar_at (input,
1811 re_string_cur_idx (input) + 1);
1812 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816 token->word_char = IS_WORD_CHAR (c2) != 0;
1821 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1822 token->type = OP_ALT;
1824 case '1': case '2': case '3': case '4': case '5':
1825 case '6': case '7': case '8': case '9':
1826 if (!(syntax & RE_NO_BK_REFS))
1828 token->type = OP_BACK_REF;
1829 token->opr.idx = c2 - '1';
1833 if (!(syntax & RE_NO_GNU_OPS))
1835 token->type = ANCHOR;
1836 token->opr.ctx_type = WORD_FIRST;
1840 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_LAST;
1847 if (!(syntax & RE_NO_GNU_OPS))
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_DELIM;
1854 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = NOT_WORD_DELIM;
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_WORD;
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_NOTWORD;
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_SPACE;
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTSPACE;
1877 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = BUF_FIRST;
1884 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = BUF_LAST;
1891 if (!(syntax & RE_NO_BK_PARENS))
1892 token->type = OP_OPEN_SUBEXP;
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_CLOSE_SUBEXP;
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900 token->type = OP_DUP_PLUS;
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_QUESTION;
1907 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908 token->type = OP_OPEN_DUP_NUM;
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_CLOSE_DUP_NUM;
1920 token->type = CHARACTER;
1921 #ifdef RE_ENABLE_I18N
1922 if (input->mb_cur_max > 1)
1924 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1925 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929 token->word_char = IS_WORD_CHAR (token->opr.c);
1934 if (syntax & RE_NEWLINE_ALT)
1935 token->type = OP_ALT;
1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1939 token->type = OP_ALT;
1942 token->type = OP_DUP_ASTERISK;
1945 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946 token->type = OP_DUP_PLUS;
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_QUESTION;
1953 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954 token->type = OP_OPEN_DUP_NUM;
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_CLOSE_DUP_NUM;
1961 if (syntax & RE_NO_BK_PARENS)
1962 token->type = OP_OPEN_SUBEXP;
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_CLOSE_SUBEXP;
1969 token->type = OP_OPEN_BRACKET;
1972 token->type = OP_PERIOD;
1975 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1976 re_string_cur_idx (input) != 0)
1978 char prev = re_string_peek_byte (input, -1);
1979 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982 token->type = ANCHOR;
1983 token->opr.ctx_type = LINE_FIRST;
1986 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1987 re_string_cur_idx (input) + 1 != re_string_length (input))
1990 re_string_skip_bytes (input, 1);
1991 peek_token (&next, input, syntax);
1992 re_string_skip_bytes (input, -1);
1993 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996 token->type = ANCHOR;
1997 token->opr.ctx_type = LINE_LAST;
2005 /* Peek a token from INPUT, and return the length of the token.
2006 We must not use this function out of bracket expressions. */
2010 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 if (re_string_eoi (input))
2015 token->type = END_OF_RE;
2018 c = re_string_peek_byte (input, 0);
2021 #ifdef RE_ENABLE_I18N
2022 if (input->mb_cur_max > 1 &&
2023 !re_string_first_byte (input, re_string_cur_idx (input)))
2025 token->type = CHARACTER;
2028 #endif /* RE_ENABLE_I18N */
2030 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2031 && re_string_cur_idx (input) + 1 < re_string_length (input))
2033 /* In this case, '\' escape a character. */
2035 re_string_skip_bytes (input, 1);
2036 c2 = re_string_peek_byte (input, 0);
2038 token->type = CHARACTER;
2041 if (c == '[') /* '[' is a special char in a bracket exps. */
2045 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2046 c2 = re_string_peek_byte (input, 1);
2054 token->type = OP_OPEN_COLL_ELEM;
2057 token->type = OP_OPEN_EQUIV_CLASS;
2060 if (syntax & RE_CHAR_CLASSES)
2062 token->type = OP_OPEN_CHAR_CLASS;
2065 /* else fall through. */
2067 token->type = CHARACTER;
2077 token->type = OP_CHARSET_RANGE;
2080 token->type = OP_CLOSE_BRACKET;
2083 token->type = OP_NON_MATCH_LIST;
2086 token->type = CHARACTER;
2091 /* Functions for parser. */
2093 /* Entry point of the parser.
2094 Parse the regular expression REGEXP and return the structure tree.
2095 If an error is occured, ERR is set by error code, and return NULL.
2096 This function build the following tree, from regular expression <reg_exp>:
2102 CAT means concatenation.
2103 EOR means end of regular expression. */
2106 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2110 bin_tree_t *tree, *eor, *root;
2111 re_token_t current_token;
2112 dfa->syntax = syntax;
2113 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2115 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2117 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2119 root = create_tree (dfa, tree, eor, CONCAT);
2122 if (BE (eor == NULL || root == NULL, 0))
2130 /* This function build the following tree, from regular expression
2131 <branch1>|<branch2>:
2137 ALT means alternative, which represents the operator `|'. */
2140 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144 bin_tree_t *tree, *branch = NULL;
2145 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149 while (token->type == OP_ALT)
2151 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152 if (token->type != OP_ALT && token->type != END_OF_RE
2153 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2155 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2161 tree = create_tree (dfa, tree, branch, OP_ALT);
2162 if (BE (tree == NULL, 0))
2171 /* This function build the following tree, from regular expression
2178 CAT means concatenation. */
2181 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2184 bin_tree_t *tree, *expr;
2185 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 while (token->type != OP_ALT && token->type != END_OF_RE
2191 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2193 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2194 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2198 if (tree != NULL && expr != NULL)
2200 tree = create_tree (dfa, tree, expr, CONCAT);
2207 else if (tree == NULL)
2209 /* Otherwise expr == NULL, we don't need to create new tree. */
2214 /* This function build the following tree, from regular expression a*:
2221 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2224 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226 switch (token->type)
2229 tree = create_token_tree (dfa, NULL, NULL, token);
2230 if (BE (tree == NULL, 0))
2235 #ifdef RE_ENABLE_I18N
2236 if (dfa->mb_cur_max > 1)
2238 while (!re_string_eoi (regexp)
2239 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2241 bin_tree_t *mbc_remain;
2242 fetch_token (token, regexp, syntax);
2243 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2244 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2245 if (BE (mbc_remain == NULL || tree == NULL, 0))
2254 case OP_OPEN_SUBEXP:
2255 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2256 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259 case OP_OPEN_BRACKET:
2260 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2270 dfa->used_bkref_map |= 1 << token->opr.idx;
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (BE (tree == NULL, 0))
2278 dfa->has_mb_node = 1;
2280 case OP_OPEN_DUP_NUM:
2281 if (syntax & RE_CONTEXT_INVALID_DUP)
2287 case OP_DUP_ASTERISK:
2289 case OP_DUP_QUESTION:
2290 if (syntax & RE_CONTEXT_INVALID_OPS)
2295 else if (syntax & RE_CONTEXT_INDEP_OPS)
2297 fetch_token (token, regexp, syntax);
2298 return parse_expression (regexp, preg, token, syntax, nest, err);
2300 /* else fall through */
2301 case OP_CLOSE_SUBEXP:
2302 if ((token->type == OP_CLOSE_SUBEXP) &&
2303 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2308 /* else fall through */
2309 case OP_CLOSE_DUP_NUM:
2310 /* We treat it as a normal character. */
2312 /* Then we can these characters as normal characters. */
2313 token->type = CHARACTER;
2314 /* mb_partial and word_char bits should be initialized already
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2324 if ((token->opr.ctx_type
2325 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2326 && dfa->word_ops_used == 0)
2327 init_word_char (dfa);
2328 if (token->opr.ctx_type == WORD_DELIM
2329 || token->opr.ctx_type == NOT_WORD_DELIM)
2331 bin_tree_t *tree_first, *tree_last;
2332 if (token->opr.ctx_type == WORD_DELIM)
2334 token->opr.ctx_type = WORD_FIRST;
2335 tree_first = create_token_tree (dfa, NULL, NULL, token);
2336 token->opr.ctx_type = WORD_LAST;
2340 token->opr.ctx_type = INSIDE_WORD;
2341 tree_first = create_token_tree (dfa, NULL, NULL, token);
2342 token->opr.ctx_type = INSIDE_NOTWORD;
2344 tree_last = create_token_tree (dfa, NULL, NULL, token);
2345 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2346 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2361 /* We must return here, since ANCHORs can't be followed
2362 by repetition operators.
2363 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2365 fetch_token (token, regexp, syntax);
2368 tree = create_token_tree (dfa, NULL, NULL, token);
2369 if (BE (tree == NULL, 0))
2374 if (dfa->mb_cur_max > 1)
2375 dfa->has_mb_node = 1;
2379 tree = build_charclass_op (dfa, regexp->trans,
2380 (const unsigned char *) "alnum",
2381 (const unsigned char *) "_",
2382 token->type == OP_NOTWORD, err);
2383 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2388 tree = build_charclass_op (dfa, regexp->trans,
2389 (const unsigned char *) "space",
2390 (const unsigned char *) "",
2391 token->type == OP_NOTSPACE, err);
2392 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2402 /* Must not happen? */
2408 fetch_token (token, regexp, syntax);
2410 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2411 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2413 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2414 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2416 /* In BRE consecutive duplications are not allowed. */
2417 if ((syntax & RE_CONTEXT_INVALID_DUP)
2418 && (token->type == OP_DUP_ASTERISK
2419 || token->type == OP_OPEN_DUP_NUM))
2429 /* This function build the following tree, from regular expression
2437 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2438 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2440 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443 cur_nsub = preg->re_nsub++;
2445 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2447 /* The subexpression may be a null string. */
2448 if (token->type == OP_CLOSE_SUBEXP)
2452 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2453 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2455 if (BE (*err != REG_NOERROR, 0))
2459 if (cur_nsub <= '9' - '1')
2460 dfa->completed_bkref_map |= 1 << cur_nsub;
2462 tree = create_tree (dfa, tree, NULL, SUBEXP);
2463 if (BE (tree == NULL, 0))
2468 tree->token.opr.idx = cur_nsub;
2472 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2475 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2476 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2478 bin_tree_t *tree = NULL, *old_tree = NULL;
2479 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2480 re_token_t start_token = *token;
2482 if (token->type == OP_OPEN_DUP_NUM)
2485 start = fetch_number (regexp, token, syntax);
2486 if (start == REG_MISSING)
2488 if (token->type == CHARACTER && token->opr.c == ',')
2489 start = 0; /* We treat "{,m}" as "{0,m}". */
2492 *err = REG_BADBR; /* <re>{} is invalid. */
2496 if (BE (start != REG_ERROR, 1))
2498 /* We treat "{n}" as "{n,n}". */
2499 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2500 : ((token->type == CHARACTER && token->opr.c == ',')
2501 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2503 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2505 /* Invalid sequence. */
2506 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2508 if (token->type == END_OF_RE)
2516 /* If the syntax bit is set, rollback. */
2517 re_string_set_index (regexp, start_idx);
2518 *token = start_token;
2519 token->type = CHARACTER;
2520 /* mb_partial and word_char bits should be already initialized by
2525 if (BE ((end != REG_MISSING && start > end)
2526 || token->type != OP_CLOSE_DUP_NUM, 0))
2528 /* First number greater than second. */
2535 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2536 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539 fetch_token (token, regexp, syntax);
2541 if (BE (elem == NULL, 0))
2543 if (BE (start == 0 && end == 0, 0))
2545 postorder (elem, free_tree, NULL);
2549 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2550 if (BE (start > 0, 0))
2553 for (i = 2; i <= start; ++i)
2555 elem = duplicate_tree (elem, dfa);
2556 tree = create_tree (dfa, tree, elem, CONCAT);
2557 if (BE (elem == NULL || tree == NULL, 0))
2558 goto parse_dup_op_espace;
2564 /* Duplicate ELEM before it is marked optional. */
2565 elem = duplicate_tree (elem, dfa);
2571 if (elem->token.type == SUBEXP)
2572 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2574 tree = create_tree (dfa, elem, NULL,
2575 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2576 if (BE (tree == NULL, 0))
2577 goto parse_dup_op_espace;
2579 /* From gnulib's "intprops.h":
2580 True if the arithmetic type T is signed. */
2581 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2583 /* This loop is actually executed only when end != REG_MISSING,
2584 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2585 already created the start+1-th copy. */
2586 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2587 for (i = start + 2; i <= end; ++i)
2589 elem = duplicate_tree (elem, dfa);
2590 tree = create_tree (dfa, tree, elem, CONCAT);
2591 if (BE (elem == NULL || tree == NULL, 0))
2592 goto parse_dup_op_espace;
2594 tree = create_tree (dfa, tree, NULL, OP_ALT);
2595 if (BE (tree == NULL, 0))
2596 goto parse_dup_op_espace;
2600 tree = create_tree (dfa, old_tree, tree, CONCAT);
2604 parse_dup_op_espace:
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610 I'm not sure, but maybe enough. */
2611 #define BRACKET_NAME_BUF_SIZE 32
2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615 Build the range expression which starts from START_ELEM, and ends
2616 at END_ELEM. The result are written to MBCSET and SBCSET.
2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618 mbcset->range_ends, is a pointer argument sinse we may
2621 static reg_errcode_t
2623 # ifdef RE_ENABLE_I18N
2624 build_range_exp (const reg_syntax_t syntax,
2626 re_charset_t *mbcset,
2628 const bracket_elem_t *start_elem,
2629 const bracket_elem_t *end_elem)
2630 # else /* not RE_ENABLE_I18N */
2631 build_range_exp (const reg_syntax_t syntax,
2633 const bracket_elem_t *start_elem,
2634 const bracket_elem_t *end_elem)
2635 # endif /* not RE_ENABLE_I18N */
2637 unsigned int start_ch, end_ch;
2638 /* Equivalence Classes and Character Classes can't be a range start/end. */
2639 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2640 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644 /* We can handle no multi character collating elements without libc
2646 if (BE ((start_elem->type == COLL_SYM
2647 && strlen ((char *) start_elem->opr.name) > 1)
2648 || (end_elem->type == COLL_SYM
2649 && strlen ((char *) end_elem->opr.name) > 1), 0))
2650 return REG_ECOLLATE;
2652 # ifdef RE_ENABLE_I18N
2657 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2660 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2663 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2666 ? __btowc (start_ch) : start_elem->opr.wch);
2667 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2668 ? __btowc (end_ch) : end_elem->opr.wch);
2669 if (start_wc == WEOF || end_wc == WEOF)
2670 return REG_ECOLLATE;
2671 cmp_buf[0] = start_wc;
2672 cmp_buf[4] = end_wc;
2674 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2675 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2678 /* Got valid collation sequence values, add them as a new entry.
2679 However, for !_LIBC we have no collation elements: if the
2680 character set is single byte, the single byte character set
2681 that we build below suffices. parse_bracket_exp passes
2682 no MBCSET if dfa->mb_cur_max == 1. */
2685 /* Check the space of the arrays. */
2686 if (BE (*range_alloc == mbcset->nranges, 0))
2688 /* There is not enough space, need realloc. */
2689 wchar_t *new_array_start, *new_array_end;
2692 /* +1 in case of mbcset->nranges is 0. */
2693 new_nranges = 2 * mbcset->nranges + 1;
2694 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2695 are NULL if *range_alloc == 0. */
2696 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704 mbcset->range_starts = new_array_start;
2705 mbcset->range_ends = new_array_end;
2706 *range_alloc = new_nranges;
2709 mbcset->range_starts[mbcset->nranges] = start_wc;
2710 mbcset->range_ends[mbcset->nranges++] = end_wc;
2713 /* Build the table for single byte characters. */
2714 for (wc = 0; wc < SBC_MAX; ++wc)
2717 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2718 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2719 bitset_set (sbcset, wc);
2722 # else /* not RE_ENABLE_I18N */
2725 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2726 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2729 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731 if (start_ch > end_ch)
2733 /* Build the table for single byte characters. */
2734 for (ch = 0; ch < SBC_MAX; ++ch)
2735 if (start_ch <= ch && ch <= end_ch)
2736 bitset_set (sbcset, ch);
2738 # endif /* not RE_ENABLE_I18N */
2741 #endif /* not _LIBC */
2744 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2745 Build the collating element which is represented by NAME.
2746 The result are written to MBCSET and SBCSET.
2747 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2748 pointer argument since we may update it. */
2750 static reg_errcode_t
2752 build_collating_symbol (bitset_t sbcset,
2753 # ifdef RE_ENABLE_I18N
2754 re_charset_t *mbcset, Idx *coll_sym_alloc,
2756 const unsigned char *name)
2758 size_t name_len = strlen ((const char *) name);
2759 if (BE (name_len != 1, 0))
2760 return REG_ECOLLATE;
2763 bitset_set (sbcset, name[0]);
2767 #endif /* not _LIBC */
2769 /* This function parse bracket expression like "[abc]", "[a-c]",
2773 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2774 reg_syntax_t syntax, reg_errcode_t *err)
2777 const unsigned char *collseqmb;
2778 const char *collseqwc;
2781 const int32_t *symb_table;
2782 const unsigned char *extra;
2784 /* Local function for parse_bracket_exp used in _LIBC environement.
2785 Seek the collating symbol entry correspondings to NAME.
2786 Return the index of the symbol in the SYMB_TABLE. */
2789 __attribute ((always_inline))
2790 seek_collating_symbol_entry (name, name_len)
2791 const unsigned char *name;
2794 int32_t hash = elem_hash ((const char *) name, name_len);
2795 int32_t elem = hash % table_size;
2796 if (symb_table[2 * elem] != 0)
2798 int32_t second = hash % (table_size - 2) + 1;
2802 /* First compare the hashing value. */
2803 if (symb_table[2 * elem] == hash
2804 /* Compare the length of the name. */
2805 && name_len == extra[symb_table[2 * elem + 1]]
2806 /* Compare the name. */
2807 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2810 /* Yep, this is the entry. */
2817 while (symb_table[2 * elem] != 0);
2822 /* Local function for parse_bracket_exp used in _LIBC environment.
2823 Look up the collation sequence value of BR_ELEM.
2824 Return the value if succeeded, UINT_MAX otherwise. */
2826 auto inline unsigned int
2827 __attribute ((always_inline))
2828 lookup_collation_sequence_value (br_elem)
2829 bracket_elem_t *br_elem;
2831 if (br_elem->type == SB_CHAR)
2834 if (MB_CUR_MAX == 1)
2837 return collseqmb[br_elem->opr.ch];
2840 wint_t wc = __btowc (br_elem->opr.ch);
2841 return __collseq_table_lookup (collseqwc, wc);
2844 else if (br_elem->type == MB_CHAR)
2847 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2849 else if (br_elem->type == COLL_SYM)
2851 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855 elem = seek_collating_symbol_entry (br_elem->opr.name,
2857 if (symb_table[2 * elem] != 0)
2859 /* We found the entry. */
2860 idx = symb_table[2 * elem + 1];
2861 /* Skip the name of collating element name. */
2862 idx += 1 + extra[idx];
2863 /* Skip the byte sequence of the collating element. */
2864 idx += 1 + extra[idx];
2865 /* Adjust for the alignment. */
2866 idx = (idx + 3) & ~3;
2867 /* Skip the multibyte collation sequence value. */
2868 idx += sizeof (unsigned int);
2869 /* Skip the wide char sequence of the collating element. */
2870 idx += sizeof (unsigned int) *
2871 (1 + *(unsigned int *) (extra + idx));
2872 /* Return the collation sequence value. */
2873 return *(unsigned int *) (extra + idx);
2875 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2877 /* No valid character. Match it as a single byte
2879 return collseqmb[br_elem->opr.name[0]];
2882 else if (sym_name_len == 1)
2883 return collseqmb[br_elem->opr.name[0]];
2888 /* Local function for parse_bracket_exp used in _LIBC environement.
2889 Build the range expression which starts from START_ELEM, and ends
2890 at END_ELEM. The result are written to MBCSET and SBCSET.
2891 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892 mbcset->range_ends, is a pointer argument sinse we may
2895 auto inline reg_errcode_t
2896 __attribute ((always_inline))
2897 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2898 re_charset_t *mbcset;
2901 bracket_elem_t *start_elem, *end_elem;
2904 uint32_t start_collseq;
2905 uint32_t end_collseq;
2907 /* Equivalence Classes and Character Classes can't be a range
2909 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2910 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914 start_collseq = lookup_collation_sequence_value (start_elem);
2915 end_collseq = lookup_collation_sequence_value (end_elem);
2916 /* Check start/end collation sequence values. */
2917 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2918 return REG_ECOLLATE;
2919 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2922 /* Got valid collation sequence values, add them as a new entry.
2923 However, if we have no collation elements, and the character set
2924 is single byte, the single byte character set that we
2925 build below suffices. */
2926 if (nrules > 0 || dfa->mb_cur_max > 1)
2928 /* Check the space of the arrays. */
2929 if (BE (*range_alloc == mbcset->nranges, 0))
2931 /* There is not enough space, need realloc. */
2932 uint32_t *new_array_start;
2933 uint32_t *new_array_end;
2936 /* +1 in case of mbcset->nranges is 0. */
2937 new_nranges = 2 * mbcset->nranges + 1;
2938 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2940 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2943 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2946 mbcset->range_starts = new_array_start;
2947 mbcset->range_ends = new_array_end;
2948 *range_alloc = new_nranges;
2951 mbcset->range_starts[mbcset->nranges] = start_collseq;
2952 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2955 /* Build the table for single byte characters. */
2956 for (ch = 0; ch < SBC_MAX; ch++)
2958 uint32_t ch_collseq;
2960 if (MB_CUR_MAX == 1)
2963 ch_collseq = collseqmb[ch];
2965 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2966 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2967 bitset_set (sbcset, ch);
2972 /* Local function for parse_bracket_exp used in _LIBC environement.
2973 Build the collating element which is represented by NAME.
2974 The result are written to MBCSET and SBCSET.
2975 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2976 pointer argument sinse we may update it. */
2978 auto inline reg_errcode_t
2979 __attribute ((always_inline))
2980 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2981 re_charset_t *mbcset;
2982 Idx *coll_sym_alloc;
2984 const unsigned char *name;
2987 size_t name_len = strlen ((const char *) name);
2990 elem = seek_collating_symbol_entry (name, name_len);
2991 if (symb_table[2 * elem] != 0)
2993 /* We found the entry. */
2994 idx = symb_table[2 * elem + 1];
2995 /* Skip the name of collating element name. */
2996 idx += 1 + extra[idx];
2998 else if (symb_table[2 * elem] == 0 && name_len == 1)
3000 /* No valid character, treat it as a normal
3002 bitset_set (sbcset, name[0]);
3006 return REG_ECOLLATE;
3008 /* Got valid collation sequence, add it as a new entry. */
3009 /* Check the space of the arrays. */
3010 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3012 /* Not enough, realloc it. */
3013 /* +1 in case of mbcset->ncoll_syms is 0. */
3014 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3015 /* Use realloc since mbcset->coll_syms is NULL
3017 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3018 new_coll_sym_alloc);
3019 if (BE (new_coll_syms == NULL, 0))
3021 mbcset->coll_syms = new_coll_syms;
3022 *coll_sym_alloc = new_coll_sym_alloc;
3024 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3029 if (BE (name_len != 1, 0))
3030 return REG_ECOLLATE;
3033 bitset_set (sbcset, name[0]);
3040 re_token_t br_token;
3041 re_bitset_ptr_t sbcset;
3042 #ifdef RE_ENABLE_I18N
3043 re_charset_t *mbcset;
3044 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3045 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3046 #endif /* not RE_ENABLE_I18N */
3047 bool non_match = false;
3048 bin_tree_t *work_tree;
3050 bool first_round = true;
3052 collseqmb = (const unsigned char *)
3053 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3054 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3060 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3061 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3062 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3063 _NL_COLLATE_SYMB_TABLEMB);
3064 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3065 _NL_COLLATE_SYMB_EXTRAMB);
3068 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3069 #ifdef RE_ENABLE_I18N
3070 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3071 #endif /* RE_ENABLE_I18N */
3072 #ifdef RE_ENABLE_I18N
3073 if (BE (sbcset == NULL || mbcset == NULL, 0))
3075 if (BE (sbcset == NULL, 0))
3076 #endif /* RE_ENABLE_I18N */
3082 token_len = peek_token_bracket (token, regexp, syntax);
3083 if (BE (token->type == END_OF_RE, 0))
3086 goto parse_bracket_exp_free_return;
3088 if (token->type == OP_NON_MATCH_LIST)
3090 #ifdef RE_ENABLE_I18N
3091 mbcset->non_match = 1;
3092 #endif /* not RE_ENABLE_I18N */
3094 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3095 bitset_set (sbcset, '\n');
3096 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3097 token_len = peek_token_bracket (token, regexp, syntax);
3098 if (BE (token->type == END_OF_RE, 0))
3101 goto parse_bracket_exp_free_return;
3105 /* We treat the first ']' as a normal character. */
3106 if (token->type == OP_CLOSE_BRACKET)
3107 token->type = CHARACTER;
3111 bracket_elem_t start_elem, end_elem;
3112 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3113 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3116 bool is_range_exp = false;
3119 start_elem.opr.name = start_name_buf;
3120 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3121 syntax, first_round);
3122 if (BE (ret != REG_NOERROR, 0))
3125 goto parse_bracket_exp_free_return;
3127 first_round = false;
3129 /* Get information about the next token. We need it in any case. */
3130 token_len = peek_token_bracket (token, regexp, syntax);
3132 /* Do not check for ranges if we know they are not allowed. */
3133 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3135 if (BE (token->type == END_OF_RE, 0))
3138 goto parse_bracket_exp_free_return;
3140 if (token->type == OP_CHARSET_RANGE)
3142 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3143 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3144 if (BE (token2.type == END_OF_RE, 0))
3147 goto parse_bracket_exp_free_return;
3149 if (token2.type == OP_CLOSE_BRACKET)
3151 /* We treat the last '-' as a normal character. */
3152 re_string_skip_bytes (regexp, -token_len);
3153 token->type = CHARACTER;
3156 is_range_exp = true;
3160 if (is_range_exp == true)
3162 end_elem.opr.name = end_name_buf;
3163 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3165 if (BE (ret != REG_NOERROR, 0))
3168 goto parse_bracket_exp_free_return;
3171 token_len = peek_token_bracket (token, regexp, syntax);
3174 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3175 &start_elem, &end_elem);
3177 # ifdef RE_ENABLE_I18N
3178 *err = build_range_exp (syntax, sbcset,
3179 dfa->mb_cur_max > 1 ? mbcset : NULL,
3180 &range_alloc, &start_elem, &end_elem);
3182 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3184 #endif /* RE_ENABLE_I18N */
3185 if (BE (*err != REG_NOERROR, 0))
3186 goto parse_bracket_exp_free_return;
3190 switch (start_elem.type)
3193 bitset_set (sbcset, start_elem.opr.ch);
3195 #ifdef RE_ENABLE_I18N
3197 /* Check whether the array has enough space. */
3198 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3200 wchar_t *new_mbchars;
3201 /* Not enough, realloc it. */
3202 /* +1 in case of mbcset->nmbchars is 0. */
3203 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3204 /* Use realloc since array is NULL if *alloc == 0. */
3205 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3207 if (BE (new_mbchars == NULL, 0))
3208 goto parse_bracket_exp_espace;
3209 mbcset->mbchars = new_mbchars;
3211 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3213 #endif /* RE_ENABLE_I18N */
3215 *err = build_equiv_class (sbcset,
3216 #ifdef RE_ENABLE_I18N
3217 mbcset, &equiv_class_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3224 *err = build_collating_symbol (sbcset,
3225 #ifdef RE_ENABLE_I18N
3226 mbcset, &coll_sym_alloc,
3227 #endif /* RE_ENABLE_I18N */
3228 start_elem.opr.name);
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3233 *err = build_charclass (regexp->trans, sbcset,
3234 #ifdef RE_ENABLE_I18N
3235 mbcset, &char_class_alloc,
3236 #endif /* RE_ENABLE_I18N */
3237 start_elem.opr.name, syntax);
3238 if (BE (*err != REG_NOERROR, 0))
3239 goto parse_bracket_exp_free_return;
3246 if (BE (token->type == END_OF_RE, 0))
3249 goto parse_bracket_exp_free_return;
3251 if (token->type == OP_CLOSE_BRACKET)
3255 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3257 /* If it is non-matching list. */
3259 bitset_not (sbcset);
3261 #ifdef RE_ENABLE_I18N
3262 /* Ensure only single byte characters are set. */
3263 if (dfa->mb_cur_max > 1)
3264 bitset_mask (sbcset, dfa->sb_char);
3266 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3267 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3268 || mbcset->non_match)))
3270 bin_tree_t *mbc_tree;
3272 /* Build a tree for complex bracket. */
3273 dfa->has_mb_node = 1;
3274 br_token.type = COMPLEX_BRACKET;
3275 br_token.opr.mbcset = mbcset;
3276 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 if (BE (mbc_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3280 if (sbcset[sbc_idx])
3282 /* If there are no bits set in sbcset, there is no point
3283 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3284 if (sbc_idx < BITSET_WORDS)
3286 /* Build a tree for simple bracket. */
3287 br_token.type = SIMPLE_BRACKET;
3288 br_token.opr.sbcset = sbcset;
3289 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290 if (BE (work_tree == NULL, 0))
3291 goto parse_bracket_exp_espace;
3293 /* Then join them by ALT node. */
3294 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3295 if (BE (work_tree == NULL, 0))
3296 goto parse_bracket_exp_espace;
3301 work_tree = mbc_tree;
3305 #endif /* not RE_ENABLE_I18N */
3307 #ifdef RE_ENABLE_I18N
3308 free_charset (mbcset);
3310 /* Build a tree for simple bracket. */
3311 br_token.type = SIMPLE_BRACKET;
3312 br_token.opr.sbcset = sbcset;
3313 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314 if (BE (work_tree == NULL, 0))
3315 goto parse_bracket_exp_espace;
3319 parse_bracket_exp_espace:
3321 parse_bracket_exp_free_return:
3323 #ifdef RE_ENABLE_I18N
3324 free_charset (mbcset);
3325 #endif /* RE_ENABLE_I18N */
3329 /* Parse an element in the bracket expression. */
3331 static reg_errcode_t
3332 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3333 re_token_t *token, int token_len, re_dfa_t *dfa,
3334 reg_syntax_t syntax, bool accept_hyphen)
3336 #ifdef RE_ENABLE_I18N
3338 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3339 if (cur_char_size > 1)
3341 elem->type = MB_CHAR;
3342 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3343 re_string_skip_bytes (regexp, cur_char_size);
3346 #endif /* RE_ENABLE_I18N */
3347 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3348 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3349 || token->type == OP_OPEN_EQUIV_CLASS)
3350 return parse_bracket_symbol (elem, regexp, token);
3351 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3353 /* A '-' must only appear as anything but a range indicator before
3354 the closing bracket. Everything else is an error. */
3356 (void) peek_token_bracket (&token2, regexp, syntax);
3357 if (token2.type != OP_CLOSE_BRACKET)
3358 /* The actual error value is not standardized since this whole
3359 case is undefined. But ERANGE makes good sense. */
3362 elem->type = SB_CHAR;
3363 elem->opr.ch = token->opr.c;
3367 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3368 such as [:<character_class>:], [.<collating_element>.], and
3369 [=<equivalent_class>=]. */
3371 static reg_errcode_t
3372 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3375 unsigned char ch, delim = token->opr.c;
3377 if (re_string_eoi(regexp))
3381 if (i >= BRACKET_NAME_BUF_SIZE)
3383 if (token->type == OP_OPEN_CHAR_CLASS)
3384 ch = re_string_fetch_byte_case (regexp);
3386 ch = re_string_fetch_byte (regexp);
3387 if (re_string_eoi(regexp))
3389 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3391 elem->opr.name[i] = ch;
3393 re_string_skip_bytes (regexp, 1);
3394 elem->opr.name[i] = '\0';
3395 switch (token->type)
3397 case OP_OPEN_COLL_ELEM:
3398 elem->type = COLL_SYM;
3400 case OP_OPEN_EQUIV_CLASS:
3401 elem->type = EQUIV_CLASS;
3403 case OP_OPEN_CHAR_CLASS:
3404 elem->type = CHAR_CLASS;
3412 /* Helper function for parse_bracket_exp.
3413 Build the equivalence class which is represented by NAME.
3414 The result are written to MBCSET and SBCSET.
3415 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3416 is a pointer argument sinse we may update it. */
3418 static reg_errcode_t
3419 #ifdef RE_ENABLE_I18N
3420 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3421 Idx *equiv_class_alloc, const unsigned char *name)
3422 #else /* not RE_ENABLE_I18N */
3423 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3424 #endif /* not RE_ENABLE_I18N */
3427 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3430 const int32_t *table, *indirect;
3431 const unsigned char *weights, *extra, *cp;
3432 unsigned char char_buf[2];
3436 /* This #include defines a local function! */
3437 # include <locale/weight.h>
3438 /* Calculate the index for equivalence class. */
3440 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3441 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3442 _NL_COLLATE_WEIGHTMB);
3443 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3444 _NL_COLLATE_EXTRAMB);
3445 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3446 _NL_COLLATE_INDIRECTMB);
3447 idx1 = findidx (&cp);
3448 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3449 /* This isn't a valid character. */
3450 return REG_ECOLLATE;
3452 /* Build single byte matcing table for this equivalence class. */
3453 char_buf[1] = (unsigned char) '\0';
3454 len = weights[idx1 & 0xffffff];
3455 for (ch = 0; ch < SBC_MAX; ++ch)
3459 idx2 = findidx (&cp);
3464 /* This isn't a valid character. */
3466 /* Compare only if the length matches and the collation rule
3467 index is the same. */
3468 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3472 while (cnt <= len &&
3473 weights[(idx1 & 0xffffff) + 1 + cnt]
3474 == weights[(idx2 & 0xffffff) + 1 + cnt])
3478 bitset_set (sbcset, ch);
3481 /* Check whether the array has enough space. */
3482 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3484 /* Not enough, realloc it. */
3485 /* +1 in case of mbcset->nequiv_classes is 0. */
3486 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3487 /* Use realloc since the array is NULL if *alloc == 0. */
3488 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3490 new_equiv_class_alloc);
3491 if (BE (new_equiv_classes == NULL, 0))
3493 mbcset->equiv_classes = new_equiv_classes;
3494 *equiv_class_alloc = new_equiv_class_alloc;
3496 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3501 if (BE (strlen ((const char *) name) != 1, 0))
3502 return REG_ECOLLATE;
3503 bitset_set (sbcset, *name);
3508 /* Helper function for parse_bracket_exp.
3509 Build the character class which is represented by NAME.
3510 The result are written to MBCSET and SBCSET.
3511 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3512 is a pointer argument sinse we may update it. */
3514 static reg_errcode_t
3515 #ifdef RE_ENABLE_I18N
3516 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3517 re_charset_t *mbcset, Idx *char_class_alloc,
3518 const unsigned char *class_name, reg_syntax_t syntax)
3519 #else /* not RE_ENABLE_I18N */
3520 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3521 const unsigned char *class_name, reg_syntax_t syntax)
3522 #endif /* not RE_ENABLE_I18N */
3525 const char *name = (const char *) class_name;
3527 /* In case of REG_ICASE "upper" and "lower" match the both of
3528 upper and lower cases. */
3529 if ((syntax & RE_ICASE)
3530 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3533 #ifdef RE_ENABLE_I18N
3534 /* Check the space of the arrays. */
3535 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3537 /* Not enough, realloc it. */
3538 /* +1 in case of mbcset->nchar_classes is 0. */
3539 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3540 /* Use realloc since array is NULL if *alloc == 0. */
3541 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3542 new_char_class_alloc);
3543 if (BE (new_char_classes == NULL, 0))
3545 mbcset->char_classes = new_char_classes;
3546 *char_class_alloc = new_char_class_alloc;
3548 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3549 #endif /* RE_ENABLE_I18N */
3551 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3553 if (BE (trans != NULL, 0)) \
3555 for (i = 0; i < SBC_MAX; ++i) \
3556 if (ctype_func (i)) \
3557 bitset_set (sbcset, trans[i]); \
3561 for (i = 0; i < SBC_MAX; ++i) \
3562 if (ctype_func (i)) \
3563 bitset_set (sbcset, i); \
3567 if (strcmp (name, "alnum") == 0)
3568 BUILD_CHARCLASS_LOOP (isalnum);
3569 else if (strcmp (name, "cntrl") == 0)
3570 BUILD_CHARCLASS_LOOP (iscntrl);
3571 else if (strcmp (name, "lower") == 0)
3572 BUILD_CHARCLASS_LOOP (islower);
3573 else if (strcmp (name, "space") == 0)
3574 BUILD_CHARCLASS_LOOP (isspace);
3575 else if (strcmp (name, "alpha") == 0)
3576 BUILD_CHARCLASS_LOOP (isalpha);
3577 else if (strcmp (name, "digit") == 0)
3578 BUILD_CHARCLASS_LOOP (isdigit);
3579 else if (strcmp (name, "print") == 0)
3580 BUILD_CHARCLASS_LOOP (isprint);
3581 else if (strcmp (name, "upper") == 0)
3582 BUILD_CHARCLASS_LOOP (isupper);
3583 else if (strcmp (name, "blank") == 0)
3584 BUILD_CHARCLASS_LOOP (isblank);
3585 else if (strcmp (name, "graph") == 0)
3586 BUILD_CHARCLASS_LOOP (isgraph);
3587 else if (strcmp (name, "punct") == 0)
3588 BUILD_CHARCLASS_LOOP (ispunct);
3589 else if (strcmp (name, "xdigit") == 0)
3590 BUILD_CHARCLASS_LOOP (isxdigit);
3598 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3599 const unsigned char *class_name,
3600 const unsigned char *extra, bool non_match,
3603 re_bitset_ptr_t sbcset;
3604 #ifdef RE_ENABLE_I18N
3605 re_charset_t *mbcset;
3607 #endif /* not RE_ENABLE_I18N */
3609 re_token_t br_token;
3612 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3613 #ifdef RE_ENABLE_I18N
3614 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3615 #endif /* RE_ENABLE_I18N */
3617 #ifdef RE_ENABLE_I18N
3618 if (BE (sbcset == NULL || mbcset == NULL, 0))
3619 #else /* not RE_ENABLE_I18N */
3620 if (BE (sbcset == NULL, 0))
3621 #endif /* not RE_ENABLE_I18N */
3629 #ifdef RE_ENABLE_I18N
3630 mbcset->non_match = 1;
3631 #endif /* not RE_ENABLE_I18N */
3634 /* We don't care the syntax in this case. */
3635 ret = build_charclass (trans, sbcset,
3636 #ifdef RE_ENABLE_I18N
3638 #endif /* RE_ENABLE_I18N */
3641 if (BE (ret != REG_NOERROR, 0))
3644 #ifdef RE_ENABLE_I18N
3645 free_charset (mbcset);
3646 #endif /* RE_ENABLE_I18N */
3650 /* \w match '_' also. */
3651 for (; *extra; extra++)
3652 bitset_set (sbcset, *extra);
3654 /* If it is non-matching list. */
3656 bitset_not (sbcset);
3658 #ifdef RE_ENABLE_I18N
3659 /* Ensure only single byte characters are set. */
3660 if (dfa->mb_cur_max > 1)
3661 bitset_mask (sbcset, dfa->sb_char);
3664 /* Build a tree for simple bracket. */
3665 br_token.type = SIMPLE_BRACKET;
3666 br_token.opr.sbcset = sbcset;
3667 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3668 if (BE (tree == NULL, 0))
3669 goto build_word_op_espace;
3671 #ifdef RE_ENABLE_I18N
3672 if (dfa->mb_cur_max > 1)
3674 bin_tree_t *mbc_tree;
3675 /* Build a tree for complex bracket. */
3676 br_token.type = COMPLEX_BRACKET;
3677 br_token.opr.mbcset = mbcset;
3678 dfa->has_mb_node = 1;
3679 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680 if (BE (mbc_tree == NULL, 0))
3681 goto build_word_op_espace;
3682 /* Then join them by ALT node. */
3683 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3684 if (BE (mbc_tree != NULL, 1))
3689 free_charset (mbcset);
3692 #else /* not RE_ENABLE_I18N */
3694 #endif /* not RE_ENABLE_I18N */
3696 build_word_op_espace:
3698 #ifdef RE_ENABLE_I18N
3699 free_charset (mbcset);
3700 #endif /* RE_ENABLE_I18N */
3705 /* This is intended for the expressions like "a{1,3}".
3706 Fetch a number from `input', and return the number.
3707 Return REG_MISSING if the number field is empty like "{,1}".
3708 Return REG_ERROR if an error occurred. */
3711 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3713 Idx num = REG_MISSING;
3717 fetch_token (token, input, syntax);
3719 if (BE (token->type == END_OF_RE, 0))
3721 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3723 num = ((token->type != CHARACTER || c < '0' || '9' < c
3724 || num == REG_ERROR)
3726 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3727 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3732 #ifdef RE_ENABLE_I18N
3734 free_charset (re_charset_t *cset)
3736 re_free (cset->mbchars);
3738 re_free (cset->coll_syms);
3739 re_free (cset->equiv_classes);
3740 re_free (cset->range_starts);
3741 re_free (cset->range_ends);
3743 re_free (cset->char_classes);
3746 #endif /* RE_ENABLE_I18N */
3748 /* Functions for binary tree operation. */
3750 /* Create a tree node. */
3753 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3754 re_token_type_t type)
3758 return create_token_tree (dfa, left, right, &t);
3762 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3763 const re_token_t *token)
3766 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3768 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3770 if (storage == NULL)
3772 storage->next = dfa->str_tree_storage;
3773 dfa->str_tree_storage = storage;
3774 dfa->str_tree_storage_idx = 0;
3776 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3778 tree->parent = NULL;
3780 tree->right = right;
3781 tree->token = *token;
3782 tree->token.duplicated = 0;
3783 tree->token.opt_subexp = 0;
3786 tree->node_idx = REG_MISSING;
3789 left->parent = tree;
3791 right->parent = tree;
3795 /* Mark the tree SRC as an optional subexpression.
3796 To be called from preorder or postorder. */
3798 static reg_errcode_t
3799 mark_opt_subexp (void *extra, bin_tree_t *node)
3801 Idx idx = (Idx) (long) extra;
3802 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3803 node->token.opt_subexp = 1;
3808 /* Free the allocated memory inside NODE. */
3811 free_token (re_token_t *node)
3813 #ifdef RE_ENABLE_I18N
3814 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3815 free_charset (node->opr.mbcset);
3817 #endif /* RE_ENABLE_I18N */
3818 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3819 re_free (node->opr.sbcset);
3822 /* Worker function for tree walking. Free the allocated memory inside NODE
3823 and its children. */
3825 static reg_errcode_t
3826 free_tree (void *extra, bin_tree_t *node)
3828 free_token (&node->token);
3833 /* Duplicate the node SRC, and return new node. This is a preorder
3834 visit similar to the one implemented by the generic visitor, but
3835 we need more infrastructure to maintain two parallel trees --- so,
3836 it's easier to duplicate. */
3839 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3841 const bin_tree_t *node;
3842 bin_tree_t *dup_root;
3843 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3845 for (node = root; ; )
3847 /* Create a new tree and link it back to the current parent. */
3848 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3851 (*p_new)->parent = dup_node;
3852 (*p_new)->token.duplicated = 1;
3855 /* Go to the left node, or up and to the right. */
3859 p_new = &dup_node->left;
3863 const bin_tree_t *prev = NULL;
3864 while (node->right == prev || node->right == NULL)
3867 node = node->parent;
3868 dup_node = dup_node->parent;
3873 p_new = &dup_node->right;