1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
380 if (dfa->mb_cur_max > 1)
381 for (i = 0; i < SBC_MAX; ++i)
382 if (__btowc (i) == WEOF)
383 re_set_fastmap (fastmap, icase, i);
384 # endif /* not _LIBC */
386 for (i = 0; i < cset->nmbchars; ++i)
390 memset (&state, '\0', sizeof (state));
391 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
395 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
397 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
401 #endif /* RE_ENABLE_I18N */
402 else if (type == OP_PERIOD
403 #ifdef RE_ENABLE_I18N
404 || type == OP_UTF8_PERIOD
405 #endif /* RE_ENABLE_I18N */
406 || type == END_OF_RE)
408 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 if (type == END_OF_RE)
410 bufp->can_be_null = 1;
416 /* Entry point for POSIX code. */
417 /* regcomp takes a regular expression as a string and compiles it.
419 PREG is a regex_t *. We do not expect any fields to be initialized,
420 since POSIX says we shouldn't. Thus, we set
422 `buffer' to the compiled pattern;
423 `used' to the length of the compiled pattern;
424 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425 REG_EXTENDED bit in CFLAGS is set; otherwise, to
426 RE_SYNTAX_POSIX_BASIC;
427 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428 `fastmap' to an allocated space for the fastmap;
429 `fastmap_accurate' to zero;
430 `re_nsub' to the number of subexpressions in PATTERN.
432 PATTERN is the address of the pattern string.
434 CFLAGS is a series of bits which affect compilation.
436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437 use POSIX basic syntax.
439 If REG_NEWLINE is set, then . and [^...] don't match newline.
440 Also, regexec will try a match beginning after every newline.
442 If REG_ICASE is set, then we considers upper- and lowercase
443 versions of letters to be equivalent when matching.
445 If REG_NOSUB is set, then when PREG is passed to regexec, that
446 routine will report only success or failure, and nothing about the
449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
450 the return codes and their meanings.) */
453 regcomp (preg, pattern, cflags)
454 regex_t *_Restrict_ preg;
455 const char *_Restrict_ pattern;
459 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 : RE_SYNTAX_POSIX_BASIC);
466 /* Try to allocate space for the fastmap. */
467 preg->fastmap = re_malloc (char, SBC_MAX);
468 if (BE (preg->fastmap == NULL, 0))
471 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
473 /* If REG_NEWLINE is set, newlines are treated differently. */
474 if (cflags & REG_NEWLINE)
475 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
476 syntax &= ~RE_DOT_NEWLINE;
477 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478 /* It also changes the matching behavior. */
479 preg->newline_anchor = 1;
482 preg->newline_anchor = 0;
483 preg->no_sub = !!(cflags & REG_NOSUB);
484 preg->translate = NULL;
486 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
488 /* POSIX doesn't distinguish between an unmatched open-group and an
489 unmatched close-group: both are REG_EPAREN. */
490 if (ret == REG_ERPAREN)
493 /* We have already checked preg->fastmap != NULL. */
494 if (BE (ret == REG_NOERROR, 1))
495 /* Compute the fastmap now, since regexec cannot modify the pattern
496 buffer. This function never fails in this implementation. */
497 (void) re_compile_fastmap (preg);
500 /* Some error occurred while compiling the expression. */
501 re_free (preg->fastmap);
502 preg->fastmap = NULL;
508 weak_alias (__regcomp, regcomp)
511 /* Returns a message corresponding to an error code, ERRCODE, returned
512 from either regcomp or regexec. We don't use PREG here. */
516 regerror (errcode, preg, errbuf, errbuf_size)
518 const regex_t *_Restrict_ preg;
519 char *_Restrict_ errbuf;
521 #else /* size_t might promote */
523 regerror (int errcode, const regex_t *_Restrict_ preg,
524 char *_Restrict_ errbuf, size_t errbuf_size)
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
541 msg_size = strlen (msg) + 1; /* Includes the null. */
543 if (BE (errbuf_size != 0, 1))
545 size_t cpy_size = msg_size;
546 if (BE (msg_size > errbuf_size, 0))
548 cpy_size = errbuf_size - 1;
549 errbuf[cpy_size] = '\0';
551 memcpy (errbuf, msg, cpy_size);
557 weak_alias (__regerror, regerror)
561 #ifdef RE_ENABLE_I18N
562 /* This static array is used for the map to single-byte characters when
563 UTF-8 is used. Otherwise we would allocate memory just to initialize
564 it the same all the time. UTF-8 is the preferred encoding so this is
565 a worthwhile optimization. */
566 static const bitset_t utf8_sb_map =
568 /* Set the first 128 bits. */
569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
570 # error "bitset_word_t is narrower than 32 bits"
571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
572 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX, BITSET_WORD_MAX,
575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
579 >> (SBC_MAX % BITSET_WORD_BITS == 0
581 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 free_dfa_content (re_dfa_t *dfa)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
615 re_dfastate_t *state = entry->array[j];
618 re_free (entry->array);
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
625 re_free (dfa->subexp_map);
627 re_free (dfa->re_str);
634 /* Free dynamically allocated space used by PREG. */
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
649 re_free (preg->translate);
650 preg->translate = NULL;
653 weak_alias (__regfree, regfree)
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
659 #if defined _REGEX_RE_COMP || defined _LIBC
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
684 if (re_comp_buf.buffer)
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
693 if (re_comp_buf.fastmap == NULL)
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
717 libc_freeres_fn (free_mem)
719 __regfree (&re_comp_buf);
723 #endif /* _REGEX_RE_COMP */
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
733 reg_errcode_t err = REG_NOERROR;
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
760 preg->used = sizeof (re_dfa_t);
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
765 free_dfa_content (dfa);
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
776 __libc_lock_init (dfa->lock);
778 err = re_string_construct (®exp, pattern, length, preg->translate,
779 syntax & RE_ICASE, dfa);
780 if (BE (err != REG_NOERROR, 0))
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (®exp);
785 free_dfa_content (dfa);
791 /* Parse the regular expression, and build a structure tree. */
793 dfa->str_tree = parse (®exp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (®exp);
815 if (BE (err != REG_NOERROR, 0))
817 free_dfa_content (dfa);
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
831 __re_size_t table_size;
835 #ifdef RE_ENABLE_I18N
836 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
838 size_t max_i18n_object_size = 0;
840 size_t max_object_size =
841 MAX (sizeof (struct re_state_table_entry),
842 MAX (sizeof (re_token_t),
843 MAX (sizeof (re_node_set),
844 MAX (sizeof (regmatch_t),
845 max_i18n_object_size))));
847 memset (dfa, '\0', sizeof (re_dfa_t));
849 /* Force allocation of str_tree_storage the first time. */
850 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
852 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
853 calculation below, and for similar doubling calculations
854 elsewhere. And it's <= rather than <, because some of the
855 doubling calculations add 1 afterwards. */
856 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
859 dfa->nodes_alloc = pat_len + 1;
860 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
862 /* table_size = 2 ^ ceil(log pat_len) */
863 for (table_size = 1; ; table_size <<= 1)
864 if (table_size > pat_len)
867 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868 dfa->state_hash_mask = table_size - 1;
870 dfa->mb_cur_max = MB_CUR_MAX;
872 if (dfa->mb_cur_max == 6
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
878 # ifdef HAVE_LANGINFO_CODESET
879 codeset_name = nl_langinfo (CODESET);
881 codeset_name = getenv ("LC_ALL");
882 if (codeset_name == NULL || codeset_name[0] == '\0')
883 codeset_name = getenv ("LC_CTYPE");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LANG");
886 if (codeset_name == NULL)
888 else if (strchr (codeset_name, '.') != NULL)
889 codeset_name = strchr (codeset_name, '.') + 1;
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 if (BE (dfa->sb_char == NULL, 0))
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
918 wint_t wch = __btowc (ch);
920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
922 if (isascii (ch) && wch != ch)
923 dfa->map_notascii = 1;
930 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
935 /* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
941 init_word_char (re_dfa_t *dfa)
944 dfa->word_ops_used = 1;
945 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
946 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
947 if (isalnum (ch) || ch == '_')
948 dfa->word_char[i] |= (bitset_word_t) 1 << j;
951 /* Free the work area which are only used while compiling. */
954 free_workarea_compile (regex_t *preg)
956 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
957 bin_tree_storage_t *storage, *next;
958 for (storage = dfa->str_tree_storage; storage; storage = next)
960 next = storage->next;
963 dfa->str_tree_storage = NULL;
964 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
965 dfa->str_tree = NULL;
966 re_free (dfa->org_indices);
967 dfa->org_indices = NULL;
970 /* Create initial states for all contexts. */
973 create_initial_state (re_dfa_t *dfa)
977 re_node_set init_nodes;
979 /* Initial states have the epsilon closure of the node which is
980 the first node of the regular expression. */
981 first = dfa->str_tree->first->node_idx;
982 dfa->init_node = first;
983 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
984 if (BE (err != REG_NOERROR, 0))
987 /* The back-references which are in initial states can epsilon transit,
988 since in this case all of the subexpressions can be null.
989 Then we add epsilon closures of the nodes which are the next nodes of
990 the back-references. */
991 if (dfa->nbackref > 0)
992 for (i = 0; i < init_nodes.nelem; ++i)
994 Idx node_idx = init_nodes.elems[i];
995 re_token_type_t type = dfa->nodes[node_idx].type;
998 if (type != OP_BACK_REF)
1000 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002 re_token_t *clexp_node;
1003 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1004 if (clexp_node->type == OP_CLOSE_SUBEXP
1005 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1008 if (clexp_idx == init_nodes.nelem)
1011 if (type == OP_BACK_REF)
1013 Idx dest_idx = dfa->edests[node_idx].elems[0];
1014 if (!re_node_set_contains (&init_nodes, dest_idx))
1016 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1022 /* It must be the first time to invoke acquire_state. */
1023 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1024 /* We don't check ERR here, since the initial state must not be NULL. */
1025 if (BE (dfa->init_state == NULL, 0))
1027 if (dfa->init_state->has_constraint)
1029 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1038 || dfa->init_state_begbuf == NULL, 0))
1042 dfa->init_state_word = dfa->init_state_nl
1043 = dfa->init_state_begbuf = dfa->init_state;
1045 re_node_set_free (&init_nodes);
1049 #ifdef RE_ENABLE_I18N
1050 /* If it is possible to do searching in single byte encoding instead of UTF-8
1051 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1052 DFA nodes where needed. */
1055 optimize_utf8 (re_dfa_t *dfa)
1059 bool mb_chars = false;
1060 bool has_period = false;
1062 for (node = 0; node < dfa->nodes_len; ++node)
1063 switch (dfa->nodes[node].type)
1066 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1070 switch (dfa->nodes[node].opr.idx)
1078 /* Word anchors etc. cannot be handled. */
1088 case OP_DUP_ASTERISK:
1089 case OP_OPEN_SUBEXP:
1090 case OP_CLOSE_SUBEXP:
1092 case COMPLEX_BRACKET:
1094 case SIMPLE_BRACKET:
1095 /* Just double check. */
1097 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1099 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1100 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1102 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1112 if (mb_chars || has_period)
1113 for (node = 0; node < dfa->nodes_len; ++node)
1115 if (dfa->nodes[node].type == CHARACTER
1116 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1117 dfa->nodes[node].mb_partial = 0;
1118 else if (dfa->nodes[node].type == OP_PERIOD)
1119 dfa->nodes[node].type = OP_UTF8_PERIOD;
1122 /* The search can be in single byte locale. */
1123 dfa->mb_cur_max = 1;
1125 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1129 /* Analyze the structure tree, and calculate "first", "next", "edest",
1130 "eclosure", and "inveclosure". */
1132 static reg_errcode_t
1133 analyze (regex_t *preg)
1135 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1138 /* Allocate arrays. */
1139 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1140 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1141 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1142 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1143 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1144 || dfa->eclosures == NULL, 0))
1147 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1148 if (dfa->subexp_map != NULL)
1151 for (i = 0; i < preg->re_nsub; i++)
1152 dfa->subexp_map[i] = i;
1153 preorder (dfa->str_tree, optimize_subexps, dfa);
1154 for (i = 0; i < preg->re_nsub; i++)
1155 if (dfa->subexp_map[i] != i)
1157 if (i == preg->re_nsub)
1159 free (dfa->subexp_map);
1160 dfa->subexp_map = NULL;
1164 ret = postorder (dfa->str_tree, lower_subexps, preg);
1165 if (BE (ret != REG_NOERROR, 0))
1167 ret = postorder (dfa->str_tree, calc_first, dfa);
1168 if (BE (ret != REG_NOERROR, 0))
1170 preorder (dfa->str_tree, calc_next, dfa);
1171 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1172 if (BE (ret != REG_NOERROR, 0))
1174 ret = calc_eclosure (dfa);
1175 if (BE (ret != REG_NOERROR, 0))
1178 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1179 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1180 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1183 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1184 if (BE (dfa->inveclosures == NULL, 0))
1186 ret = calc_inveclosure (dfa);
1192 /* Our parse trees are very unbalanced, so we cannot use a stack to
1193 implement parse tree visits. Instead, we use parent pointers and
1194 some hairy code in these two functions. */
1195 static reg_errcode_t
1196 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1199 bin_tree_t *node, *prev;
1201 for (node = root; ; )
1203 /* Descend down the tree, preferably to the left (or to the right
1204 if that's the only child). */
1205 while (node->left || node->right)
1213 reg_errcode_t err = fn (extra, node);
1214 if (BE (err != REG_NOERROR, 0))
1216 if (node->parent == NULL)
1219 node = node->parent;
1221 /* Go up while we have a node that is reached from the right. */
1222 while (node->right == prev || node->right == NULL);
1227 static reg_errcode_t
1228 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1233 for (node = root; ; )
1235 reg_errcode_t err = fn (extra, node);
1236 if (BE (err != REG_NOERROR, 0))
1239 /* Go to the left node, or up and to the right. */
1244 bin_tree_t *prev = NULL;
1245 while (node->right == prev || node->right == NULL)
1248 node = node->parent;
1257 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1258 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1259 backreferences as well. Requires a preorder visit. */
1260 static reg_errcode_t
1261 optimize_subexps (void *extra, bin_tree_t *node)
1263 re_dfa_t *dfa = (re_dfa_t *) extra;
1265 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1267 int idx = node->token.opr.idx;
1268 node->token.opr.idx = dfa->subexp_map[idx];
1269 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1272 else if (node->token.type == SUBEXP
1273 && node->left && node->left->token.type == SUBEXP)
1275 Idx other_idx = node->left->token.opr.idx;
1277 node->left = node->left->left;
1279 node->left->parent = node;
1281 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1282 if (other_idx < BITSET_WORD_BITS)
1283 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1289 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1290 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1291 static reg_errcode_t
1292 lower_subexps (void *extra, bin_tree_t *node)
1294 regex_t *preg = (regex_t *) extra;
1295 reg_errcode_t err = REG_NOERROR;
1297 if (node->left && node->left->token.type == SUBEXP)
1299 node->left = lower_subexp (&err, preg, node->left);
1301 node->left->parent = node;
1303 if (node->right && node->right->token.type == SUBEXP)
1305 node->right = lower_subexp (&err, preg, node->right);
1307 node->right->parent = node;
1314 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1316 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1317 bin_tree_t *body = node->left;
1318 bin_tree_t *op, *cls, *tree1, *tree;
1321 /* We do not optimize empty subexpressions, because otherwise we may
1322 have bad CONCAT nodes with NULL children. This is obviously not
1323 very common, so we do not lose much. An example that triggers
1324 this case is the sed "script" /\(\)/x. */
1325 && node->left != NULL
1326 && (node->token.opr.idx >= BITSET_WORD_BITS
1327 || !(dfa->used_bkref_map
1328 & ((bitset_word_t) 1 << node->token.opr.idx))))
1331 /* Convert the SUBEXP node to the concatenation of an
1332 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1333 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1334 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1335 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1336 tree = create_tree (dfa, op, tree1, CONCAT);
1337 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1343 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1344 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1348 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1349 nodes. Requires a postorder visit. */
1350 static reg_errcode_t
1351 calc_first (void *extra, bin_tree_t *node)
1353 re_dfa_t *dfa = (re_dfa_t *) extra;
1354 if (node->token.type == CONCAT)
1356 node->first = node->left->first;
1357 node->node_idx = node->left->node_idx;
1362 node->node_idx = re_dfa_add_node (dfa, node->token);
1363 if (BE (node->node_idx == REG_MISSING, 0))
1369 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1370 static reg_errcode_t
1371 calc_next (void *extra, bin_tree_t *node)
1373 switch (node->token.type)
1375 case OP_DUP_ASTERISK:
1376 node->left->next = node;
1379 node->left->next = node->right->first;
1380 node->right->next = node->next;
1384 node->left->next = node->next;
1386 node->right->next = node->next;
1392 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1393 static reg_errcode_t
1394 link_nfa_nodes (void *extra, bin_tree_t *node)
1396 re_dfa_t *dfa = (re_dfa_t *) extra;
1397 Idx idx = node->node_idx;
1398 reg_errcode_t err = REG_NOERROR;
1400 switch (node->token.type)
1406 assert (node->next == NULL);
1409 case OP_DUP_ASTERISK:
1413 dfa->has_plural_match = 1;
1414 if (node->left != NULL)
1415 left = node->left->first->node_idx;
1417 left = node->next->node_idx;
1418 if (node->right != NULL)
1419 right = node->right->first->node_idx;
1421 right = node->next->node_idx;
1422 assert (REG_VALID_INDEX (left));
1423 assert (REG_VALID_INDEX (right));
1424 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1429 case OP_OPEN_SUBEXP:
1430 case OP_CLOSE_SUBEXP:
1431 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1435 dfa->nexts[idx] = node->next->node_idx;
1436 if (node->token.type == OP_BACK_REF)
1437 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1441 assert (!IS_EPSILON_NODE (node->token.type));
1442 dfa->nexts[idx] = node->next->node_idx;
1449 /* Duplicate the epsilon closure of the node ROOT_NODE.
1450 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1451 to their own constraint. */
1453 static reg_errcode_t
1455 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1456 Idx root_node, unsigned int init_constraint)
1458 Idx org_node, clone_node;
1460 unsigned int constraint = init_constraint;
1461 for (org_node = top_org_node, clone_node = top_clone_node;;)
1463 Idx org_dest, clone_dest;
1464 if (dfa->nodes[org_node].type == OP_BACK_REF)
1466 /* If the back reference epsilon-transit, its destination must
1467 also have the constraint. Then duplicate the epsilon closure
1468 of the destination of the back reference, and store it in
1469 edests of the back reference. */
1470 org_dest = dfa->nexts[org_node];
1471 re_node_set_empty (dfa->edests + clone_node);
1472 clone_dest = duplicate_node (dfa, org_dest, constraint);
1473 if (BE (clone_dest == REG_MISSING, 0))
1475 dfa->nexts[clone_node] = dfa->nexts[org_node];
1476 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1480 else if (dfa->edests[org_node].nelem == 0)
1482 /* In case of the node can't epsilon-transit, don't duplicate the
1483 destination and store the original destination as the
1484 destination of the node. */
1485 dfa->nexts[clone_node] = dfa->nexts[org_node];
1488 else if (dfa->edests[org_node].nelem == 1)
1490 /* In case of the node can epsilon-transit, and it has only one
1492 org_dest = dfa->edests[org_node].elems[0];
1493 re_node_set_empty (dfa->edests + clone_node);
1494 if (dfa->nodes[org_node].type == ANCHOR)
1496 /* In case of the node has another constraint, append it. */
1497 if (org_node == root_node && clone_node != org_node)
1499 /* ...but if the node is root_node itself, it means the
1500 epsilon closure have a loop, then tie it to the
1501 destination of the root_node. */
1502 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507 constraint |= dfa->nodes[org_node].opr.ctx_type;
1509 clone_dest = duplicate_node (dfa, org_dest, constraint);
1510 if (BE (clone_dest == REG_MISSING, 0))
1512 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 else /* dfa->edests[org_node].nelem == 2 */
1518 /* In case of the node can epsilon-transit, and it has two
1519 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1520 org_dest = dfa->edests[org_node].elems[0];
1521 re_node_set_empty (dfa->edests + clone_node);
1522 /* Search for a duplicated node which satisfies the constraint. */
1523 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1524 if (clone_dest == REG_MISSING)
1526 /* There are no such a duplicated node, create a new one. */
1528 clone_dest = duplicate_node (dfa, org_dest, constraint);
1529 if (BE (clone_dest == REG_MISSING, 0))
1531 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1535 root_node, constraint);
1536 if (BE (err != REG_NOERROR, 0))
1541 /* There are a duplicated node which satisfy the constraint,
1542 use it to avoid infinite loop. */
1543 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 org_dest = dfa->edests[org_node].elems[1];
1549 clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 if (BE (clone_dest == REG_MISSING, 0))
1552 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 org_node = org_dest;
1557 clone_node = clone_dest;
1562 /* Search for a node which is duplicated from the node ORG_NODE, and
1563 satisfies the constraint CONSTRAINT. */
1566 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1567 unsigned int constraint)
1570 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1572 if (org_node == dfa->org_indices[idx]
1573 && constraint == dfa->nodes[idx].constraint)
1574 return idx; /* Found. */
1576 return REG_MISSING; /* Not found. */
1579 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1580 Return the index of the new node, or REG_MISSING if insufficient storage is
1584 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1586 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1587 if (BE (dup_idx != REG_MISSING, 1))
1589 dfa->nodes[dup_idx].constraint = constraint;
1590 if (dfa->nodes[org_idx].type == ANCHOR)
1591 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1592 dfa->nodes[dup_idx].duplicated = 1;
1594 /* Store the index of the original node. */
1595 dfa->org_indices[dup_idx] = org_idx;
1600 static reg_errcode_t
1601 calc_inveclosure (re_dfa_t *dfa)
1605 for (idx = 0; idx < dfa->nodes_len; ++idx)
1606 re_node_set_init_empty (dfa->inveclosures + idx);
1608 for (src = 0; src < dfa->nodes_len; ++src)
1610 Idx *elems = dfa->eclosures[src].elems;
1611 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1613 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622 /* Calculate "eclosure" for all the node in DFA. */
1624 static reg_errcode_t
1625 calc_eclosure (re_dfa_t *dfa)
1630 assert (dfa->nodes_len > 0);
1633 /* For each nodes, calculate epsilon closure. */
1634 for (node_idx = 0; ; ++node_idx)
1637 re_node_set eclosure_elem;
1638 if (node_idx == dfa->nodes_len)
1647 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1650 /* If we have already calculated, skip it. */
1651 if (dfa->eclosures[node_idx].nelem != 0)
1653 /* Calculate epsilon closure of `node_idx'. */
1654 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1655 if (BE (err != REG_NOERROR, 0))
1658 if (dfa->eclosures[node_idx].nelem == 0)
1661 re_node_set_free (&eclosure_elem);
1667 /* Calculate epsilon closure of NODE. */
1669 static reg_errcode_t
1670 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1673 unsigned int constraint;
1677 re_node_set eclosure;
1679 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1680 if (BE (err != REG_NOERROR, 0))
1683 /* This indicates that we are calculating this node now.
1684 We reference this value to avoid infinite loop. */
1685 dfa->eclosures[node].nelem = REG_MISSING;
1687 constraint = ((dfa->nodes[node].type == ANCHOR)
1688 ? dfa->nodes[node].opr.ctx_type : 0);
1689 /* If the current node has constraints, duplicate all nodes.
1690 Since they must inherit the constraints. */
1692 && dfa->edests[node].nelem
1693 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1695 err = duplicate_node_closure (dfa, node, node, node, constraint);
1696 if (BE (err != REG_NOERROR, 0))
1700 /* Expand each epsilon destination nodes. */
1701 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1702 for (i = 0; i < dfa->edests[node].nelem; ++i)
1704 re_node_set eclosure_elem;
1705 Idx edest = dfa->edests[node].elems[i];
1706 /* If calculating the epsilon closure of `edest' is in progress,
1707 return intermediate result. */
1708 if (dfa->eclosures[edest].nelem == REG_MISSING)
1713 /* If we haven't calculated the epsilon closure of `edest' yet,
1714 calculate now. Otherwise use calculated epsilon closure. */
1715 if (dfa->eclosures[edest].nelem == 0)
1717 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1718 if (BE (err != REG_NOERROR, 0))
1722 eclosure_elem = dfa->eclosures[edest];
1723 /* Merge the epsilon closure of `edest'. */
1724 re_node_set_merge (&eclosure, &eclosure_elem);
1725 /* If the epsilon closure of `edest' is incomplete,
1726 the epsilon closure of this node is also incomplete. */
1727 if (dfa->eclosures[edest].nelem == 0)
1730 re_node_set_free (&eclosure_elem);
1734 /* Epsilon closures include itself. */
1735 ok = re_node_set_insert (&eclosure, node);
1738 if (incomplete && !root)
1739 dfa->eclosures[node].nelem = 0;
1741 dfa->eclosures[node] = eclosure;
1742 *new_set = eclosure;
1746 /* Functions for token which are used in the parser. */
1748 /* Fetch a token from INPUT.
1749 We must not use this function inside bracket expressions. */
1753 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1755 re_string_skip_bytes (input, peek_token (result, input, syntax));
1758 /* Peek a token from INPUT, and return the length of the token.
1759 We must not use this function inside bracket expressions. */
1763 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 if (re_string_eoi (input))
1769 token->type = END_OF_RE;
1773 c = re_string_peek_byte (input, 0);
1776 token->word_char = 0;
1777 #ifdef RE_ENABLE_I18N
1778 token->mb_partial = 0;
1779 if (input->mb_cur_max > 1 &&
1780 !re_string_first_byte (input, re_string_cur_idx (input)))
1782 token->type = CHARACTER;
1783 token->mb_partial = 1;
1790 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1792 token->type = BACK_SLASH;
1796 c2 = re_string_peek_byte_case (input, 1);
1798 token->type = CHARACTER;
1799 #ifdef RE_ENABLE_I18N
1800 if (input->mb_cur_max > 1)
1802 wint_t wc = re_string_wchar_at (input,
1803 re_string_cur_idx (input) + 1);
1804 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 token->word_char = IS_WORD_CHAR (c2) != 0;
1813 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1814 token->type = OP_ALT;
1816 case '1': case '2': case '3': case '4': case '5':
1817 case '6': case '7': case '8': case '9':
1818 if (!(syntax & RE_NO_BK_REFS))
1820 token->type = OP_BACK_REF;
1821 token->opr.idx = c2 - '1';
1825 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = ANCHOR;
1828 token->opr.ctx_type = WORD_FIRST;
1832 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = ANCHOR;
1835 token->opr.ctx_type = WORD_LAST;
1839 if (!(syntax & RE_NO_GNU_OPS))
1841 token->type = ANCHOR;
1842 token->opr.ctx_type = WORD_DELIM;
1846 if (!(syntax & RE_NO_GNU_OPS))
1848 token->type = ANCHOR;
1849 token->opr.ctx_type = NOT_WORD_DELIM;
1853 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = OP_WORD;
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = OP_NOTWORD;
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_SPACE;
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_NOTSPACE;
1869 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = ANCHOR;
1872 token->opr.ctx_type = BUF_FIRST;
1876 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = ANCHOR;
1879 token->opr.ctx_type = BUF_LAST;
1883 if (!(syntax & RE_NO_BK_PARENS))
1884 token->type = OP_OPEN_SUBEXP;
1887 if (!(syntax & RE_NO_BK_PARENS))
1888 token->type = OP_CLOSE_SUBEXP;
1891 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892 token->type = OP_DUP_PLUS;
1895 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896 token->type = OP_DUP_QUESTION;
1899 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900 token->type = OP_OPEN_DUP_NUM;
1903 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904 token->type = OP_CLOSE_DUP_NUM;
1912 token->type = CHARACTER;
1913 #ifdef RE_ENABLE_I18N
1914 if (input->mb_cur_max > 1)
1916 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1917 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921 token->word_char = IS_WORD_CHAR (token->opr.c);
1926 if (syntax & RE_NEWLINE_ALT)
1927 token->type = OP_ALT;
1930 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1931 token->type = OP_ALT;
1934 token->type = OP_DUP_ASTERISK;
1937 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938 token->type = OP_DUP_PLUS;
1941 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942 token->type = OP_DUP_QUESTION;
1945 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946 token->type = OP_OPEN_DUP_NUM;
1949 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950 token->type = OP_CLOSE_DUP_NUM;
1953 if (syntax & RE_NO_BK_PARENS)
1954 token->type = OP_OPEN_SUBEXP;
1957 if (syntax & RE_NO_BK_PARENS)
1958 token->type = OP_CLOSE_SUBEXP;
1961 token->type = OP_OPEN_BRACKET;
1964 token->type = OP_PERIOD;
1967 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1968 re_string_cur_idx (input) != 0)
1970 char prev = re_string_peek_byte (input, -1);
1971 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974 token->type = ANCHOR;
1975 token->opr.ctx_type = LINE_FIRST;
1978 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1979 re_string_cur_idx (input) + 1 != re_string_length (input))
1982 re_string_skip_bytes (input, 1);
1983 peek_token (&next, input, syntax);
1984 re_string_skip_bytes (input, -1);
1985 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988 token->type = ANCHOR;
1989 token->opr.ctx_type = LINE_LAST;
1997 /* Peek a token from INPUT, and return the length of the token.
1998 We must not use this function out of bracket expressions. */
2002 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2005 if (re_string_eoi (input))
2007 token->type = END_OF_RE;
2010 c = re_string_peek_byte (input, 0);
2013 #ifdef RE_ENABLE_I18N
2014 if (input->mb_cur_max > 1 &&
2015 !re_string_first_byte (input, re_string_cur_idx (input)))
2017 token->type = CHARACTER;
2020 #endif /* RE_ENABLE_I18N */
2022 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023 && re_string_cur_idx (input) + 1 < re_string_length (input))
2025 /* In this case, '\' escape a character. */
2027 re_string_skip_bytes (input, 1);
2028 c2 = re_string_peek_byte (input, 0);
2030 token->type = CHARACTER;
2033 if (c == '[') /* '[' is a special char in a bracket exps. */
2037 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038 c2 = re_string_peek_byte (input, 1);
2046 token->type = OP_OPEN_COLL_ELEM;
2049 token->type = OP_OPEN_EQUIV_CLASS;
2052 if (syntax & RE_CHAR_CLASSES)
2054 token->type = OP_OPEN_CHAR_CLASS;
2057 /* else fall through. */
2059 token->type = CHARACTER;
2069 token->type = OP_CHARSET_RANGE;
2072 token->type = OP_CLOSE_BRACKET;
2075 token->type = OP_NON_MATCH_LIST;
2078 token->type = CHARACTER;
2083 /* Functions for parser. */
2085 /* Entry point of the parser.
2086 Parse the regular expression REGEXP and return the structure tree.
2087 If an error is occured, ERR is set by error code, and return NULL.
2088 This function build the following tree, from regular expression <reg_exp>:
2094 CAT means concatenation.
2095 EOR means end of regular expression. */
2098 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2101 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2102 bin_tree_t *tree, *eor, *root;
2103 re_token_t current_token;
2104 dfa->syntax = syntax;
2105 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2106 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2107 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2111 root = create_tree (dfa, tree, eor, CONCAT);
2114 if (BE (eor == NULL || root == NULL, 0))
2122 /* This function build the following tree, from regular expression
2123 <branch1>|<branch2>:
2129 ALT means alternative, which represents the operator `|'. */
2132 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2135 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2136 bin_tree_t *tree, *branch = NULL;
2137 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2138 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141 while (token->type == OP_ALT)
2143 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2144 if (token->type != OP_ALT && token->type != END_OF_RE
2145 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2147 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2153 tree = create_tree (dfa, tree, branch, OP_ALT);
2154 if (BE (tree == NULL, 0))
2163 /* This function build the following tree, from regular expression
2170 CAT means concatenation. */
2173 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2176 bin_tree_t *tree, *expr;
2177 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2178 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2179 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182 while (token->type != OP_ALT && token->type != END_OF_RE
2183 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2185 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2190 if (tree != NULL && expr != NULL)
2192 tree = create_tree (dfa, tree, expr, CONCAT);
2199 else if (tree == NULL)
2201 /* Otherwise expr == NULL, we don't need to create new tree. */
2206 /* This function build the following tree, from regular expression a*:
2213 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2214 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218 switch (token->type)
2221 tree = create_token_tree (dfa, NULL, NULL, token);
2222 if (BE (tree == NULL, 0))
2227 #ifdef RE_ENABLE_I18N
2228 if (dfa->mb_cur_max > 1)
2230 while (!re_string_eoi (regexp)
2231 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2233 bin_tree_t *mbc_remain;
2234 fetch_token (token, regexp, syntax);
2235 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2236 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2237 if (BE (mbc_remain == NULL || tree == NULL, 0))
2246 case OP_OPEN_SUBEXP:
2247 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2248 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2251 case OP_OPEN_BRACKET:
2252 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2253 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2262 dfa->used_bkref_map |= 1 << token->opr.idx;
2263 tree = create_token_tree (dfa, NULL, NULL, token);
2264 if (BE (tree == NULL, 0))
2270 dfa->has_mb_node = 1;
2272 case OP_OPEN_DUP_NUM:
2273 if (syntax & RE_CONTEXT_INVALID_DUP)
2279 case OP_DUP_ASTERISK:
2281 case OP_DUP_QUESTION:
2282 if (syntax & RE_CONTEXT_INVALID_OPS)
2287 else if (syntax & RE_CONTEXT_INDEP_OPS)
2289 fetch_token (token, regexp, syntax);
2290 return parse_expression (regexp, preg, token, syntax, nest, err);
2292 /* else fall through */
2293 case OP_CLOSE_SUBEXP:
2294 if ((token->type == OP_CLOSE_SUBEXP) &&
2295 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2300 /* else fall through */
2301 case OP_CLOSE_DUP_NUM:
2302 /* We treat it as a normal character. */
2304 /* Then we can these characters as normal characters. */
2305 token->type = CHARACTER;
2306 /* mb_partial and word_char bits should be initialized already
2308 tree = create_token_tree (dfa, NULL, NULL, token);
2309 if (BE (tree == NULL, 0))
2316 if ((token->opr.ctx_type
2317 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2318 && dfa->word_ops_used == 0)
2319 init_word_char (dfa);
2320 if (token->opr.ctx_type == WORD_DELIM
2321 || token->opr.ctx_type == NOT_WORD_DELIM)
2323 bin_tree_t *tree_first, *tree_last;
2324 if (token->opr.ctx_type == WORD_DELIM)
2326 token->opr.ctx_type = WORD_FIRST;
2327 tree_first = create_token_tree (dfa, NULL, NULL, token);
2328 token->opr.ctx_type = WORD_LAST;
2332 token->opr.ctx_type = INSIDE_WORD;
2333 tree_first = create_token_tree (dfa, NULL, NULL, token);
2334 token->opr.ctx_type = INSIDE_NOTWORD;
2336 tree_last = create_token_tree (dfa, NULL, NULL, token);
2337 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2338 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2346 tree = create_token_tree (dfa, NULL, NULL, token);
2347 if (BE (tree == NULL, 0))
2353 /* We must return here, since ANCHORs can't be followed
2354 by repetition operators.
2355 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2357 fetch_token (token, regexp, syntax);
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2366 if (dfa->mb_cur_max > 1)
2367 dfa->has_mb_node = 1;
2371 tree = build_charclass_op (dfa, regexp->trans,
2372 (const unsigned char *) "alnum",
2373 (const unsigned char *) "_",
2374 token->type == OP_NOTWORD, err);
2375 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380 tree = build_charclass_op (dfa, regexp->trans,
2381 (const unsigned char *) "space",
2382 (const unsigned char *) "",
2383 token->type == OP_NOTSPACE, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 /* Must not happen? */
2400 fetch_token (token, regexp, syntax);
2402 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2403 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2405 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2406 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2408 /* In BRE consecutive duplications are not allowed. */
2409 if ((syntax & RE_CONTEXT_INVALID_DUP)
2410 && (token->type == OP_DUP_ASTERISK
2411 || token->type == OP_OPEN_DUP_NUM))
2421 /* This function build the following tree, from regular expression
2429 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2430 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2432 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2435 cur_nsub = preg->re_nsub++;
2437 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2439 /* The subexpression may be a null string. */
2440 if (token->type == OP_CLOSE_SUBEXP)
2444 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2445 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2447 if (BE (*err != REG_NOERROR, 0))
2451 if (cur_nsub <= '9' - '1')
2452 dfa->completed_bkref_map |= 1 << cur_nsub;
2454 tree = create_tree (dfa, tree, NULL, SUBEXP);
2455 if (BE (tree == NULL, 0))
2460 tree->token.opr.idx = cur_nsub;
2464 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2467 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2468 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2470 bin_tree_t *tree = NULL, *old_tree = NULL;
2471 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2472 re_token_t start_token = *token;
2474 if (token->type == OP_OPEN_DUP_NUM)
2477 start = fetch_number (regexp, token, syntax);
2478 if (start == REG_MISSING)
2480 if (token->type == CHARACTER && token->opr.c == ',')
2481 start = 0; /* We treat "{,m}" as "{0,m}". */
2484 *err = REG_BADBR; /* <re>{} is invalid. */
2488 if (BE (start != REG_ERROR, 1))
2490 /* We treat "{n}" as "{n,n}". */
2491 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2492 : ((token->type == CHARACTER && token->opr.c == ',')
2493 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2495 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2497 /* Invalid sequence. */
2498 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2500 if (token->type == END_OF_RE)
2508 /* If the syntax bit is set, rollback. */
2509 re_string_set_index (regexp, start_idx);
2510 *token = start_token;
2511 token->type = CHARACTER;
2512 /* mb_partial and word_char bits should be already initialized by
2517 if (BE (end != REG_MISSING && start > end, 0))
2519 /* First number greater than second. */
2526 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2527 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2530 fetch_token (token, regexp, syntax);
2532 if (BE (elem == NULL, 0))
2534 if (BE (start == 0 && end == 0, 0))
2536 postorder (elem, free_tree, NULL);
2540 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2541 if (BE (start > 0, 0))
2544 for (i = 2; i <= start; ++i)
2546 elem = duplicate_tree (elem, dfa);
2547 tree = create_tree (dfa, tree, elem, CONCAT);
2548 if (BE (elem == NULL || tree == NULL, 0))
2549 goto parse_dup_op_espace;
2555 /* Duplicate ELEM before it is marked optional. */
2556 elem = duplicate_tree (elem, dfa);
2562 if (elem->token.type == SUBEXP)
2563 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2565 tree = create_tree (dfa, elem, NULL,
2566 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2567 if (BE (tree == NULL, 0))
2568 goto parse_dup_op_espace;
2570 /* This loop is actually executed only when end != REG_MISSING,
2571 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2572 already created the start+1-th copy. */
2573 if ((Idx) -1 < 0 || end != REG_MISSING)
2574 for (i = start + 2; i <= end; ++i)
2576 elem = duplicate_tree (elem, dfa);
2577 tree = create_tree (dfa, tree, elem, CONCAT);
2578 if (BE (elem == NULL || tree == NULL, 0))
2579 goto parse_dup_op_espace;
2581 tree = create_tree (dfa, tree, NULL, OP_ALT);
2582 if (BE (tree == NULL, 0))
2583 goto parse_dup_op_espace;
2587 tree = create_tree (dfa, old_tree, tree, CONCAT);
2591 parse_dup_op_espace:
2596 /* Size of the names for collating symbol/equivalence_class/character_class.
2597 I'm not sure, but maybe enough. */
2598 #define BRACKET_NAME_BUF_SIZE 32
2601 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2602 Build the range expression which starts from START_ELEM, and ends
2603 at END_ELEM. The result are written to MBCSET and SBCSET.
2604 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2605 mbcset->range_ends, is a pointer argument sinse we may
2608 static reg_errcode_t
2610 # ifdef RE_ENABLE_I18N
2611 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2612 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2613 # else /* not RE_ENABLE_I18N */
2614 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2615 bracket_elem_t *end_elem)
2616 # endif /* not RE_ENABLE_I18N */
2618 unsigned int start_ch, end_ch;
2619 /* Equivalence Classes and Character Classes can't be a range start/end. */
2620 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2621 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2625 /* We can handle no multi character collating elements without libc
2627 if (BE ((start_elem->type == COLL_SYM
2628 && strlen ((char *) start_elem->opr.name) > 1)
2629 || (end_elem->type == COLL_SYM
2630 && strlen ((char *) end_elem->opr.name) > 1), 0))
2631 return REG_ECOLLATE;
2633 # ifdef RE_ENABLE_I18N
2638 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2640 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2641 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2643 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2644 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2646 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2647 ? __btowc (start_ch) : start_elem->opr.wch);
2648 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2649 ? __btowc (end_ch) : end_elem->opr.wch);
2650 if (start_wc == WEOF || end_wc == WEOF)
2651 return REG_ECOLLATE;
2652 cmp_buf[0] = start_wc;
2653 cmp_buf[4] = end_wc;
2654 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2657 /* Got valid collation sequence values, add them as a new entry.
2658 However, for !_LIBC we have no collation elements: if the
2659 character set is single byte, the single byte character set
2660 that we build below suffices. parse_bracket_exp passes
2661 no MBCSET if dfa->mb_cur_max == 1. */
2664 /* Check the space of the arrays. */
2665 if (BE (*range_alloc == mbcset->nranges, 0))
2667 /* There is not enough space, need realloc. */
2668 wchar_t *new_array_start, *new_array_end;
2671 /* +1 in case of mbcset->nranges is 0. */
2672 new_nranges = 2 * mbcset->nranges + 1;
2673 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2674 are NULL if *range_alloc == 0. */
2675 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2677 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2680 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2683 mbcset->range_starts = new_array_start;
2684 mbcset->range_ends = new_array_end;
2685 *range_alloc = new_nranges;
2688 mbcset->range_starts[mbcset->nranges] = start_wc;
2689 mbcset->range_ends[mbcset->nranges++] = end_wc;
2692 /* Build the table for single byte characters. */
2693 for (wc = 0; wc < SBC_MAX; ++wc)
2696 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2697 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2698 bitset_set (sbcset, wc);
2701 # else /* not RE_ENABLE_I18N */
2704 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2705 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2707 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2708 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2710 if (start_ch > end_ch)
2712 /* Build the table for single byte characters. */
2713 for (ch = 0; ch < SBC_MAX; ++ch)
2714 if (start_ch <= ch && ch <= end_ch)
2715 bitset_set (sbcset, ch);
2717 # endif /* not RE_ENABLE_I18N */
2720 #endif /* not _LIBC */
2723 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2724 Build the collating element which is represented by NAME.
2725 The result are written to MBCSET and SBCSET.
2726 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2727 pointer argument since we may update it. */
2729 static reg_errcode_t
2731 build_collating_symbol (bitset_t sbcset,
2732 # ifdef RE_ENABLE_I18N
2733 re_charset_t *mbcset, Idx *coll_sym_alloc,
2735 const unsigned char *name)
2737 size_t name_len = strlen ((const char *) name);
2738 if (BE (name_len != 1, 0))
2739 return REG_ECOLLATE;
2742 bitset_set (sbcset, name[0]);
2746 #endif /* not _LIBC */
2748 /* This function parse bracket expression like "[abc]", "[a-c]",
2752 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2753 reg_syntax_t syntax, reg_errcode_t *err)
2756 const unsigned char *collseqmb;
2757 const char *collseqwc;
2760 const int32_t *symb_table;
2761 const unsigned char *extra;
2763 /* Local function for parse_bracket_exp used in _LIBC environement.
2764 Seek the collating symbol entry correspondings to NAME.
2765 Return the index of the symbol in the SYMB_TABLE. */
2768 __attribute ((always_inline))
2769 seek_collating_symbol_entry (name, name_len)
2770 const unsigned char *name;
2773 int32_t hash = elem_hash ((const char *) name, name_len);
2774 int32_t elem = hash % table_size;
2775 if (symb_table[2 * elem] != 0)
2777 int32_t second = hash % (table_size - 2) + 1;
2781 /* First compare the hashing value. */
2782 if (symb_table[2 * elem] == hash
2783 /* Compare the length of the name. */
2784 && name_len == extra[symb_table[2 * elem + 1]]
2785 /* Compare the name. */
2786 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2789 /* Yep, this is the entry. */
2796 while (symb_table[2 * elem] != 0);
2801 /* Local function for parse_bracket_exp used in _LIBC environement.
2802 Look up the collation sequence value of BR_ELEM.
2803 Return the value if succeeded, UINT_MAX otherwise. */
2805 auto inline unsigned int
2806 __attribute ((always_inline))
2807 lookup_collation_sequence_value (br_elem)
2808 bracket_elem_t *br_elem;
2810 if (br_elem->type == SB_CHAR)
2813 if (MB_CUR_MAX == 1)
2816 return collseqmb[br_elem->opr.ch];
2819 wint_t wc = __btowc (br_elem->opr.ch);
2820 return __collseq_table_lookup (collseqwc, wc);
2823 else if (br_elem->type == MB_CHAR)
2825 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2827 else if (br_elem->type == COLL_SYM)
2829 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833 elem = seek_collating_symbol_entry (br_elem->opr.name,
2835 if (symb_table[2 * elem] != 0)
2837 /* We found the entry. */
2838 idx = symb_table[2 * elem + 1];
2839 /* Skip the name of collating element name. */
2840 idx += 1 + extra[idx];
2841 /* Skip the byte sequence of the collating element. */
2842 idx += 1 + extra[idx];
2843 /* Adjust for the alignment. */
2844 idx = (idx + 3) & ~3;
2845 /* Skip the multibyte collation sequence value. */
2846 idx += sizeof (unsigned int);
2847 /* Skip the wide char sequence of the collating element. */
2848 idx += sizeof (unsigned int) *
2849 (1 + *(unsigned int *) (extra + idx));
2850 /* Return the collation sequence value. */
2851 return *(unsigned int *) (extra + idx);
2853 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2855 /* No valid character. Match it as a single byte
2857 return collseqmb[br_elem->opr.name[0]];
2860 else if (sym_name_len == 1)
2861 return collseqmb[br_elem->opr.name[0]];
2866 /* Local function for parse_bracket_exp used in _LIBC environement.
2867 Build the range expression which starts from START_ELEM, and ends
2868 at END_ELEM. The result are written to MBCSET and SBCSET.
2869 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2870 mbcset->range_ends, is a pointer argument sinse we may
2873 auto inline reg_errcode_t
2874 __attribute ((always_inline))
2875 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2876 re_charset_t *mbcset;
2879 bracket_elem_t *start_elem, *end_elem;
2882 uint32_t start_collseq;
2883 uint32_t end_collseq;
2885 /* Equivalence Classes and Character Classes can't be a range
2887 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2892 start_collseq = lookup_collation_sequence_value (start_elem);
2893 end_collseq = lookup_collation_sequence_value (end_elem);
2894 /* Check start/end collation sequence values. */
2895 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896 return REG_ECOLLATE;
2897 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2900 /* Got valid collation sequence values, add them as a new entry.
2901 However, if we have no collation elements, and the character set
2902 is single byte, the single byte character set that we
2903 build below suffices. */
2904 if (nrules > 0 || dfa->mb_cur_max > 1)
2906 /* Check the space of the arrays. */
2907 if (BE (*range_alloc == mbcset->nranges, 0))
2909 /* There is not enough space, need realloc. */
2910 uint32_t *new_array_start;
2911 uint32_t *new_array_end;
2914 /* +1 in case of mbcset->nranges is 0. */
2915 new_nranges = 2 * mbcset->nranges + 1;
2916 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2918 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2921 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2924 mbcset->range_starts = new_array_start;
2925 mbcset->range_ends = new_array_end;
2926 *range_alloc = new_nranges;
2929 mbcset->range_starts[mbcset->nranges] = start_collseq;
2930 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2933 /* Build the table for single byte characters. */
2934 for (ch = 0; ch < SBC_MAX; ch++)
2936 uint32_t ch_collseq;
2938 if (MB_CUR_MAX == 1)
2941 ch_collseq = collseqmb[ch];
2943 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945 bitset_set (sbcset, ch);
2950 /* Local function for parse_bracket_exp used in _LIBC environement.
2951 Build the collating element which is represented by NAME.
2952 The result are written to MBCSET and SBCSET.
2953 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954 pointer argument sinse we may update it. */
2956 auto inline reg_errcode_t
2957 __attribute ((always_inline))
2958 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2959 re_charset_t *mbcset;
2960 Idx *coll_sym_alloc;
2962 const unsigned char *name;
2965 size_t name_len = strlen ((const char *) name);
2968 elem = seek_collating_symbol_entry (name, name_len);
2969 if (symb_table[2 * elem] != 0)
2971 /* We found the entry. */
2972 idx = symb_table[2 * elem + 1];
2973 /* Skip the name of collating element name. */
2974 idx += 1 + extra[idx];
2976 else if (symb_table[2 * elem] == 0 && name_len == 1)
2978 /* No valid character, treat it as a normal
2980 bitset_set (sbcset, name[0]);
2984 return REG_ECOLLATE;
2986 /* Got valid collation sequence, add it as a new entry. */
2987 /* Check the space of the arrays. */
2988 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2990 /* Not enough, realloc it. */
2991 /* +1 in case of mbcset->ncoll_syms is 0. */
2992 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2993 /* Use realloc since mbcset->coll_syms is NULL
2995 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2996 new_coll_sym_alloc);
2997 if (BE (new_coll_syms == NULL, 0))
2999 mbcset->coll_syms = new_coll_syms;
3000 *coll_sym_alloc = new_coll_sym_alloc;
3002 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3007 if (BE (name_len != 1, 0))
3008 return REG_ECOLLATE;
3011 bitset_set (sbcset, name[0]);
3018 re_token_t br_token;
3019 re_bitset_ptr_t sbcset;
3020 #ifdef RE_ENABLE_I18N
3021 re_charset_t *mbcset;
3022 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3023 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3024 #endif /* not RE_ENABLE_I18N */
3025 bool non_match = false;
3026 bin_tree_t *work_tree;
3028 bool first_round = true;
3030 collseqmb = (const unsigned char *)
3031 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3032 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3038 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3039 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3040 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3041 _NL_COLLATE_SYMB_TABLEMB);
3042 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3043 _NL_COLLATE_SYMB_EXTRAMB);
3046 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3047 #ifdef RE_ENABLE_I18N
3048 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3049 #endif /* RE_ENABLE_I18N */
3050 #ifdef RE_ENABLE_I18N
3051 if (BE (sbcset == NULL || mbcset == NULL, 0))
3053 if (BE (sbcset == NULL, 0))
3054 #endif /* RE_ENABLE_I18N */
3060 token_len = peek_token_bracket (token, regexp, syntax);
3061 if (BE (token->type == END_OF_RE, 0))
3064 goto parse_bracket_exp_free_return;
3066 if (token->type == OP_NON_MATCH_LIST)
3068 #ifdef RE_ENABLE_I18N
3069 mbcset->non_match = 1;
3070 #endif /* not RE_ENABLE_I18N */
3072 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3073 bitset_set (sbcset, '\0');
3074 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3075 token_len = peek_token_bracket (token, regexp, syntax);
3076 if (BE (token->type == END_OF_RE, 0))
3079 goto parse_bracket_exp_free_return;
3083 /* We treat the first ']' as a normal character. */
3084 if (token->type == OP_CLOSE_BRACKET)
3085 token->type = CHARACTER;
3089 bracket_elem_t start_elem, end_elem;
3090 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3091 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3094 bool is_range_exp = false;
3097 start_elem.opr.name = start_name_buf;
3098 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099 syntax, first_round);
3100 if (BE (ret != REG_NOERROR, 0))
3103 goto parse_bracket_exp_free_return;
3105 first_round = false;
3107 /* Get information about the next token. We need it in any case. */
3108 token_len = peek_token_bracket (token, regexp, syntax);
3110 /* Do not check for ranges if we know they are not allowed. */
3111 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3113 if (BE (token->type == END_OF_RE, 0))
3116 goto parse_bracket_exp_free_return;
3118 if (token->type == OP_CHARSET_RANGE)
3120 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3121 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122 if (BE (token2.type == END_OF_RE, 0))
3125 goto parse_bracket_exp_free_return;
3127 if (token2.type == OP_CLOSE_BRACKET)
3129 /* We treat the last '-' as a normal character. */
3130 re_string_skip_bytes (regexp, -token_len);
3131 token->type = CHARACTER;
3134 is_range_exp = true;
3138 if (is_range_exp == true)
3140 end_elem.opr.name = end_name_buf;
3141 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3143 if (BE (ret != REG_NOERROR, 0))
3146 goto parse_bracket_exp_free_return;
3149 token_len = peek_token_bracket (token, regexp, syntax);
3152 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153 &start_elem, &end_elem);
3155 # ifdef RE_ENABLE_I18N
3156 *err = build_range_exp (sbcset,
3157 dfa->mb_cur_max > 1 ? mbcset : NULL,
3158 &range_alloc, &start_elem, &end_elem);
3160 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3162 #endif /* RE_ENABLE_I18N */
3163 if (BE (*err != REG_NOERROR, 0))
3164 goto parse_bracket_exp_free_return;
3168 switch (start_elem.type)
3171 bitset_set (sbcset, start_elem.opr.ch);
3173 #ifdef RE_ENABLE_I18N
3175 /* Check whether the array has enough space. */
3176 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3178 wchar_t *new_mbchars;
3179 /* Not enough, realloc it. */
3180 /* +1 in case of mbcset->nmbchars is 0. */
3181 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182 /* Use realloc since array is NULL if *alloc == 0. */
3183 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3185 if (BE (new_mbchars == NULL, 0))
3186 goto parse_bracket_exp_espace;
3187 mbcset->mbchars = new_mbchars;
3189 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3191 #endif /* RE_ENABLE_I18N */
3193 *err = build_equiv_class (sbcset,
3194 #ifdef RE_ENABLE_I18N
3195 mbcset, &equiv_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197 start_elem.opr.name);
3198 if (BE (*err != REG_NOERROR, 0))
3199 goto parse_bracket_exp_free_return;
3202 *err = build_collating_symbol (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204 mbcset, &coll_sym_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206 start_elem.opr.name);
3207 if (BE (*err != REG_NOERROR, 0))
3208 goto parse_bracket_exp_free_return;
3211 *err = build_charclass (regexp->trans, sbcset,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset, &char_class_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem.opr.name, syntax);
3216 if (BE (*err != REG_NOERROR, 0))
3217 goto parse_bracket_exp_free_return;
3224 if (BE (token->type == END_OF_RE, 0))
3227 goto parse_bracket_exp_free_return;
3229 if (token->type == OP_CLOSE_BRACKET)
3233 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3235 /* If it is non-matching list. */
3237 bitset_not (sbcset);
3239 #ifdef RE_ENABLE_I18N
3240 /* Ensure only single byte characters are set. */
3241 if (dfa->mb_cur_max > 1)
3242 bitset_mask (sbcset, dfa->sb_char);
3244 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246 || mbcset->non_match)))
3248 bin_tree_t *mbc_tree;
3250 /* Build a tree for complex bracket. */
3251 dfa->has_mb_node = 1;
3252 br_token.type = COMPLEX_BRACKET;
3253 br_token.opr.mbcset = mbcset;
3254 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255 if (BE (mbc_tree == NULL, 0))
3256 goto parse_bracket_exp_espace;
3257 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258 if (sbcset[sbc_idx])
3260 /* If there are no bits set in sbcset, there is no point
3261 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3262 if (sbc_idx < BITSET_WORDS)
3264 /* Build a tree for simple bracket. */
3265 br_token.type = SIMPLE_BRACKET;
3266 br_token.opr.sbcset = sbcset;
3267 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268 if (BE (work_tree == NULL, 0))
3269 goto parse_bracket_exp_espace;
3271 /* Then join them by ALT node. */
3272 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273 if (BE (work_tree == NULL, 0))
3274 goto parse_bracket_exp_espace;
3279 work_tree = mbc_tree;
3283 #endif /* not RE_ENABLE_I18N */
3285 #ifdef RE_ENABLE_I18N
3286 free_charset (mbcset);
3288 /* Build a tree for simple bracket. */
3289 br_token.type = SIMPLE_BRACKET;
3290 br_token.opr.sbcset = sbcset;
3291 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292 if (BE (work_tree == NULL, 0))
3293 goto parse_bracket_exp_espace;
3297 parse_bracket_exp_espace:
3299 parse_bracket_exp_free_return:
3301 #ifdef RE_ENABLE_I18N
3302 free_charset (mbcset);
3303 #endif /* RE_ENABLE_I18N */
3307 /* Parse an element in the bracket expression. */
3309 static reg_errcode_t
3310 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311 re_token_t *token, int token_len, re_dfa_t *dfa,
3312 reg_syntax_t syntax, bool accept_hyphen)
3314 #ifdef RE_ENABLE_I18N
3316 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317 if (cur_char_size > 1)
3319 elem->type = MB_CHAR;
3320 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321 re_string_skip_bytes (regexp, cur_char_size);
3324 #endif /* RE_ENABLE_I18N */
3325 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3326 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327 || token->type == OP_OPEN_EQUIV_CLASS)
3328 return parse_bracket_symbol (elem, regexp, token);
3329 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3331 /* A '-' must only appear as anything but a range indicator before
3332 the closing bracket. Everything else is an error. */
3334 (void) peek_token_bracket (&token2, regexp, syntax);
3335 if (token2.type != OP_CLOSE_BRACKET)
3336 /* The actual error value is not standardized since this whole
3337 case is undefined. But ERANGE makes good sense. */
3340 elem->type = SB_CHAR;
3341 elem->opr.ch = token->opr.c;
3345 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3346 such as [:<character_class>:], [.<collating_element>.], and
3347 [=<equivalent_class>=]. */
3349 static reg_errcode_t
3350 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3353 unsigned char ch, delim = token->opr.c;
3355 if (re_string_eoi(regexp))
3359 if (i >= BRACKET_NAME_BUF_SIZE)
3361 if (token->type == OP_OPEN_CHAR_CLASS)
3362 ch = re_string_fetch_byte_case (regexp);
3364 ch = re_string_fetch_byte (regexp);
3365 if (re_string_eoi(regexp))
3367 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3369 elem->opr.name[i] = ch;
3371 re_string_skip_bytes (regexp, 1);
3372 elem->opr.name[i] = '\0';
3373 switch (token->type)
3375 case OP_OPEN_COLL_ELEM:
3376 elem->type = COLL_SYM;
3378 case OP_OPEN_EQUIV_CLASS:
3379 elem->type = EQUIV_CLASS;
3381 case OP_OPEN_CHAR_CLASS:
3382 elem->type = CHAR_CLASS;
3390 /* Helper function for parse_bracket_exp.
3391 Build the equivalence class which is represented by NAME.
3392 The result are written to MBCSET and SBCSET.
3393 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394 is a pointer argument sinse we may update it. */
3396 static reg_errcode_t
3397 #ifdef RE_ENABLE_I18N
3398 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399 Idx *equiv_class_alloc, const unsigned char *name)
3400 #else /* not RE_ENABLE_I18N */
3401 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3402 #endif /* not RE_ENABLE_I18N */
3405 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3408 const int32_t *table, *indirect;
3409 const unsigned char *weights, *extra, *cp;
3410 unsigned char char_buf[2];
3414 /* This #include defines a local function! */
3415 # include <locale/weight.h>
3416 /* Calculate the index for equivalence class. */
3418 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420 _NL_COLLATE_WEIGHTMB);
3421 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422 _NL_COLLATE_EXTRAMB);
3423 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424 _NL_COLLATE_INDIRECTMB);
3425 idx1 = findidx (&cp);
3426 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3427 /* This isn't a valid character. */
3428 return REG_ECOLLATE;
3430 /* Build single byte matcing table for this equivalence class. */
3431 char_buf[1] = (unsigned char) '\0';
3432 len = weights[idx1];
3433 for (ch = 0; ch < SBC_MAX; ++ch)
3437 idx2 = findidx (&cp);
3442 /* This isn't a valid character. */
3444 if (len == weights[idx2])
3447 while (cnt <= len &&
3448 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3452 bitset_set (sbcset, ch);
3455 /* Check whether the array has enough space. */
3456 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3458 /* Not enough, realloc it. */
3459 /* +1 in case of mbcset->nequiv_classes is 0. */
3460 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3461 /* Use realloc since the array is NULL if *alloc == 0. */
3462 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3464 new_equiv_class_alloc);
3465 if (BE (new_equiv_classes == NULL, 0))
3467 mbcset->equiv_classes = new_equiv_classes;
3468 *equiv_class_alloc = new_equiv_class_alloc;
3470 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3475 if (BE (strlen ((const char *) name) != 1, 0))
3476 return REG_ECOLLATE;
3477 bitset_set (sbcset, *name);
3482 /* Helper function for parse_bracket_exp.
3483 Build the character class which is represented by NAME.
3484 The result are written to MBCSET and SBCSET.
3485 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3486 is a pointer argument sinse we may update it. */
3488 static reg_errcode_t
3489 #ifdef RE_ENABLE_I18N
3490 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3491 re_charset_t *mbcset, Idx *char_class_alloc,
3492 const unsigned char *class_name, reg_syntax_t syntax)
3493 #else /* not RE_ENABLE_I18N */
3494 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3495 const unsigned char *class_name, reg_syntax_t syntax)
3496 #endif /* not RE_ENABLE_I18N */
3499 const char *name = (const char *) class_name;
3501 /* In case of REG_ICASE "upper" and "lower" match the both of
3502 upper and lower cases. */
3503 if ((syntax & RE_ICASE)
3504 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3507 #ifdef RE_ENABLE_I18N
3508 /* Check the space of the arrays. */
3509 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3511 /* Not enough, realloc it. */
3512 /* +1 in case of mbcset->nchar_classes is 0. */
3513 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3514 /* Use realloc since array is NULL if *alloc == 0. */
3515 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3516 new_char_class_alloc);
3517 if (BE (new_char_classes == NULL, 0))
3519 mbcset->char_classes = new_char_classes;
3520 *char_class_alloc = new_char_class_alloc;
3522 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3523 #endif /* RE_ENABLE_I18N */
3525 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3527 if (BE (trans != NULL, 0)) \
3529 for (i = 0; i < SBC_MAX; ++i) \
3530 if (ctype_func (i)) \
3531 bitset_set (sbcset, trans[i]); \
3535 for (i = 0; i < SBC_MAX; ++i) \
3536 if (ctype_func (i)) \
3537 bitset_set (sbcset, i); \
3541 if (strcmp (name, "alnum") == 0)
3542 BUILD_CHARCLASS_LOOP (isalnum);
3543 else if (strcmp (name, "cntrl") == 0)
3544 BUILD_CHARCLASS_LOOP (iscntrl);
3545 else if (strcmp (name, "lower") == 0)
3546 BUILD_CHARCLASS_LOOP (islower);
3547 else if (strcmp (name, "space") == 0)
3548 BUILD_CHARCLASS_LOOP (isspace);
3549 else if (strcmp (name, "alpha") == 0)
3550 BUILD_CHARCLASS_LOOP (isalpha);
3551 else if (strcmp (name, "digit") == 0)
3552 BUILD_CHARCLASS_LOOP (isdigit);
3553 else if (strcmp (name, "print") == 0)
3554 BUILD_CHARCLASS_LOOP (isprint);
3555 else if (strcmp (name, "upper") == 0)
3556 BUILD_CHARCLASS_LOOP (isupper);
3557 else if (strcmp (name, "blank") == 0)
3558 BUILD_CHARCLASS_LOOP (isblank);
3559 else if (strcmp (name, "graph") == 0)
3560 BUILD_CHARCLASS_LOOP (isgraph);
3561 else if (strcmp (name, "punct") == 0)
3562 BUILD_CHARCLASS_LOOP (ispunct);
3563 else if (strcmp (name, "xdigit") == 0)
3564 BUILD_CHARCLASS_LOOP (isxdigit);
3572 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3573 const unsigned char *class_name,
3574 const unsigned char *extra, bool non_match,
3577 re_bitset_ptr_t sbcset;
3578 #ifdef RE_ENABLE_I18N
3579 re_charset_t *mbcset;
3581 #endif /* not RE_ENABLE_I18N */
3583 re_token_t br_token;
3586 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3587 #ifdef RE_ENABLE_I18N
3588 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3589 #endif /* RE_ENABLE_I18N */
3591 #ifdef RE_ENABLE_I18N
3592 if (BE (sbcset == NULL || mbcset == NULL, 0))
3593 #else /* not RE_ENABLE_I18N */
3594 if (BE (sbcset == NULL, 0))
3595 #endif /* not RE_ENABLE_I18N */
3603 #ifdef RE_ENABLE_I18N
3605 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3606 bitset_set(cset->sbcset, '\0');
3608 mbcset->non_match = 1;
3609 #endif /* not RE_ENABLE_I18N */
3612 /* We don't care the syntax in this case. */
3613 ret = build_charclass (trans, sbcset,
3614 #ifdef RE_ENABLE_I18N
3616 #endif /* RE_ENABLE_I18N */
3619 if (BE (ret != REG_NOERROR, 0))
3622 #ifdef RE_ENABLE_I18N
3623 free_charset (mbcset);
3624 #endif /* RE_ENABLE_I18N */
3628 /* \w match '_' also. */
3629 for (; *extra; extra++)
3630 bitset_set (sbcset, *extra);
3632 /* If it is non-matching list. */
3634 bitset_not (sbcset);
3636 #ifdef RE_ENABLE_I18N
3637 /* Ensure only single byte characters are set. */
3638 if (dfa->mb_cur_max > 1)
3639 bitset_mask (sbcset, dfa->sb_char);
3642 /* Build a tree for simple bracket. */
3643 br_token.type = SIMPLE_BRACKET;
3644 br_token.opr.sbcset = sbcset;
3645 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3646 if (BE (tree == NULL, 0))
3647 goto build_word_op_espace;
3649 #ifdef RE_ENABLE_I18N
3650 if (dfa->mb_cur_max > 1)
3652 bin_tree_t *mbc_tree;
3653 /* Build a tree for complex bracket. */
3654 br_token.type = COMPLEX_BRACKET;
3655 br_token.opr.mbcset = mbcset;
3656 dfa->has_mb_node = 1;
3657 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3658 if (BE (mbc_tree == NULL, 0))
3659 goto build_word_op_espace;
3660 /* Then join them by ALT node. */
3661 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3662 if (BE (mbc_tree != NULL, 1))
3667 free_charset (mbcset);
3670 #else /* not RE_ENABLE_I18N */
3672 #endif /* not RE_ENABLE_I18N */
3674 build_word_op_espace:
3676 #ifdef RE_ENABLE_I18N
3677 free_charset (mbcset);
3678 #endif /* RE_ENABLE_I18N */
3683 /* This is intended for the expressions like "a{1,3}".
3684 Fetch a number from `input', and return the number.
3685 Return REG_MISSING if the number field is empty like "{,1}".
3686 Return REG_ERROR if an error occurred. */
3689 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3691 Idx num = REG_MISSING;
3695 fetch_token (token, input, syntax);
3697 if (BE (token->type == END_OF_RE, 0))
3699 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3701 num = ((token->type != CHARACTER || c < '0' || '9' < c
3702 || num == REG_ERROR)
3704 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3705 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3710 #ifdef RE_ENABLE_I18N
3712 free_charset (re_charset_t *cset)
3714 re_free (cset->mbchars);
3716 re_free (cset->coll_syms);
3717 re_free (cset->equiv_classes);
3718 re_free (cset->range_starts);
3719 re_free (cset->range_ends);
3721 re_free (cset->char_classes);
3724 #endif /* RE_ENABLE_I18N */
3726 /* Functions for binary tree operation. */
3728 /* Create a tree node. */
3731 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3732 re_token_type_t type)
3736 return create_token_tree (dfa, left, right, &t);
3740 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3741 const re_token_t *token)
3744 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3746 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3748 if (storage == NULL)
3750 storage->next = dfa->str_tree_storage;
3751 dfa->str_tree_storage = storage;
3752 dfa->str_tree_storage_idx = 0;
3754 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3756 tree->parent = NULL;
3758 tree->right = right;
3759 tree->token = *token;
3760 tree->token.duplicated = 0;
3761 tree->token.opt_subexp = 0;
3764 tree->node_idx = REG_MISSING;
3767 left->parent = tree;
3769 right->parent = tree;
3773 /* Mark the tree SRC as an optional subexpression.
3774 To be called from preorder or postorder. */
3776 static reg_errcode_t
3777 mark_opt_subexp (void *extra, bin_tree_t *node)
3779 Idx idx = (Idx) (long) extra;
3780 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3781 node->token.opt_subexp = 1;
3786 /* Free the allocated memory inside NODE. */
3789 free_token (re_token_t *node)
3791 #ifdef RE_ENABLE_I18N
3792 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3793 free_charset (node->opr.mbcset);
3795 #endif /* RE_ENABLE_I18N */
3796 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3797 re_free (node->opr.sbcset);
3800 /* Worker function for tree walking. Free the allocated memory inside NODE
3801 and its children. */
3803 static reg_errcode_t
3804 free_tree (void *extra, bin_tree_t *node)
3806 free_token (&node->token);
3811 /* Duplicate the node SRC, and return new node. This is a preorder
3812 visit similar to the one implemented by the generic visitor, but
3813 we need more infrastructure to maintain two parallel trees --- so,
3814 it's easier to duplicate. */
3817 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3819 const bin_tree_t *node;
3820 bin_tree_t *dup_root;
3821 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3823 for (node = root; ; )
3825 /* Create a new tree and link it back to the current parent. */
3826 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3829 (*p_new)->parent = dup_node;
3830 (*p_new)->token.duplicated = 1;
3833 /* Go to the left node, or up and to the right. */
3837 p_new = &dup_node->left;
3841 const bin_tree_t *prev = NULL;
3842 while (node->right == prev || node->right == NULL)
3845 node = node->parent;
3846 dup_node = dup_node->parent;
3851 p_new = &dup_node->right;