1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
380 if (dfa->mb_cur_max > 1)
381 for (i = 0; i < SBC_MAX; ++i)
382 if (__btowc (i) == WEOF)
383 re_set_fastmap (fastmap, icase, i);
384 # endif /* not _LIBC */
386 for (i = 0; i < cset->nmbchars; ++i)
390 memset (&state, '\0', sizeof (state));
391 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
395 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
397 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
401 #endif /* RE_ENABLE_I18N */
402 else if (type == OP_PERIOD
403 #ifdef RE_ENABLE_I18N
404 || type == OP_UTF8_PERIOD
405 #endif /* RE_ENABLE_I18N */
406 || type == END_OF_RE)
408 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 if (type == END_OF_RE)
410 bufp->can_be_null = 1;
416 /* Entry point for POSIX code. */
417 /* regcomp takes a regular expression as a string and compiles it.
419 PREG is a regex_t *. We do not expect any fields to be initialized,
420 since POSIX says we shouldn't. Thus, we set
422 `buffer' to the compiled pattern;
423 `used' to the length of the compiled pattern;
424 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425 REG_EXTENDED bit in CFLAGS is set; otherwise, to
426 RE_SYNTAX_POSIX_BASIC;
427 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428 `fastmap' to an allocated space for the fastmap;
429 `fastmap_accurate' to zero;
430 `re_nsub' to the number of subexpressions in PATTERN.
432 PATTERN is the address of the pattern string.
434 CFLAGS is a series of bits which affect compilation.
436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437 use POSIX basic syntax.
439 If REG_NEWLINE is set, then . and [^...] don't match newline.
440 Also, regexec will try a match beginning after every newline.
442 If REG_ICASE is set, then we considers upper- and lowercase
443 versions of letters to be equivalent when matching.
445 If REG_NOSUB is set, then when PREG is passed to regexec, that
446 routine will report only success or failure, and nothing about the
449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
450 the return codes and their meanings.) */
453 regcomp (preg, pattern, cflags)
454 regex_t *__restrict preg;
455 const char *__restrict pattern;
459 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 : RE_SYNTAX_POSIX_BASIC);
466 /* Try to allocate space for the fastmap. */
467 preg->fastmap = re_malloc (char, SBC_MAX);
468 if (BE (preg->fastmap == NULL, 0))
471 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
473 /* If REG_NEWLINE is set, newlines are treated differently. */
474 if (cflags & REG_NEWLINE)
475 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
476 syntax &= ~RE_DOT_NEWLINE;
477 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478 /* It also changes the matching behavior. */
479 preg->newline_anchor = 1;
482 preg->newline_anchor = 0;
483 preg->no_sub = !!(cflags & REG_NOSUB);
484 preg->translate = NULL;
486 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
488 /* POSIX doesn't distinguish between an unmatched open-group and an
489 unmatched close-group: both are REG_EPAREN. */
490 if (ret == REG_ERPAREN)
493 /* We have already checked preg->fastmap != NULL. */
494 if (BE (ret == REG_NOERROR, 1))
495 /* Compute the fastmap now, since regexec cannot modify the pattern
496 buffer. This function never fails in this implementation. */
497 (void) re_compile_fastmap (preg);
500 /* Some error occurred while compiling the expression. */
501 re_free (preg->fastmap);
502 preg->fastmap = NULL;
508 weak_alias (__regcomp, regcomp)
511 /* Returns a message corresponding to an error code, ERRCODE, returned
512 from either regcomp or regexec. We don't use PREG here. */
516 regerror (errcode, preg, errbuf, errbuf_size)
518 const regex_t *__restrict preg;
519 char *__restrict errbuf;
521 #else /* size_t might promote */
523 regerror (int errcode, const regex_t *__restrict preg,
524 char *__restrict errbuf, size_t errbuf_size)
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
541 msg_size = strlen (msg) + 1; /* Includes the null. */
543 if (BE (errbuf_size != 0, 1))
545 if (BE (msg_size > errbuf_size, 0))
547 #if defined HAVE_MEMPCPY || defined _LIBC
548 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
550 memcpy (errbuf, msg, errbuf_size - 1);
551 errbuf[errbuf_size - 1] = 0;
555 memcpy (errbuf, msg, msg_size);
561 weak_alias (__regerror, regerror)
565 #ifdef RE_ENABLE_I18N
566 /* This static array is used for the map to single-byte characters when
567 UTF-8 is used. Otherwise we would allocate memory just to initialize
568 it the same all the time. UTF-8 is the preferred encoding so this is
569 a worthwhile optimization. */
570 static const bitset_t utf8_sb_map =
572 /* Set the first 128 bits. */
573 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
574 # error "bitset_word_t is narrower than 32 bits"
575 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
576 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
577 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX,
579 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
583 >> (SBC_MAX % BITSET_WORD_BITS == 0
585 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
591 free_dfa_content (re_dfa_t *dfa)
596 for (i = 0; i < dfa->nodes_len; ++i)
597 free_token (dfa->nodes + i);
598 re_free (dfa->nexts);
599 for (i = 0; i < dfa->nodes_len; ++i)
601 if (dfa->eclosures != NULL)
602 re_node_set_free (dfa->eclosures + i);
603 if (dfa->inveclosures != NULL)
604 re_node_set_free (dfa->inveclosures + i);
605 if (dfa->edests != NULL)
606 re_node_set_free (dfa->edests + i);
608 re_free (dfa->edests);
609 re_free (dfa->eclosures);
610 re_free (dfa->inveclosures);
611 re_free (dfa->nodes);
613 if (dfa->state_table)
614 for (i = 0; i <= dfa->state_hash_mask; ++i)
616 struct re_state_table_entry *entry = dfa->state_table + i;
617 for (j = 0; j < entry->num; ++j)
619 re_dfastate_t *state = entry->array[j];
622 re_free (entry->array);
624 re_free (dfa->state_table);
625 #ifdef RE_ENABLE_I18N
626 if (dfa->sb_char != utf8_sb_map)
627 re_free (dfa->sb_char);
629 re_free (dfa->subexp_map);
631 re_free (dfa->re_str);
638 /* Free dynamically allocated space used by PREG. */
644 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
645 if (BE (dfa != NULL, 1))
646 free_dfa_content (dfa);
650 re_free (preg->fastmap);
651 preg->fastmap = NULL;
653 re_free (preg->translate);
654 preg->translate = NULL;
657 weak_alias (__regfree, regfree)
660 /* Entry points compatible with 4.2 BSD regex library. We don't define
661 them unless specifically requested. */
663 #if defined _REGEX_RE_COMP || defined _LIBC
665 /* BSD has one and only one pattern buffer. */
666 static struct re_pattern_buffer re_comp_buf;
670 /* Make these definitions weak in libc, so POSIX programs can redefine
671 these names if they don't use our functions, and still use
672 regcomp/regexec above without link errors. */
683 if (!re_comp_buf.buffer)
684 return gettext ("No previous regular expression");
688 if (re_comp_buf.buffer)
690 fastmap = re_comp_buf.fastmap;
691 re_comp_buf.fastmap = NULL;
692 __regfree (&re_comp_buf);
693 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
694 re_comp_buf.fastmap = fastmap;
697 if (re_comp_buf.fastmap == NULL)
699 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
700 if (re_comp_buf.fastmap == NULL)
701 return (char *) gettext (__re_error_msgid
702 + __re_error_msgid_idx[(int) REG_ESPACE]);
705 /* Since `re_exec' always passes NULL for the `regs' argument, we
706 don't need to initialize the pattern buffer fields which affect it. */
708 /* Match anchors at newlines. */
709 re_comp_buf.newline_anchor = 1;
711 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
716 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
717 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
721 libc_freeres_fn (free_mem)
723 __regfree (&re_comp_buf);
727 #endif /* _REGEX_RE_COMP */
729 /* Internal entry point.
730 Compile the regular expression PATTERN, whose length is LENGTH.
731 SYNTAX indicate regular expression's syntax. */
734 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
737 reg_errcode_t err = REG_NOERROR;
741 /* Initialize the pattern buffer. */
742 preg->fastmap_accurate = 0;
743 preg->syntax = syntax;
744 preg->not_bol = preg->not_eol = 0;
747 preg->can_be_null = 0;
748 preg->regs_allocated = REGS_UNALLOCATED;
750 /* Initialize the dfa. */
751 dfa = (re_dfa_t *) preg->buffer;
752 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
754 /* If zero allocated, but buffer is non-null, try to realloc
755 enough space. This loses if buffer's address is bogus, but
756 that is the user's responsibility. If ->buffer is NULL this
757 is a simple allocation. */
758 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
761 preg->allocated = sizeof (re_dfa_t);
762 preg->buffer = (unsigned char *) dfa;
764 preg->used = sizeof (re_dfa_t);
766 err = init_dfa (dfa, length);
767 if (BE (err != REG_NOERROR, 0))
769 free_dfa_content (dfa);
775 /* Note: length+1 will not overflow since it is checked in init_dfa. */
776 dfa->re_str = re_malloc (char, length + 1);
777 strncpy (dfa->re_str, pattern, length + 1);
780 __libc_lock_init (dfa->lock);
782 err = re_string_construct (®exp, pattern, length, preg->translate,
783 syntax & RE_ICASE, dfa);
784 if (BE (err != REG_NOERROR, 0))
786 re_compile_internal_free_return:
787 free_workarea_compile (preg);
788 re_string_destruct (®exp);
789 free_dfa_content (dfa);
795 /* Parse the regular expression, and build a structure tree. */
797 dfa->str_tree = parse (®exp, preg, syntax, &err);
798 if (BE (dfa->str_tree == NULL, 0))
799 goto re_compile_internal_free_return;
801 /* Analyze the tree and create the nfa. */
802 err = analyze (preg);
803 if (BE (err != REG_NOERROR, 0))
804 goto re_compile_internal_free_return;
806 #ifdef RE_ENABLE_I18N
807 /* If possible, do searching in single byte encoding to speed things up. */
808 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
812 /* Then create the initial state of the dfa. */
813 err = create_initial_state (dfa);
815 /* Release work areas. */
816 free_workarea_compile (preg);
817 re_string_destruct (®exp);
819 if (BE (err != REG_NOERROR, 0))
821 free_dfa_content (dfa);
829 /* Initialize DFA. We use the length of the regular expression PAT_LEN
830 as the initial length of some arrays. */
833 init_dfa (re_dfa_t *dfa, size_t pat_len)
835 __re_size_t table_size;
839 size_t max_object_size =
840 MAX (sizeof (struct re_state_table_entry),
841 MAX (sizeof (re_token_t),
842 MAX (sizeof (re_node_set),
843 MAX (sizeof (regmatch_t),
844 MAX (sizeof (regoff_t),
845 MAX (sizeof (wchar_t),
846 MAX (sizeof (wctype_t),
849 memset (dfa, '\0', sizeof (re_dfa_t));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
855 calculation below, and for similar doubling calculations
856 elsewhere. And it's <= rather than <, because some of the
857 doubling calculations add 1 afterwards. */
858 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
861 dfa->nodes_alloc = pat_len + 1;
862 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
864 /* table_size = 2 ^ ceil(log pat_len) */
865 for (table_size = 1; ; table_size <<= 1)
866 if (table_size > pat_len)
869 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
870 dfa->state_hash_mask = table_size - 1;
872 dfa->mb_cur_max = MB_CUR_MAX;
874 if (dfa->mb_cur_max == 6
875 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
877 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 # ifdef HAVE_LANGINFO_CODESET
881 codeset_name = nl_langinfo (CODESET);
883 codeset_name = getenv ("LC_ALL");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LC_CTYPE");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LANG");
888 if (codeset_name == NULL)
890 else if (strchr (codeset_name, '.') != NULL)
891 codeset_name = strchr (codeset_name, '.') + 1;
894 if (strcasecmp (codeset_name, "UTF-8") == 0
895 || strcasecmp (codeset_name, "UTF8") == 0)
898 /* We check exhaustively in the loop below if this charset is a
899 superset of ASCII. */
900 dfa->map_notascii = 0;
903 #ifdef RE_ENABLE_I18N
904 if (dfa->mb_cur_max > 1)
907 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
912 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
913 if (BE (dfa->sb_char == NULL, 0))
916 /* Set the bits corresponding to single byte chars. */
917 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
918 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
920 wint_t wch = __btowc (ch);
922 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
924 if (isascii (ch) && wch != ch)
925 dfa->map_notascii = 1;
932 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
937 /* Initialize WORD_CHAR table, which indicate which character is
938 "word". In this case "word" means that it is the word construction
939 character used by some operators like "\<", "\>", etc. */
943 init_word_char (re_dfa_t *dfa)
946 dfa->word_ops_used = 1;
947 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
948 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
949 if (isalnum (ch) || ch == '_')
950 dfa->word_char[i] |= (bitset_word_t) 1 << j;
953 /* Free the work area which are only used while compiling. */
956 free_workarea_compile (regex_t *preg)
958 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
959 bin_tree_storage_t *storage, *next;
960 for (storage = dfa->str_tree_storage; storage; storage = next)
962 next = storage->next;
965 dfa->str_tree_storage = NULL;
966 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
967 dfa->str_tree = NULL;
968 re_free (dfa->org_indices);
969 dfa->org_indices = NULL;
972 /* Create initial states for all contexts. */
975 create_initial_state (re_dfa_t *dfa)
979 re_node_set init_nodes;
981 /* Initial states have the epsilon closure of the node which is
982 the first node of the regular expression. */
983 first = dfa->str_tree->first->node_idx;
984 dfa->init_node = first;
985 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
986 if (BE (err != REG_NOERROR, 0))
989 /* The back-references which are in initial states can epsilon transit,
990 since in this case all of the subexpressions can be null.
991 Then we add epsilon closures of the nodes which are the next nodes of
992 the back-references. */
993 if (dfa->nbackref > 0)
994 for (i = 0; i < init_nodes.nelem; ++i)
996 Idx node_idx = init_nodes.elems[i];
997 re_token_type_t type = dfa->nodes[node_idx].type;
1000 if (type != OP_BACK_REF)
1002 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1004 re_token_t *clexp_node;
1005 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1006 if (clexp_node->type == OP_CLOSE_SUBEXP
1007 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1010 if (clexp_idx == init_nodes.nelem)
1013 if (type == OP_BACK_REF)
1015 Idx dest_idx = dfa->edests[node_idx].elems[0];
1016 if (!re_node_set_contains (&init_nodes, dest_idx))
1018 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1024 /* It must be the first time to invoke acquire_state. */
1025 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1026 /* We don't check ERR here, since the initial state must not be NULL. */
1027 if (BE (dfa->init_state == NULL, 0))
1029 if (dfa->init_state->has_constraint)
1031 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1039 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1040 || dfa->init_state_begbuf == NULL, 0))
1044 dfa->init_state_word = dfa->init_state_nl
1045 = dfa->init_state_begbuf = dfa->init_state;
1047 re_node_set_free (&init_nodes);
1051 #ifdef RE_ENABLE_I18N
1052 /* If it is possible to do searching in single byte encoding instead of UTF-8
1053 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1054 DFA nodes where needed. */
1057 optimize_utf8 (re_dfa_t *dfa)
1061 bool mb_chars = false;
1062 bool has_period = false;
1064 for (node = 0; node < dfa->nodes_len; ++node)
1065 switch (dfa->nodes[node].type)
1068 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1072 switch (dfa->nodes[node].opr.idx)
1080 /* Word anchors etc. cannot be handled. */
1090 case OP_DUP_ASTERISK:
1091 case OP_OPEN_SUBEXP:
1092 case OP_CLOSE_SUBEXP:
1094 case COMPLEX_BRACKET:
1096 case SIMPLE_BRACKET:
1097 /* Just double check. */
1099 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1101 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1102 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1104 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1114 if (mb_chars || has_period)
1115 for (node = 0; node < dfa->nodes_len; ++node)
1117 if (dfa->nodes[node].type == CHARACTER
1118 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1119 dfa->nodes[node].mb_partial = 0;
1120 else if (dfa->nodes[node].type == OP_PERIOD)
1121 dfa->nodes[node].type = OP_UTF8_PERIOD;
1124 /* The search can be in single byte locale. */
1125 dfa->mb_cur_max = 1;
1127 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1131 /* Analyze the structure tree, and calculate "first", "next", "edest",
1132 "eclosure", and "inveclosure". */
1134 static reg_errcode_t
1135 analyze (regex_t *preg)
1137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1140 /* Allocate arrays. */
1141 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1142 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1143 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1144 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1145 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1146 || dfa->eclosures == NULL, 0))
1149 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1150 if (dfa->subexp_map != NULL)
1153 for (i = 0; i < preg->re_nsub; i++)
1154 dfa->subexp_map[i] = i;
1155 preorder (dfa->str_tree, optimize_subexps, dfa);
1156 for (i = 0; i < preg->re_nsub; i++)
1157 if (dfa->subexp_map[i] != i)
1159 if (i == preg->re_nsub)
1161 free (dfa->subexp_map);
1162 dfa->subexp_map = NULL;
1166 ret = postorder (dfa->str_tree, lower_subexps, preg);
1167 if (BE (ret != REG_NOERROR, 0))
1169 ret = postorder (dfa->str_tree, calc_first, dfa);
1170 if (BE (ret != REG_NOERROR, 0))
1172 preorder (dfa->str_tree, calc_next, dfa);
1173 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1174 if (BE (ret != REG_NOERROR, 0))
1176 ret = calc_eclosure (dfa);
1177 if (BE (ret != REG_NOERROR, 0))
1180 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1181 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1182 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1185 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1186 if (BE (dfa->inveclosures == NULL, 0))
1188 ret = calc_inveclosure (dfa);
1194 /* Our parse trees are very unbalanced, so we cannot use a stack to
1195 implement parse tree visits. Instead, we use parent pointers and
1196 some hairy code in these two functions. */
1197 static reg_errcode_t
1198 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1201 bin_tree_t *node, *prev;
1203 for (node = root; ; )
1205 /* Descend down the tree, preferably to the left (or to the right
1206 if that's the only child). */
1207 while (node->left || node->right)
1215 reg_errcode_t err = fn (extra, node);
1216 if (BE (err != REG_NOERROR, 0))
1218 if (node->parent == NULL)
1221 node = node->parent;
1223 /* Go up while we have a node that is reached from the right. */
1224 while (node->right == prev || node->right == NULL);
1229 static reg_errcode_t
1230 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1235 for (node = root; ; )
1237 reg_errcode_t err = fn (extra, node);
1238 if (BE (err != REG_NOERROR, 0))
1241 /* Go to the left node, or up and to the right. */
1246 bin_tree_t *prev = NULL;
1247 while (node->right == prev || node->right == NULL)
1250 node = node->parent;
1259 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1260 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1261 backreferences as well. Requires a preorder visit. */
1262 static reg_errcode_t
1263 optimize_subexps (void *extra, bin_tree_t *node)
1265 re_dfa_t *dfa = (re_dfa_t *) extra;
1267 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1269 int idx = node->token.opr.idx;
1270 node->token.opr.idx = dfa->subexp_map[idx];
1271 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1274 else if (node->token.type == SUBEXP
1275 && node->left && node->left->token.type == SUBEXP)
1277 Idx other_idx = node->left->token.opr.idx;
1279 node->left = node->left->left;
1281 node->left->parent = node;
1283 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1284 if (other_idx < BITSET_WORD_BITS)
1285 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1291 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1292 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1293 static reg_errcode_t
1294 lower_subexps (void *extra, bin_tree_t *node)
1296 regex_t *preg = (regex_t *) extra;
1297 reg_errcode_t err = REG_NOERROR;
1299 if (node->left && node->left->token.type == SUBEXP)
1301 node->left = lower_subexp (&err, preg, node->left);
1303 node->left->parent = node;
1305 if (node->right && node->right->token.type == SUBEXP)
1307 node->right = lower_subexp (&err, preg, node->right);
1309 node->right->parent = node;
1316 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1318 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1319 bin_tree_t *body = node->left;
1320 bin_tree_t *op, *cls, *tree1, *tree;
1323 /* We do not optimize empty subexpressions, because otherwise we may
1324 have bad CONCAT nodes with NULL children. This is obviously not
1325 very common, so we do not lose much. An example that triggers
1326 this case is the sed "script" /\(\)/x. */
1327 && node->left != NULL
1328 && (node->token.opr.idx >= BITSET_WORD_BITS
1329 || !(dfa->used_bkref_map
1330 & ((bitset_word_t) 1 << node->token.opr.idx))))
1333 /* Convert the SUBEXP node to the concatenation of an
1334 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1335 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1336 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1337 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1338 tree = create_tree (dfa, op, tree1, CONCAT);
1339 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1346 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1351 nodes. Requires a postorder visit. */
1352 static reg_errcode_t
1353 calc_first (void *extra, bin_tree_t *node)
1355 re_dfa_t *dfa = (re_dfa_t *) extra;
1356 if (node->token.type == CONCAT)
1358 node->first = node->left->first;
1359 node->node_idx = node->left->node_idx;
1364 node->node_idx = re_dfa_add_node (dfa, node->token);
1365 if (BE (node->node_idx == REG_MISSING, 0))
1371 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1372 static reg_errcode_t
1373 calc_next (void *extra, bin_tree_t *node)
1375 switch (node->token.type)
1377 case OP_DUP_ASTERISK:
1378 node->left->next = node;
1381 node->left->next = node->right->first;
1382 node->right->next = node->next;
1386 node->left->next = node->next;
1388 node->right->next = node->next;
1394 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1395 static reg_errcode_t
1396 link_nfa_nodes (void *extra, bin_tree_t *node)
1398 re_dfa_t *dfa = (re_dfa_t *) extra;
1399 Idx idx = node->node_idx;
1400 reg_errcode_t err = REG_NOERROR;
1402 switch (node->token.type)
1408 assert (node->next == NULL);
1411 case OP_DUP_ASTERISK:
1415 dfa->has_plural_match = 1;
1416 if (node->left != NULL)
1417 left = node->left->first->node_idx;
1419 left = node->next->node_idx;
1420 if (node->right != NULL)
1421 right = node->right->first->node_idx;
1423 right = node->next->node_idx;
1424 assert (REG_VALID_INDEX (left));
1425 assert (REG_VALID_INDEX (right));
1426 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1431 case OP_OPEN_SUBEXP:
1432 case OP_CLOSE_SUBEXP:
1433 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1437 dfa->nexts[idx] = node->next->node_idx;
1438 if (node->token.type == OP_BACK_REF)
1439 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1443 assert (!IS_EPSILON_NODE (node->token.type));
1444 dfa->nexts[idx] = node->next->node_idx;
1451 /* Duplicate the epsilon closure of the node ROOT_NODE.
1452 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1453 to their own constraint. */
1455 static reg_errcode_t
1457 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1458 Idx root_node, unsigned int init_constraint)
1460 Idx org_node, clone_node;
1462 unsigned int constraint = init_constraint;
1463 for (org_node = top_org_node, clone_node = top_clone_node;;)
1465 Idx org_dest, clone_dest;
1466 if (dfa->nodes[org_node].type == OP_BACK_REF)
1468 /* If the back reference epsilon-transit, its destination must
1469 also have the constraint. Then duplicate the epsilon closure
1470 of the destination of the back reference, and store it in
1471 edests of the back reference. */
1472 org_dest = dfa->nexts[org_node];
1473 re_node_set_empty (dfa->edests + clone_node);
1474 clone_dest = duplicate_node (dfa, org_dest, constraint);
1475 if (BE (clone_dest == REG_MISSING, 0))
1477 dfa->nexts[clone_node] = dfa->nexts[org_node];
1478 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1482 else if (dfa->edests[org_node].nelem == 0)
1484 /* In case of the node can't epsilon-transit, don't duplicate the
1485 destination and store the original destination as the
1486 destination of the node. */
1487 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 else if (dfa->edests[org_node].nelem == 1)
1492 /* In case of the node can epsilon-transit, and it has only one
1494 org_dest = dfa->edests[org_node].elems[0];
1495 re_node_set_empty (dfa->edests + clone_node);
1496 if (dfa->nodes[org_node].type == ANCHOR)
1498 /* In case of the node has another constraint, append it. */
1499 if (org_node == root_node && clone_node != org_node)
1501 /* ...but if the node is root_node itself, it means the
1502 epsilon closure have a loop, then tie it to the
1503 destination of the root_node. */
1504 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509 constraint |= dfa->nodes[org_node].opr.ctx_type;
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (BE (clone_dest == REG_MISSING, 0))
1514 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1518 else /* dfa->edests[org_node].nelem == 2 */
1520 /* In case of the node can epsilon-transit, and it has two
1521 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1522 org_dest = dfa->edests[org_node].elems[0];
1523 re_node_set_empty (dfa->edests + clone_node);
1524 /* Search for a duplicated node which satisfies the constraint. */
1525 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1526 if (clone_dest == REG_MISSING)
1528 /* There are no such a duplicated node, create a new one. */
1530 clone_dest = duplicate_node (dfa, org_dest, constraint);
1531 if (BE (clone_dest == REG_MISSING, 0))
1533 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1537 root_node, constraint);
1538 if (BE (err != REG_NOERROR, 0))
1543 /* There are a duplicated node which satisfy the constraint,
1544 use it to avoid infinite loop. */
1545 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 org_dest = dfa->edests[org_node].elems[1];
1551 clone_dest = duplicate_node (dfa, org_dest, constraint);
1552 if (BE (clone_dest == REG_MISSING, 0))
1554 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 org_node = org_dest;
1559 clone_node = clone_dest;
1564 /* Search for a node which is duplicated from the node ORG_NODE, and
1565 satisfies the constraint CONSTRAINT. */
1568 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1569 unsigned int constraint)
1572 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1574 if (org_node == dfa->org_indices[idx]
1575 && constraint == dfa->nodes[idx].constraint)
1576 return idx; /* Found. */
1578 return REG_MISSING; /* Not found. */
1581 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1582 Return the index of the new node, or REG_MISSING if insufficient storage is
1586 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1588 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1589 if (BE (dup_idx != REG_MISSING, 1))
1591 dfa->nodes[dup_idx].constraint = constraint;
1592 if (dfa->nodes[org_idx].type == ANCHOR)
1593 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1594 dfa->nodes[dup_idx].duplicated = 1;
1596 /* Store the index of the original node. */
1597 dfa->org_indices[dup_idx] = org_idx;
1602 static reg_errcode_t
1603 calc_inveclosure (re_dfa_t *dfa)
1607 for (idx = 0; idx < dfa->nodes_len; ++idx)
1608 re_node_set_init_empty (dfa->inveclosures + idx);
1610 for (src = 0; src < dfa->nodes_len; ++src)
1612 Idx *elems = dfa->eclosures[src].elems;
1613 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1615 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624 /* Calculate "eclosure" for all the node in DFA. */
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1632 assert (dfa->nodes_len > 0);
1635 /* For each nodes, calculate epsilon closure. */
1636 for (node_idx = 0; ; ++node_idx)
1639 re_node_set eclosure_elem;
1640 if (node_idx == dfa->nodes_len)
1649 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1652 /* If we have already calculated, skip it. */
1653 if (dfa->eclosures[node_idx].nelem != 0)
1655 /* Calculate epsilon closure of `node_idx'. */
1656 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1657 if (BE (err != REG_NOERROR, 0))
1660 if (dfa->eclosures[node_idx].nelem == 0)
1663 re_node_set_free (&eclosure_elem);
1669 /* Calculate epsilon closure of NODE. */
1671 static reg_errcode_t
1672 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1675 unsigned int constraint;
1679 re_node_set eclosure;
1681 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682 if (BE (err != REG_NOERROR, 0))
1685 /* This indicates that we are calculating this node now.
1686 We reference this value to avoid infinite loop. */
1687 dfa->eclosures[node].nelem = REG_MISSING;
1689 constraint = ((dfa->nodes[node].type == ANCHOR)
1690 ? dfa->nodes[node].opr.ctx_type : 0);
1691 /* If the current node has constraints, duplicate all nodes.
1692 Since they must inherit the constraints. */
1694 && dfa->edests[node].nelem
1695 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697 err = duplicate_node_closure (dfa, node, node, node, constraint);
1698 if (BE (err != REG_NOERROR, 0))
1702 /* Expand each epsilon destination nodes. */
1703 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1704 for (i = 0; i < dfa->edests[node].nelem; ++i)
1706 re_node_set eclosure_elem;
1707 Idx edest = dfa->edests[node].elems[i];
1708 /* If calculating the epsilon closure of `edest' is in progress,
1709 return intermediate result. */
1710 if (dfa->eclosures[edest].nelem == REG_MISSING)
1715 /* If we haven't calculated the epsilon closure of `edest' yet,
1716 calculate now. Otherwise use calculated epsilon closure. */
1717 if (dfa->eclosures[edest].nelem == 0)
1719 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1720 if (BE (err != REG_NOERROR, 0))
1724 eclosure_elem = dfa->eclosures[edest];
1725 /* Merge the epsilon closure of `edest'. */
1726 re_node_set_merge (&eclosure, &eclosure_elem);
1727 /* If the epsilon closure of `edest' is incomplete,
1728 the epsilon closure of this node is also incomplete. */
1729 if (dfa->eclosures[edest].nelem == 0)
1732 re_node_set_free (&eclosure_elem);
1736 /* Epsilon closures include itself. */
1737 ok = re_node_set_insert (&eclosure, node);
1740 if (incomplete && !root)
1741 dfa->eclosures[node].nelem = 0;
1743 dfa->eclosures[node] = eclosure;
1744 *new_set = eclosure;
1748 /* Functions for token which are used in the parser. */
1750 /* Fetch a token from INPUT.
1751 We must not use this function inside bracket expressions. */
1755 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 re_string_skip_bytes (input, peek_token (result, input, syntax));
1760 /* Peek a token from INPUT, and return the length of the token.
1761 We must not use this function inside bracket expressions. */
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1769 if (re_string_eoi (input))
1771 token->type = END_OF_RE;
1775 c = re_string_peek_byte (input, 0);
1778 token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780 token->mb_partial = 0;
1781 if (input->mb_cur_max > 1 &&
1782 !re_string_first_byte (input, re_string_cur_idx (input)))
1784 token->type = CHARACTER;
1785 token->mb_partial = 1;
1792 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 token->type = BACK_SLASH;
1798 c2 = re_string_peek_byte_case (input, 1);
1800 token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802 if (input->mb_cur_max > 1)
1804 wint_t wc = re_string_wchar_at (input,
1805 re_string_cur_idx (input) + 1);
1806 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1810 token->word_char = IS_WORD_CHAR (c2) != 0;
1815 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816 token->type = OP_ALT;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax & RE_NO_BK_REFS))
1822 token->type = OP_BACK_REF;
1823 token->opr.idx = c2 - '1';
1827 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = ANCHOR;
1830 token->opr.ctx_type = WORD_FIRST;
1834 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = WORD_LAST;
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_DELIM;
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = NOT_WORD_DELIM;
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_WORD;
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_NOTWORD;
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_SPACE;
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTSPACE;
1871 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = ANCHOR;
1874 token->opr.ctx_type = BUF_FIRST;
1878 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = BUF_LAST;
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_OPEN_SUBEXP;
1889 if (!(syntax & RE_NO_BK_PARENS))
1890 token->type = OP_CLOSE_SUBEXP;
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_PLUS;
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_QUESTION;
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_OPEN_DUP_NUM;
1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 token->type = OP_CLOSE_DUP_NUM;
1914 token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916 if (input->mb_cur_max > 1)
1918 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1923 token->word_char = IS_WORD_CHAR (token->opr.c);
1928 if (syntax & RE_NEWLINE_ALT)
1929 token->type = OP_ALT;
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933 token->type = OP_ALT;
1936 token->type = OP_DUP_ASTERISK;
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_PLUS;
1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_QUESTION;
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948 token->type = OP_OPEN_DUP_NUM;
1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952 token->type = OP_CLOSE_DUP_NUM;
1955 if (syntax & RE_NO_BK_PARENS)
1956 token->type = OP_OPEN_SUBEXP;
1959 if (syntax & RE_NO_BK_PARENS)
1960 token->type = OP_CLOSE_SUBEXP;
1963 token->type = OP_OPEN_BRACKET;
1966 token->type = OP_PERIOD;
1969 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970 re_string_cur_idx (input) != 0)
1972 char prev = re_string_peek_byte (input, -1);
1973 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1976 token->type = ANCHOR;
1977 token->opr.ctx_type = LINE_FIRST;
1980 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981 re_string_cur_idx (input) + 1 != re_string_length (input))
1984 re_string_skip_bytes (input, 1);
1985 peek_token (&next, input, syntax);
1986 re_string_skip_bytes (input, -1);
1987 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_LAST;
1999 /* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function out of bracket expressions. */
2004 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2007 if (re_string_eoi (input))
2009 token->type = END_OF_RE;
2012 c = re_string_peek_byte (input, 0);
2015 #ifdef RE_ENABLE_I18N
2016 if (input->mb_cur_max > 1 &&
2017 !re_string_first_byte (input, re_string_cur_idx (input)))
2019 token->type = CHARACTER;
2022 #endif /* RE_ENABLE_I18N */
2024 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2025 && re_string_cur_idx (input) + 1 < re_string_length (input))
2027 /* In this case, '\' escape a character. */
2029 re_string_skip_bytes (input, 1);
2030 c2 = re_string_peek_byte (input, 0);
2032 token->type = CHARACTER;
2035 if (c == '[') /* '[' is a special char in a bracket exps. */
2039 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2040 c2 = re_string_peek_byte (input, 1);
2048 token->type = OP_OPEN_COLL_ELEM;
2051 token->type = OP_OPEN_EQUIV_CLASS;
2054 if (syntax & RE_CHAR_CLASSES)
2056 token->type = OP_OPEN_CHAR_CLASS;
2059 /* else fall through. */
2061 token->type = CHARACTER;
2071 token->type = OP_CHARSET_RANGE;
2074 token->type = OP_CLOSE_BRACKET;
2077 token->type = OP_NON_MATCH_LIST;
2080 token->type = CHARACTER;
2085 /* Functions for parser. */
2087 /* Entry point of the parser.
2088 Parse the regular expression REGEXP and return the structure tree.
2089 If an error is occured, ERR is set by error code, and return NULL.
2090 This function build the following tree, from regular expression <reg_exp>:
2096 CAT means concatenation.
2097 EOR means end of regular expression. */
2100 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2103 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104 bin_tree_t *tree, *eor, *root;
2105 re_token_t current_token;
2106 dfa->syntax = syntax;
2107 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2108 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2109 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113 root = create_tree (dfa, tree, eor, CONCAT);
2116 if (BE (eor == NULL || root == NULL, 0))
2124 /* This function build the following tree, from regular expression
2125 <branch1>|<branch2>:
2131 ALT means alternative, which represents the operator `|'. */
2134 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138 bin_tree_t *tree, *branch = NULL;
2139 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2140 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143 while (token->type == OP_ALT)
2145 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2146 if (token->type != OP_ALT && token->type != END_OF_RE
2147 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2150 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155 tree = create_tree (dfa, tree, branch, OP_ALT);
2156 if (BE (tree == NULL, 0))
2165 /* This function build the following tree, from regular expression
2172 CAT means concatenation. */
2175 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2178 bin_tree_t *tree, *exp;
2179 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2180 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 while (token->type != OP_ALT && token->type != END_OF_RE
2185 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2192 if (tree != NULL && exp != NULL)
2194 tree = create_tree (dfa, tree, exp, CONCAT);
2201 else if (tree == NULL)
2203 /* Otherwise exp == NULL, we don't need to create new tree. */
2208 /* This function build the following tree, from regular expression a*:
2215 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2216 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2220 switch (token->type)
2223 tree = create_token_tree (dfa, NULL, NULL, token);
2224 if (BE (tree == NULL, 0))
2229 #ifdef RE_ENABLE_I18N
2230 if (dfa->mb_cur_max > 1)
2232 while (!re_string_eoi (regexp)
2233 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2235 bin_tree_t *mbc_remain;
2236 fetch_token (token, regexp, syntax);
2237 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2238 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2239 if (BE (mbc_remain == NULL || tree == NULL, 0))
2248 case OP_OPEN_SUBEXP:
2249 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2250 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253 case OP_OPEN_BRACKET:
2254 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2255 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2264 dfa->used_bkref_map |= 1 << token->opr.idx;
2265 tree = create_token_tree (dfa, NULL, NULL, token);
2266 if (BE (tree == NULL, 0))
2272 dfa->has_mb_node = 1;
2274 case OP_OPEN_DUP_NUM:
2275 if (syntax & RE_CONTEXT_INVALID_DUP)
2281 case OP_DUP_ASTERISK:
2283 case OP_DUP_QUESTION:
2284 if (syntax & RE_CONTEXT_INVALID_OPS)
2289 else if (syntax & RE_CONTEXT_INDEP_OPS)
2291 fetch_token (token, regexp, syntax);
2292 return parse_expression (regexp, preg, token, syntax, nest, err);
2294 /* else fall through */
2295 case OP_CLOSE_SUBEXP:
2296 if ((token->type == OP_CLOSE_SUBEXP) &&
2297 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2302 /* else fall through */
2303 case OP_CLOSE_DUP_NUM:
2304 /* We treat it as a normal character. */
2306 /* Then we can these characters as normal characters. */
2307 token->type = CHARACTER;
2308 /* mb_partial and word_char bits should be initialized already
2310 tree = create_token_tree (dfa, NULL, NULL, token);
2311 if (BE (tree == NULL, 0))
2318 if ((token->opr.ctx_type
2319 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2320 && dfa->word_ops_used == 0)
2321 init_word_char (dfa);
2322 if (token->opr.ctx_type == WORD_DELIM
2323 || token->opr.ctx_type == NOT_WORD_DELIM)
2325 bin_tree_t *tree_first, *tree_last;
2326 if (token->opr.ctx_type == WORD_DELIM)
2328 token->opr.ctx_type = WORD_FIRST;
2329 tree_first = create_token_tree (dfa, NULL, NULL, token);
2330 token->opr.ctx_type = WORD_LAST;
2334 token->opr.ctx_type = INSIDE_WORD;
2335 tree_first = create_token_tree (dfa, NULL, NULL, token);
2336 token->opr.ctx_type = INSIDE_NOTWORD;
2338 tree_last = create_token_tree (dfa, NULL, NULL, token);
2339 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2340 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2348 tree = create_token_tree (dfa, NULL, NULL, token);
2349 if (BE (tree == NULL, 0))
2355 /* We must return here, since ANCHORs can't be followed
2356 by repetition operators.
2357 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2358 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2359 fetch_token (token, regexp, syntax);
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2368 if (dfa->mb_cur_max > 1)
2369 dfa->has_mb_node = 1;
2373 tree = build_charclass_op (dfa, regexp->trans,
2374 (const unsigned char *) "alnum",
2375 (const unsigned char *) "_",
2376 token->type == OP_NOTWORD, err);
2377 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 (const unsigned char *) "space",
2384 (const unsigned char *) "",
2385 token->type == OP_NOTSPACE, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 /* Must not happen? */
2402 fetch_token (token, regexp, syntax);
2404 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2405 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2407 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2408 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2410 /* In BRE consecutive duplications are not allowed. */
2411 if ((syntax & RE_CONTEXT_INVALID_DUP)
2412 && (token->type == OP_DUP_ASTERISK
2413 || token->type == OP_OPEN_DUP_NUM))
2423 /* This function build the following tree, from regular expression
2431 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2432 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2434 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2437 cur_nsub = preg->re_nsub++;
2439 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2441 /* The subexpression may be a null string. */
2442 if (token->type == OP_CLOSE_SUBEXP)
2446 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2447 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2449 if (BE (*err != REG_NOERROR, 0))
2453 if (cur_nsub <= '9' - '1')
2454 dfa->completed_bkref_map |= 1 << cur_nsub;
2456 tree = create_tree (dfa, tree, NULL, SUBEXP);
2457 if (BE (tree == NULL, 0))
2462 tree->token.opr.idx = cur_nsub;
2466 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2469 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2470 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2472 bin_tree_t *tree = NULL, *old_tree = NULL;
2473 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2474 re_token_t start_token = *token;
2476 if (token->type == OP_OPEN_DUP_NUM)
2479 start = fetch_number (regexp, token, syntax);
2480 if (start == REG_MISSING)
2482 if (token->type == CHARACTER && token->opr.c == ',')
2483 start = 0; /* We treat "{,m}" as "{0,m}". */
2486 *err = REG_BADBR; /* <re>{} is invalid. */
2490 if (BE (start != REG_ERROR, 1))
2492 /* We treat "{n}" as "{n,n}". */
2493 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2494 : ((token->type == CHARACTER && token->opr.c == ',')
2495 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2497 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2499 /* Invalid sequence. */
2500 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2502 if (token->type == END_OF_RE)
2510 /* If the syntax bit is set, rollback. */
2511 re_string_set_index (regexp, start_idx);
2512 *token = start_token;
2513 token->type = CHARACTER;
2514 /* mb_partial and word_char bits should be already initialized by
2519 if (BE (end != REG_MISSING && start > end, 0))
2521 /* First number greater than second. */
2528 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2529 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2532 fetch_token (token, regexp, syntax);
2534 if (BE (elem == NULL, 0))
2536 if (BE (start == 0 && end == 0, 0))
2538 postorder (elem, free_tree, NULL);
2542 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2543 if (BE (start > 0, 0))
2546 for (i = 2; i <= start; ++i)
2548 elem = duplicate_tree (elem, dfa);
2549 tree = create_tree (dfa, tree, elem, CONCAT);
2550 if (BE (elem == NULL || tree == NULL, 0))
2551 goto parse_dup_op_espace;
2557 /* Duplicate ELEM before it is marked optional. */
2558 elem = duplicate_tree (elem, dfa);
2564 if (elem->token.type == SUBEXP)
2565 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2567 tree = create_tree (dfa, elem, NULL,
2568 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2569 if (BE (tree == NULL, 0))
2570 goto parse_dup_op_espace;
2572 /* This loop is actually executed only when end != REG_MISSING,
2573 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2574 already created the start+1-th copy. */
2575 if ((Idx) -1 < 0 || end != REG_MISSING)
2576 for (i = start + 2; i <= end; ++i)
2578 elem = duplicate_tree (elem, dfa);
2579 tree = create_tree (dfa, tree, elem, CONCAT);
2580 if (BE (elem == NULL || tree == NULL, 0))
2581 goto parse_dup_op_espace;
2583 tree = create_tree (dfa, tree, NULL, OP_ALT);
2584 if (BE (tree == NULL, 0))
2585 goto parse_dup_op_espace;
2589 tree = create_tree (dfa, old_tree, tree, CONCAT);
2593 parse_dup_op_espace:
2598 /* Size of the names for collating symbol/equivalence_class/character_class.
2599 I'm not sure, but maybe enough. */
2600 #define BRACKET_NAME_BUF_SIZE 32
2603 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2604 Build the range expression which starts from START_ELEM, and ends
2605 at END_ELEM. The result are written to MBCSET and SBCSET.
2606 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2607 mbcset->range_ends, is a pointer argument sinse we may
2610 static reg_errcode_t
2612 # ifdef RE_ENABLE_I18N
2613 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2614 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2615 # else /* not RE_ENABLE_I18N */
2616 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2617 bracket_elem_t *end_elem)
2618 # endif /* not RE_ENABLE_I18N */
2620 unsigned int start_ch, end_ch;
2621 /* Equivalence Classes and Character Classes can't be a range start/end. */
2622 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2623 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2627 /* We can handle no multi character collating elements without libc
2629 if (BE ((start_elem->type == COLL_SYM
2630 && strlen ((char *) start_elem->opr.name) > 1)
2631 || (end_elem->type == COLL_SYM
2632 && strlen ((char *) end_elem->opr.name) > 1), 0))
2633 return REG_ECOLLATE;
2635 # ifdef RE_ENABLE_I18N
2640 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2642 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2643 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2645 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2646 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2648 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2649 ? __btowc (start_ch) : start_elem->opr.wch);
2650 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2651 ? __btowc (end_ch) : end_elem->opr.wch);
2652 if (start_wc == WEOF || end_wc == WEOF)
2653 return REG_ECOLLATE;
2654 cmp_buf[0] = start_wc;
2655 cmp_buf[4] = end_wc;
2656 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2659 /* Got valid collation sequence values, add them as a new entry.
2660 However, for !_LIBC we have no collation elements: if the
2661 character set is single byte, the single byte character set
2662 that we build below suffices. parse_bracket_exp passes
2663 no MBCSET if dfa->mb_cur_max == 1. */
2666 /* Check the space of the arrays. */
2667 if (BE (*range_alloc == mbcset->nranges, 0))
2669 /* There is not enough space, need realloc. */
2670 wchar_t *new_array_start, *new_array_end;
2673 /* +1 in case of mbcset->nranges is 0. */
2674 new_nranges = 2 * mbcset->nranges + 1;
2675 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2676 are NULL if *range_alloc == 0. */
2677 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2679 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2682 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2685 mbcset->range_starts = new_array_start;
2686 mbcset->range_ends = new_array_end;
2687 *range_alloc = new_nranges;
2690 mbcset->range_starts[mbcset->nranges] = start_wc;
2691 mbcset->range_ends[mbcset->nranges++] = end_wc;
2694 /* Build the table for single byte characters. */
2695 for (wc = 0; wc < SBC_MAX; ++wc)
2698 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2699 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2700 bitset_set (sbcset, wc);
2703 # else /* not RE_ENABLE_I18N */
2706 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2707 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2709 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2710 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2712 if (start_ch > end_ch)
2714 /* Build the table for single byte characters. */
2715 for (ch = 0; ch < SBC_MAX; ++ch)
2716 if (start_ch <= ch && ch <= end_ch)
2717 bitset_set (sbcset, ch);
2719 # endif /* not RE_ENABLE_I18N */
2722 #endif /* not _LIBC */
2725 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2726 Build the collating element which is represented by NAME.
2727 The result are written to MBCSET and SBCSET.
2728 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2729 pointer argument since we may update it. */
2731 static reg_errcode_t
2733 build_collating_symbol (bitset_t sbcset,
2734 # ifdef RE_ENABLE_I18N
2735 re_charset_t *mbcset, Idx *coll_sym_alloc,
2737 const unsigned char *name)
2739 size_t name_len = strlen ((const char *) name);
2740 if (BE (name_len != 1, 0))
2741 return REG_ECOLLATE;
2744 bitset_set (sbcset, name[0]);
2748 #endif /* not _LIBC */
2750 /* This function parse bracket expression like "[abc]", "[a-c]",
2754 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2755 reg_syntax_t syntax, reg_errcode_t *err)
2758 const unsigned char *collseqmb;
2759 const char *collseqwc;
2762 const int32_t *symb_table;
2763 const unsigned char *extra;
2765 /* Local function for parse_bracket_exp used in _LIBC environement.
2766 Seek the collating symbol entry correspondings to NAME.
2767 Return the index of the symbol in the SYMB_TABLE. */
2770 __attribute ((always_inline))
2771 seek_collating_symbol_entry (name, name_len)
2772 const unsigned char *name;
2775 int32_t hash = elem_hash ((const char *) name, name_len);
2776 int32_t elem = hash % table_size;
2777 if (symb_table[2 * elem] != 0)
2779 int32_t second = hash % (table_size - 2) + 1;
2783 /* First compare the hashing value. */
2784 if (symb_table[2 * elem] == hash
2785 /* Compare the length of the name. */
2786 && name_len == extra[symb_table[2 * elem + 1]]
2787 /* Compare the name. */
2788 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2791 /* Yep, this is the entry. */
2798 while (symb_table[2 * elem] != 0);
2803 /* Local function for parse_bracket_exp used in _LIBC environement.
2804 Look up the collation sequence value of BR_ELEM.
2805 Return the value if succeeded, UINT_MAX otherwise. */
2807 auto inline unsigned int
2808 __attribute ((always_inline))
2809 lookup_collation_sequence_value (br_elem)
2810 bracket_elem_t *br_elem;
2812 if (br_elem->type == SB_CHAR)
2815 if (MB_CUR_MAX == 1)
2818 return collseqmb[br_elem->opr.ch];
2821 wint_t wc = __btowc (br_elem->opr.ch);
2822 return __collseq_table_lookup (collseqwc, wc);
2825 else if (br_elem->type == MB_CHAR)
2827 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829 else if (br_elem->type == COLL_SYM)
2831 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2835 elem = seek_collating_symbol_entry (br_elem->opr.name,
2837 if (symb_table[2 * elem] != 0)
2839 /* We found the entry. */
2840 idx = symb_table[2 * elem + 1];
2841 /* Skip the name of collating element name. */
2842 idx += 1 + extra[idx];
2843 /* Skip the byte sequence of the collating element. */
2844 idx += 1 + extra[idx];
2845 /* Adjust for the alignment. */
2846 idx = (idx + 3) & ~3;
2847 /* Skip the multibyte collation sequence value. */
2848 idx += sizeof (unsigned int);
2849 /* Skip the wide char sequence of the collating element. */
2850 idx += sizeof (unsigned int) *
2851 (1 + *(unsigned int *) (extra + idx));
2852 /* Return the collation sequence value. */
2853 return *(unsigned int *) (extra + idx);
2855 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2857 /* No valid character. Match it as a single byte
2859 return collseqmb[br_elem->opr.name[0]];
2862 else if (sym_name_len == 1)
2863 return collseqmb[br_elem->opr.name[0]];
2868 /* Local function for parse_bracket_exp used in _LIBC environement.
2869 Build the range expression which starts from START_ELEM, and ends
2870 at END_ELEM. The result are written to MBCSET and SBCSET.
2871 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2872 mbcset->range_ends, is a pointer argument sinse we may
2875 auto inline reg_errcode_t
2876 __attribute ((always_inline))
2877 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2878 re_charset_t *mbcset;
2881 bracket_elem_t *start_elem, *end_elem;
2884 uint32_t start_collseq;
2885 uint32_t end_collseq;
2887 /* Equivalence Classes and Character Classes can't be a range
2889 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2890 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2894 start_collseq = lookup_collation_sequence_value (start_elem);
2895 end_collseq = lookup_collation_sequence_value (end_elem);
2896 /* Check start/end collation sequence values. */
2897 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2898 return REG_ECOLLATE;
2899 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2902 /* Got valid collation sequence values, add them as a new entry.
2903 However, if we have no collation elements, and the character set
2904 is single byte, the single byte character set that we
2905 build below suffices. */
2906 if (nrules > 0 || dfa->mb_cur_max > 1)
2908 /* Check the space of the arrays. */
2909 if (BE (*range_alloc == mbcset->nranges, 0))
2911 /* There is not enough space, need realloc. */
2912 uint32_t *new_array_start;
2913 uint32_t *new_array_end;
2916 /* +1 in case of mbcset->nranges is 0. */
2917 new_nranges = 2 * mbcset->nranges + 1;
2918 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2923 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2926 mbcset->range_starts = new_array_start;
2927 mbcset->range_ends = new_array_end;
2928 *range_alloc = new_nranges;
2931 mbcset->range_starts[mbcset->nranges] = start_collseq;
2932 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935 /* Build the table for single byte characters. */
2936 for (ch = 0; ch < SBC_MAX; ch++)
2938 uint32_t ch_collseq;
2940 if (MB_CUR_MAX == 1)
2943 ch_collseq = collseqmb[ch];
2945 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947 bitset_set (sbcset, ch);
2952 /* Local function for parse_bracket_exp used in _LIBC environement.
2953 Build the collating element which is represented by NAME.
2954 The result are written to MBCSET and SBCSET.
2955 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956 pointer argument sinse we may update it. */
2958 auto inline reg_errcode_t
2959 __attribute ((always_inline))
2960 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2961 re_charset_t *mbcset;
2962 Idx *coll_sym_alloc;
2964 const unsigned char *name;
2967 size_t name_len = strlen ((const char *) name);
2970 elem = seek_collating_symbol_entry (name, name_len);
2971 if (symb_table[2 * elem] != 0)
2973 /* We found the entry. */
2974 idx = symb_table[2 * elem + 1];
2975 /* Skip the name of collating element name. */
2976 idx += 1 + extra[idx];
2978 else if (symb_table[2 * elem] == 0 && name_len == 1)
2980 /* No valid character, treat it as a normal
2982 bitset_set (sbcset, name[0]);
2986 return REG_ECOLLATE;
2988 /* Got valid collation sequence, add it as a new entry. */
2989 /* Check the space of the arrays. */
2990 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2992 /* Not enough, realloc it. */
2993 /* +1 in case of mbcset->ncoll_syms is 0. */
2994 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2995 /* Use realloc since mbcset->coll_syms is NULL
2997 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2998 new_coll_sym_alloc);
2999 if (BE (new_coll_syms == NULL, 0))
3001 mbcset->coll_syms = new_coll_syms;
3002 *coll_sym_alloc = new_coll_sym_alloc;
3004 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3009 if (BE (name_len != 1, 0))
3010 return REG_ECOLLATE;
3013 bitset_set (sbcset, name[0]);
3020 re_token_t br_token;
3021 re_bitset_ptr_t sbcset;
3022 #ifdef RE_ENABLE_I18N
3023 re_charset_t *mbcset;
3024 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3025 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3026 #endif /* not RE_ENABLE_I18N */
3027 bool non_match = false;
3028 bin_tree_t *work_tree;
3030 bool first_round = true;
3032 collseqmb = (const unsigned char *)
3033 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3034 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3040 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3041 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3042 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3043 _NL_COLLATE_SYMB_TABLEMB);
3044 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_SYMB_EXTRAMB);
3048 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049 #ifdef RE_ENABLE_I18N
3050 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3051 #endif /* RE_ENABLE_I18N */
3052 #ifdef RE_ENABLE_I18N
3053 if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 if (BE (sbcset == NULL, 0))
3056 #endif /* RE_ENABLE_I18N */
3062 token_len = peek_token_bracket (token, regexp, syntax);
3063 if (BE (token->type == END_OF_RE, 0))
3066 goto parse_bracket_exp_free_return;
3068 if (token->type == OP_NON_MATCH_LIST)
3070 #ifdef RE_ENABLE_I18N
3071 mbcset->non_match = 1;
3072 #endif /* not RE_ENABLE_I18N */
3074 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3075 bitset_set (sbcset, '\0');
3076 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3077 token_len = peek_token_bracket (token, regexp, syntax);
3078 if (BE (token->type == END_OF_RE, 0))
3081 goto parse_bracket_exp_free_return;
3085 /* We treat the first ']' as a normal character. */
3086 if (token->type == OP_CLOSE_BRACKET)
3087 token->type = CHARACTER;
3091 bracket_elem_t start_elem, end_elem;
3092 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3093 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3096 bool is_range_exp = false;
3099 start_elem.opr.name = start_name_buf;
3100 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3101 syntax, first_round);
3102 if (BE (ret != REG_NOERROR, 0))
3105 goto parse_bracket_exp_free_return;
3107 first_round = false;
3109 /* Get information about the next token. We need it in any case. */
3110 token_len = peek_token_bracket (token, regexp, syntax);
3112 /* Do not check for ranges if we know they are not allowed. */
3113 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3115 if (BE (token->type == END_OF_RE, 0))
3118 goto parse_bracket_exp_free_return;
3120 if (token->type == OP_CHARSET_RANGE)
3122 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3123 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3124 if (BE (token2.type == END_OF_RE, 0))
3127 goto parse_bracket_exp_free_return;
3129 if (token2.type == OP_CLOSE_BRACKET)
3131 /* We treat the last '-' as a normal character. */
3132 re_string_skip_bytes (regexp, -token_len);
3133 token->type = CHARACTER;
3136 is_range_exp = true;
3140 if (is_range_exp == true)
3142 end_elem.opr.name = end_name_buf;
3143 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3145 if (BE (ret != REG_NOERROR, 0))
3148 goto parse_bracket_exp_free_return;
3151 token_len = peek_token_bracket (token, regexp, syntax);
3154 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3155 &start_elem, &end_elem);
3157 # ifdef RE_ENABLE_I18N
3158 *err = build_range_exp (sbcset,
3159 dfa->mb_cur_max > 1 ? mbcset : NULL,
3160 &range_alloc, &start_elem, &end_elem);
3162 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3164 #endif /* RE_ENABLE_I18N */
3165 if (BE (*err != REG_NOERROR, 0))
3166 goto parse_bracket_exp_free_return;
3170 switch (start_elem.type)
3173 bitset_set (sbcset, start_elem.opr.ch);
3175 #ifdef RE_ENABLE_I18N
3177 /* Check whether the array has enough space. */
3178 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3180 wchar_t *new_mbchars;
3181 /* Not enough, realloc it. */
3182 /* +1 in case of mbcset->nmbchars is 0. */
3183 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3184 /* Use realloc since array is NULL if *alloc == 0. */
3185 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3187 if (BE (new_mbchars == NULL, 0))
3188 goto parse_bracket_exp_espace;
3189 mbcset->mbchars = new_mbchars;
3191 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3193 #endif /* RE_ENABLE_I18N */
3195 *err = build_equiv_class (sbcset,
3196 #ifdef RE_ENABLE_I18N
3197 mbcset, &equiv_class_alloc,
3198 #endif /* RE_ENABLE_I18N */
3199 start_elem.opr.name);
3200 if (BE (*err != REG_NOERROR, 0))
3201 goto parse_bracket_exp_free_return;
3204 *err = build_collating_symbol (sbcset,
3205 #ifdef RE_ENABLE_I18N
3206 mbcset, &coll_sym_alloc,
3207 #endif /* RE_ENABLE_I18N */
3208 start_elem.opr.name);
3209 if (BE (*err != REG_NOERROR, 0))
3210 goto parse_bracket_exp_free_return;
3213 *err = build_charclass (regexp->trans, sbcset,
3214 #ifdef RE_ENABLE_I18N
3215 mbcset, &char_class_alloc,
3216 #endif /* RE_ENABLE_I18N */
3217 start_elem.opr.name, syntax);
3218 if (BE (*err != REG_NOERROR, 0))
3219 goto parse_bracket_exp_free_return;
3226 if (BE (token->type == END_OF_RE, 0))
3229 goto parse_bracket_exp_free_return;
3231 if (token->type == OP_CLOSE_BRACKET)
3235 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3237 /* If it is non-matching list. */
3239 bitset_not (sbcset);
3241 #ifdef RE_ENABLE_I18N
3242 /* Ensure only single byte characters are set. */
3243 if (dfa->mb_cur_max > 1)
3244 bitset_mask (sbcset, dfa->sb_char);
3246 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3247 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3248 || mbcset->non_match)))
3250 bin_tree_t *mbc_tree;
3252 /* Build a tree for complex bracket. */
3253 dfa->has_mb_node = 1;
3254 br_token.type = COMPLEX_BRACKET;
3255 br_token.opr.mbcset = mbcset;
3256 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3257 if (BE (mbc_tree == NULL, 0))
3258 goto parse_bracket_exp_espace;
3259 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3260 if (sbcset[sbc_idx])
3262 /* If there are no bits set in sbcset, there is no point
3263 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3264 if (sbc_idx < BITSET_WORDS)
3266 /* Build a tree for simple bracket. */
3267 br_token.type = SIMPLE_BRACKET;
3268 br_token.opr.sbcset = sbcset;
3269 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270 if (BE (work_tree == NULL, 0))
3271 goto parse_bracket_exp_espace;
3273 /* Then join them by ALT node. */
3274 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3275 if (BE (work_tree == NULL, 0))
3276 goto parse_bracket_exp_espace;
3281 work_tree = mbc_tree;
3285 #endif /* not RE_ENABLE_I18N */
3287 #ifdef RE_ENABLE_I18N
3288 free_charset (mbcset);
3290 /* Build a tree for simple bracket. */
3291 br_token.type = SIMPLE_BRACKET;
3292 br_token.opr.sbcset = sbcset;
3293 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294 if (BE (work_tree == NULL, 0))
3295 goto parse_bracket_exp_espace;
3299 parse_bracket_exp_espace:
3301 parse_bracket_exp_free_return:
3303 #ifdef RE_ENABLE_I18N
3304 free_charset (mbcset);
3305 #endif /* RE_ENABLE_I18N */
3309 /* Parse an element in the bracket expression. */
3311 static reg_errcode_t
3312 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3313 re_token_t *token, int token_len, re_dfa_t *dfa,
3314 reg_syntax_t syntax, bool accept_hyphen)
3316 #ifdef RE_ENABLE_I18N
3318 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3319 if (cur_char_size > 1)
3321 elem->type = MB_CHAR;
3322 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3323 re_string_skip_bytes (regexp, cur_char_size);
3326 #endif /* RE_ENABLE_I18N */
3327 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3328 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3329 || token->type == OP_OPEN_EQUIV_CLASS)
3330 return parse_bracket_symbol (elem, regexp, token);
3331 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3333 /* A '-' must only appear as anything but a range indicator before
3334 the closing bracket. Everything else is an error. */
3336 (void) peek_token_bracket (&token2, regexp, syntax);
3337 if (token2.type != OP_CLOSE_BRACKET)
3338 /* The actual error value is not standardized since this whole
3339 case is undefined. But ERANGE makes good sense. */
3342 elem->type = SB_CHAR;
3343 elem->opr.ch = token->opr.c;
3347 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3348 such as [:<character_class>:], [.<collating_element>.], and
3349 [=<equivalent_class>=]. */
3351 static reg_errcode_t
3352 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3355 unsigned char ch, delim = token->opr.c;
3357 if (re_string_eoi(regexp))
3361 if (i >= BRACKET_NAME_BUF_SIZE)
3363 if (token->type == OP_OPEN_CHAR_CLASS)
3364 ch = re_string_fetch_byte_case (regexp);
3366 ch = re_string_fetch_byte (regexp);
3367 if (re_string_eoi(regexp))
3369 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3371 elem->opr.name[i] = ch;
3373 re_string_skip_bytes (regexp, 1);
3374 elem->opr.name[i] = '\0';
3375 switch (token->type)
3377 case OP_OPEN_COLL_ELEM:
3378 elem->type = COLL_SYM;
3380 case OP_OPEN_EQUIV_CLASS:
3381 elem->type = EQUIV_CLASS;
3383 case OP_OPEN_CHAR_CLASS:
3384 elem->type = CHAR_CLASS;
3392 /* Helper function for parse_bracket_exp.
3393 Build the equivalence class which is represented by NAME.
3394 The result are written to MBCSET and SBCSET.
3395 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3396 is a pointer argument sinse we may update it. */
3398 static reg_errcode_t
3399 #ifdef RE_ENABLE_I18N
3400 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3401 Idx *equiv_class_alloc, const unsigned char *name)
3402 #else /* not RE_ENABLE_I18N */
3403 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3404 #endif /* not RE_ENABLE_I18N */
3407 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3410 const int32_t *table, *indirect;
3411 const unsigned char *weights, *extra, *cp;
3412 unsigned char char_buf[2];
3416 /* This #include defines a local function! */
3417 # include <locale/weight.h>
3418 /* Calculate the index for equivalence class. */
3420 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3421 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422 _NL_COLLATE_WEIGHTMB);
3423 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424 _NL_COLLATE_EXTRAMB);
3425 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3426 _NL_COLLATE_INDIRECTMB);
3427 idx1 = findidx (&cp);
3428 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3429 /* This isn't a valid character. */
3430 return REG_ECOLLATE;
3432 /* Build single byte matcing table for this equivalence class. */
3433 char_buf[1] = (unsigned char) '\0';
3434 len = weights[idx1];
3435 for (ch = 0; ch < SBC_MAX; ++ch)
3439 idx2 = findidx (&cp);
3444 /* This isn't a valid character. */
3446 if (len == weights[idx2])
3449 while (cnt <= len &&
3450 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3454 bitset_set (sbcset, ch);
3457 /* Check whether the array has enough space. */
3458 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460 /* Not enough, realloc it. */
3461 /* +1 in case of mbcset->nequiv_classes is 0. */
3462 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3463 /* Use realloc since the array is NULL if *alloc == 0. */
3464 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466 new_equiv_class_alloc);
3467 if (BE (new_equiv_classes == NULL, 0))
3469 mbcset->equiv_classes = new_equiv_classes;
3470 *equiv_class_alloc = new_equiv_class_alloc;
3472 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3477 if (BE (strlen ((const char *) name) != 1, 0))
3478 return REG_ECOLLATE;
3479 bitset_set (sbcset, *name);
3484 /* Helper function for parse_bracket_exp.
3485 Build the character class which is represented by NAME.
3486 The result are written to MBCSET and SBCSET.
3487 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3488 is a pointer argument sinse we may update it. */
3490 static reg_errcode_t
3491 #ifdef RE_ENABLE_I18N
3492 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3493 re_charset_t *mbcset, Idx *char_class_alloc,
3494 const unsigned char *class_name, reg_syntax_t syntax)
3495 #else /* not RE_ENABLE_I18N */
3496 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3497 const unsigned char *class_name, reg_syntax_t syntax)
3498 #endif /* not RE_ENABLE_I18N */
3501 const char *name = (const char *) class_name;
3503 /* In case of REG_ICASE "upper" and "lower" match the both of
3504 upper and lower cases. */
3505 if ((syntax & RE_ICASE)
3506 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3509 #ifdef RE_ENABLE_I18N
3510 /* Check the space of the arrays. */
3511 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513 /* Not enough, realloc it. */
3514 /* +1 in case of mbcset->nchar_classes is 0. */
3515 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3516 /* Use realloc since array is NULL if *alloc == 0. */
3517 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3518 new_char_class_alloc);
3519 if (BE (new_char_classes == NULL, 0))
3521 mbcset->char_classes = new_char_classes;
3522 *char_class_alloc = new_char_class_alloc;
3524 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3525 #endif /* RE_ENABLE_I18N */
3527 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3529 if (BE (trans != NULL, 0)) \
3531 for (i = 0; i < SBC_MAX; ++i) \
3532 if (ctype_func (i)) \
3533 bitset_set (sbcset, trans[i]); \
3537 for (i = 0; i < SBC_MAX; ++i) \
3538 if (ctype_func (i)) \
3539 bitset_set (sbcset, i); \
3543 if (strcmp (name, "alnum") == 0)
3544 BUILD_CHARCLASS_LOOP (isalnum);
3545 else if (strcmp (name, "cntrl") == 0)
3546 BUILD_CHARCLASS_LOOP (iscntrl);
3547 else if (strcmp (name, "lower") == 0)
3548 BUILD_CHARCLASS_LOOP (islower);
3549 else if (strcmp (name, "space") == 0)
3550 BUILD_CHARCLASS_LOOP (isspace);
3551 else if (strcmp (name, "alpha") == 0)
3552 BUILD_CHARCLASS_LOOP (isalpha);
3553 else if (strcmp (name, "digit") == 0)
3554 BUILD_CHARCLASS_LOOP (isdigit);
3555 else if (strcmp (name, "print") == 0)
3556 BUILD_CHARCLASS_LOOP (isprint);
3557 else if (strcmp (name, "upper") == 0)
3558 BUILD_CHARCLASS_LOOP (isupper);
3559 else if (strcmp (name, "blank") == 0)
3560 BUILD_CHARCLASS_LOOP (isblank);
3561 else if (strcmp (name, "graph") == 0)
3562 BUILD_CHARCLASS_LOOP (isgraph);
3563 else if (strcmp (name, "punct") == 0)
3564 BUILD_CHARCLASS_LOOP (ispunct);
3565 else if (strcmp (name, "xdigit") == 0)
3566 BUILD_CHARCLASS_LOOP (isxdigit);
3574 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3575 const unsigned char *class_name,
3576 const unsigned char *extra, bool non_match,
3579 re_bitset_ptr_t sbcset;
3580 #ifdef RE_ENABLE_I18N
3581 re_charset_t *mbcset;
3583 #endif /* not RE_ENABLE_I18N */
3585 re_token_t br_token;
3588 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3589 #ifdef RE_ENABLE_I18N
3590 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3591 #endif /* RE_ENABLE_I18N */
3593 #ifdef RE_ENABLE_I18N
3594 if (BE (sbcset == NULL || mbcset == NULL, 0))
3595 #else /* not RE_ENABLE_I18N */
3596 if (BE (sbcset == NULL, 0))
3597 #endif /* not RE_ENABLE_I18N */
3605 #ifdef RE_ENABLE_I18N
3607 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3608 bitset_set(cset->sbcset, '\0');
3610 mbcset->non_match = 1;
3611 #endif /* not RE_ENABLE_I18N */
3614 /* We don't care the syntax in this case. */
3615 ret = build_charclass (trans, sbcset,
3616 #ifdef RE_ENABLE_I18N
3618 #endif /* RE_ENABLE_I18N */
3621 if (BE (ret != REG_NOERROR, 0))
3624 #ifdef RE_ENABLE_I18N
3625 free_charset (mbcset);
3626 #endif /* RE_ENABLE_I18N */
3630 /* \w match '_' also. */
3631 for (; *extra; extra++)
3632 bitset_set (sbcset, *extra);
3634 /* If it is non-matching list. */
3636 bitset_not (sbcset);
3638 #ifdef RE_ENABLE_I18N
3639 /* Ensure only single byte characters are set. */
3640 if (dfa->mb_cur_max > 1)
3641 bitset_mask (sbcset, dfa->sb_char);
3644 /* Build a tree for simple bracket. */
3645 br_token.type = SIMPLE_BRACKET;
3646 br_token.opr.sbcset = sbcset;
3647 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3648 if (BE (tree == NULL, 0))
3649 goto build_word_op_espace;
3651 #ifdef RE_ENABLE_I18N
3652 if (dfa->mb_cur_max > 1)
3654 bin_tree_t *mbc_tree;
3655 /* Build a tree for complex bracket. */
3656 br_token.type = COMPLEX_BRACKET;
3657 br_token.opr.mbcset = mbcset;
3658 dfa->has_mb_node = 1;
3659 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660 if (BE (mbc_tree == NULL, 0))
3661 goto build_word_op_espace;
3662 /* Then join them by ALT node. */
3663 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3664 if (BE (mbc_tree != NULL, 1))
3669 free_charset (mbcset);
3672 #else /* not RE_ENABLE_I18N */
3674 #endif /* not RE_ENABLE_I18N */
3676 build_word_op_espace:
3678 #ifdef RE_ENABLE_I18N
3679 free_charset (mbcset);
3680 #endif /* RE_ENABLE_I18N */
3685 /* This is intended for the expressions like "a{1,3}".
3686 Fetch a number from `input', and return the number.
3687 Return REG_MISSING if the number field is empty like "{,1}".
3688 Return REG_ERROR if an error occurred. */
3691 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3693 Idx num = REG_MISSING;
3697 fetch_token (token, input, syntax);
3699 if (BE (token->type == END_OF_RE, 0))
3701 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3703 num = ((token->type != CHARACTER || c < '0' || '9' < c
3704 || num == REG_ERROR)
3706 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3707 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3712 #ifdef RE_ENABLE_I18N
3714 free_charset (re_charset_t *cset)
3716 re_free (cset->mbchars);
3718 re_free (cset->coll_syms);
3719 re_free (cset->equiv_classes);
3720 re_free (cset->range_starts);
3721 re_free (cset->range_ends);
3723 re_free (cset->char_classes);
3726 #endif /* RE_ENABLE_I18N */
3728 /* Functions for binary tree operation. */
3730 /* Create a tree node. */
3733 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3734 re_token_type_t type)
3738 return create_token_tree (dfa, left, right, &t);
3742 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743 const re_token_t *token)
3746 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3748 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3750 if (storage == NULL)
3752 storage->next = dfa->str_tree_storage;
3753 dfa->str_tree_storage = storage;
3754 dfa->str_tree_storage_idx = 0;
3756 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3758 tree->parent = NULL;
3760 tree->right = right;
3761 tree->token = *token;
3762 tree->token.duplicated = 0;
3763 tree->token.opt_subexp = 0;
3766 tree->node_idx = REG_MISSING;
3769 left->parent = tree;
3771 right->parent = tree;
3775 /* Mark the tree SRC as an optional subexpression.
3776 To be called from preorder or postorder. */
3778 static reg_errcode_t
3779 mark_opt_subexp (void *extra, bin_tree_t *node)
3781 Idx idx = (Idx) (long) extra;
3782 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3783 node->token.opt_subexp = 1;
3788 /* Free the allocated memory inside NODE. */
3791 free_token (re_token_t *node)
3793 #ifdef RE_ENABLE_I18N
3794 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3795 free_charset (node->opr.mbcset);
3797 #endif /* RE_ENABLE_I18N */
3798 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3799 re_free (node->opr.sbcset);
3802 /* Worker function for tree walking. Free the allocated memory inside NODE
3803 and its children. */
3805 static reg_errcode_t
3806 free_tree (void *extra, bin_tree_t *node)
3808 free_token (&node->token);
3813 /* Duplicate the node SRC, and return new node. This is a preorder
3814 visit similar to the one implemented by the generic visitor, but
3815 we need more infrastructure to maintain two parallel trees --- so,
3816 it's easier to duplicate. */
3819 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3821 const bin_tree_t *node;
3822 bin_tree_t *dup_root;
3823 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3825 for (node = root; ; )
3827 /* Create a new tree and link it back to the current parent. */
3828 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831 (*p_new)->parent = dup_node;
3832 (*p_new)->token.duplicated = 1;
3835 /* Go to the left node, or up and to the right. */
3839 p_new = &dup_node->left;
3843 const bin_tree_t *prev = NULL;
3844 while (node->right == prev || node->right == NULL)
3847 node = node->parent;
3848 dup_node = dup_node->parent;
3853 p_new = &dup_node->right;