1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 Idx length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, bool icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, false, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
642 re_comp (const char *s)
649 if (!re_comp_buf.re_buffer)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf.re_buffer)
656 fastmap = re_comp_buf.re_fastmap;
657 re_comp_buf.re_fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.re_fastmap = fastmap;
663 if (re_comp_buf.re_fastmap == NULL)
665 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
666 if (re_comp_buf.re_fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since `re_exec' always passes NULL for the `regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.re_newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
682 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
703 reg_errcode_t err = REG_NOERROR;
707 /* Initialize the pattern buffer. */
708 preg->re_fastmap_accurate = 0;
709 preg->re_syntax = syntax;
710 preg->re_not_bol = preg->re_not_eol = 0;
713 preg->re_can_be_null = 0;
714 preg->re_regs_allocated = REG_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->re_buffer;
718 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If buffer is null this
723 is a simple allocation. */
724 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
727 preg->re_allocated = sizeof (re_dfa_t);
728 preg->re_buffer = (unsigned char *) dfa;
730 preg->re_used = sizeof (re_dfa_t);
732 __libc_lock_init (dfa->lock);
734 err = init_dfa (dfa, length);
735 if (BE (err != REG_NOERROR, 0))
737 free_dfa_content (dfa);
738 preg->re_buffer = NULL;
739 preg->re_allocated = 0;
743 dfa->re_str = re_malloc (char, length + 1);
744 strncpy (dfa->re_str, pattern, length + 1);
747 err = re_string_construct (®exp, pattern, length, preg->re_translate,
748 syntax & REG_IGNORE_CASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->re_buffer = NULL;
756 preg->re_allocated = 0;
760 /* Parse the regular expression, and build a structure tree. */
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->re_buffer = NULL;
788 preg->re_allocated = 0;
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
798 init_dfa (re_dfa_t *dfa, Idx pat_len)
800 __re_size_t table_size;
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
815 if (0 < (Idx) -1 && table_size == 0)
818 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->mb_cur_max = MB_CUR_MAX;
823 if (dfa->mb_cur_max == 6
824 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
826 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name = nl_langinfo (CODESET);
832 codeset_name = getenv ("LC_ALL");
833 if (codeset_name == NULL || codeset_name[0] == '\0')
834 codeset_name = getenv ("LC_CTYPE");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LANG");
837 if (codeset_name == NULL)
839 else if (strchr (codeset_name, '.') != NULL)
840 codeset_name = strchr (codeset_name, '.') + 1;
843 if (strcasecmp (codeset_name, "UTF-8") == 0
844 || strcasecmp (codeset_name, "UTF8") == 0)
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa->map_notascii = 0;
852 #ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
856 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
862 if (BE (dfa->sb_char == NULL, 0))
865 /* Clear all bits by, then set those corresponding to single
867 bitset_empty (dfa->sb_char);
869 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
870 for (j = 0; j < UINT_BITS; ++j, ++ch)
872 wint_t wch = __btowc (ch);
874 dfa->sb_char[i] |= 1u << j;
876 if (isascii (ch) && wch != ch)
877 dfa->map_notascii = 1;
884 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
894 init_word_char (re_dfa_t *dfa)
897 dfa->word_ops_used = 1;
898 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
899 for (j = 0; j < UINT_BITS; ++j, ++ch)
900 if (isalnum (ch) || ch == '_')
901 dfa->word_char[i] |= 1u << j;
904 /* Free the work area which are only used while compiling. */
907 free_workarea_compile (regex_t *preg)
909 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
910 bin_tree_storage_t *storage, *next;
911 for (storage = dfa->str_tree_storage; storage; storage = next)
913 next = storage->next;
916 dfa->str_tree_storage = NULL;
917 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
918 dfa->str_tree = NULL;
919 re_free (dfa->org_indices);
920 dfa->org_indices = NULL;
923 /* Create initial states for all contexts. */
926 create_initial_state (re_dfa_t *dfa)
930 re_node_set init_nodes;
932 /* Initial states have the epsilon closure of the node which is
933 the first node of the regular expression. */
934 first = dfa->str_tree->first->node_idx;
935 dfa->init_node = first;
936 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
937 if (BE (err != REG_NOERROR, 0))
940 /* The back-references which are in initial states can epsilon transit,
941 since in this case all of the subexpressions can be null.
942 Then we add epsilon closures of the nodes which are the next nodes of
943 the back-references. */
944 if (dfa->nbackref > 0)
945 for (i = 0; i < init_nodes.nelem; ++i)
947 Idx node_idx = init_nodes.elems[i];
948 re_token_type_t type = dfa->nodes[node_idx].type;
951 if (type != OP_BACK_REF)
953 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
955 re_token_t *clexp_node;
956 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
957 if (clexp_node->type == OP_CLOSE_SUBEXP
958 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 if (clexp_idx == init_nodes.nelem)
964 if (type == OP_BACK_REF)
966 Idx dest_idx = dfa->edests[node_idx].elems[0];
967 if (!re_node_set_contains (&init_nodes, dest_idx))
969 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
975 /* It must be the first time to invoke acquire_state. */
976 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
977 /* We don't check ERR here, since the initial state must not be NULL. */
978 if (BE (dfa->init_state == NULL, 0))
980 if (dfa->init_state->has_constraint)
982 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
984 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
986 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
990 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
991 || dfa->init_state_begbuf == NULL, 0))
995 dfa->init_state_word = dfa->init_state_nl
996 = dfa->init_state_begbuf = dfa->init_state;
998 re_node_set_free (&init_nodes);
1002 #ifdef RE_ENABLE_I18N
1003 /* If it is possible to do searching in single byte encoding instead of UTF-8
1004 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1005 DFA nodes where needed. */
1008 optimize_utf8 (re_dfa_t *dfa)
1012 bool mb_chars = false;
1013 bool has_period = false;
1015 for (node = 0; node < dfa->nodes_len; ++node)
1016 switch (dfa->nodes[node].type)
1019 if (dfa->nodes[node].opr.c >= 0x80)
1023 switch (dfa->nodes[node].opr.idx)
1031 /* Word anchors etc. cannot be handled. */
1041 case OP_DUP_ASTERISK:
1042 case OP_OPEN_SUBEXP:
1043 case OP_CLOSE_SUBEXP:
1045 case COMPLEX_BRACKET:
1047 case SIMPLE_BRACKET:
1048 /* Just double check. */
1049 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1050 if (dfa->nodes[node].opr.sbcset[i])
1057 if (mb_chars || has_period)
1058 for (node = 0; node < dfa->nodes_len; ++node)
1060 if (dfa->nodes[node].type == CHARACTER
1061 && dfa->nodes[node].opr.c >= 0x80)
1062 dfa->nodes[node].mb_partial = 0;
1063 else if (dfa->nodes[node].type == OP_PERIOD)
1064 dfa->nodes[node].type = OP_UTF8_PERIOD;
1067 /* The search can be in single byte locale. */
1068 dfa->mb_cur_max = 1;
1070 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075 "eclosure", and "inveclosure". */
1077 static reg_errcode_t
1078 analyze (regex_t *preg)
1080 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1083 /* Allocate arrays. */
1084 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1085 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1086 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089 || dfa->eclosures == NULL, 0))
1092 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1093 if (dfa->subexp_map != NULL)
1096 for (i = 0; i < preg->re_nsub; i++)
1097 dfa->subexp_map[i] = i;
1098 preorder (dfa->str_tree, optimize_subexps, dfa);
1099 for (i = 0; i < preg->re_nsub; i++)
1100 if (dfa->subexp_map[i] != i)
1102 if (i == preg->re_nsub)
1104 free (dfa->subexp_map);
1105 dfa->subexp_map = NULL;
1109 ret = postorder (dfa->str_tree, lower_subexps, preg);
1110 if (BE (ret != REG_NOERROR, 0))
1112 ret = postorder (dfa->str_tree, calc_first, dfa);
1113 if (BE (ret != REG_NOERROR, 0))
1115 preorder (dfa->str_tree, calc_next, dfa);
1116 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1119 ret = calc_eclosure (dfa);
1120 if (BE (ret != REG_NOERROR, 0))
1123 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1125 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1128 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129 if (BE (dfa->inveclosures == NULL, 0))
1131 ret = calc_inveclosure (dfa);
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138 implement parse tree visits. Instead, we use parent pointers and
1139 some hairy code in these two functions. */
1140 static reg_errcode_t
1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1144 bin_tree_t *node, *prev;
1146 for (node = root; ; )
1148 /* Descend down the tree, preferably to the left (or to the right
1149 if that's the only child). */
1150 while (node->left || node->right)
1158 reg_errcode_t err = fn (extra, node);
1159 if (BE (err != REG_NOERROR, 0))
1161 if (node->parent == NULL)
1164 node = node->parent;
1166 /* Go up while we have a node that is reached from the right. */
1167 while (node->right == prev || node->right == NULL);
1172 static reg_errcode_t
1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1178 for (node = root; ; )
1180 reg_errcode_t err = fn (extra, node);
1181 if (BE (err != REG_NOERROR, 0))
1184 /* Go to the left node, or up and to the right. */
1189 bin_tree_t *prev = NULL;
1190 while (node->right == prev || node->right == NULL)
1193 node = node->parent;
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1204 backreferences as well. Requires a preorder visit. */
1205 static reg_errcode_t
1206 optimize_subexps (void *extra, bin_tree_t *node)
1208 re_dfa_t *dfa = (re_dfa_t *) extra;
1210 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1212 int idx = node->token.opr.idx;
1213 node->token.opr.idx = dfa->subexp_map[idx];
1214 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1217 else if (node->token.type == SUBEXP
1218 && node->left && node->left->token.type == SUBEXP)
1220 Idx other_idx = node->left->token.opr.idx;
1222 node->left = node->left->left;
1224 node->left->parent = node;
1226 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1228 dfa->used_bkref_map &= ~(1u << other_idx);
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1236 static reg_errcode_t
1237 lower_subexps (void *extra, bin_tree_t *node)
1239 regex_t *preg = (regex_t *) extra;
1240 reg_errcode_t err = REG_NOERROR;
1242 if (node->left && node->left->token.type == SUBEXP)
1244 node->left = lower_subexp (&err, preg, node->left);
1246 node->left->parent = node;
1248 if (node->right && node->right->token.type == SUBEXP)
1250 node->right = lower_subexp (&err, preg, node->right);
1252 node->right->parent = node;
1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1261 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1262 bin_tree_t *body = node->left;
1263 bin_tree_t *op, *cls, *tree1, *tree;
1266 /* We do not optimize empty subexpressions, because otherwise we may
1267 have bad CONCAT nodes with NULL children. This is obviously not
1268 very common, so we do not lose much. An example that triggers
1269 this case is the sed "script" /\(\)/x. */
1270 && node->left != NULL
1271 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1272 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1275 /* Convert the SUBEXP node to the concatenation of an
1276 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1277 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1278 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1279 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1280 tree = create_tree (dfa, op, tree1, CONCAT);
1281 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1287 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1288 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1292 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1293 nodes. Requires a postorder visit. */
1294 static reg_errcode_t
1295 calc_first (void *extra, bin_tree_t *node)
1297 re_dfa_t *dfa = (re_dfa_t *) extra;
1298 if (node->token.type == CONCAT)
1300 node->first = node->left->first;
1301 node->node_idx = node->left->node_idx;
1306 node->node_idx = re_dfa_add_node (dfa, node->token);
1307 if (BE (node->node_idx == REG_MISSING, 0))
1313 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1314 static reg_errcode_t
1315 calc_next (void *extra, bin_tree_t *node)
1317 switch (node->token.type)
1319 case OP_DUP_ASTERISK:
1320 node->left->next = node;
1323 node->left->next = node->right->first;
1324 node->right->next = node->next;
1328 node->left->next = node->next;
1330 node->right->next = node->next;
1336 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1337 static reg_errcode_t
1338 link_nfa_nodes (void *extra, bin_tree_t *node)
1340 re_dfa_t *dfa = (re_dfa_t *) extra;
1341 Idx idx = node->node_idx;
1342 reg_errcode_t err = REG_NOERROR;
1344 switch (node->token.type)
1350 assert (node->next == NULL);
1353 case OP_DUP_ASTERISK:
1357 dfa->has_plural_match = 1;
1358 if (node->left != NULL)
1359 left = node->left->first->node_idx;
1361 left = node->next->node_idx;
1362 if (node->right != NULL)
1363 right = node->right->first->node_idx;
1365 right = node->next->node_idx;
1366 assert (REG_VALID_INDEX (left));
1367 assert (REG_VALID_INDEX (right));
1368 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1373 case OP_OPEN_SUBEXP:
1374 case OP_CLOSE_SUBEXP:
1375 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1379 dfa->nexts[idx] = node->next->node_idx;
1380 if (node->token.type == OP_BACK_REF)
1381 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1385 assert (!IS_EPSILON_NODE (node->token.type));
1386 dfa->nexts[idx] = node->next->node_idx;
1393 /* Duplicate the epsilon closure of the node ROOT_NODE.
1394 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1395 to their own constraint. */
1397 static reg_errcode_t
1398 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1399 Idx top_clone_node, Idx root_node,
1400 unsigned int init_constraint)
1402 Idx org_node, clone_node;
1404 unsigned int constraint = init_constraint;
1405 for (org_node = top_org_node, clone_node = top_clone_node;;)
1407 Idx org_dest, clone_dest;
1408 if (dfa->nodes[org_node].type == OP_BACK_REF)
1410 /* If the back reference epsilon-transit, its destination must
1411 also have the constraint. Then duplicate the epsilon closure
1412 of the destination of the back reference, and store it in
1413 edests of the back reference. */
1414 org_dest = dfa->nexts[org_node];
1415 re_node_set_empty (dfa->edests + clone_node);
1416 clone_dest = duplicate_node (dfa, org_dest, constraint);
1417 if (BE (clone_dest == REG_MISSING, 0))
1419 dfa->nexts[clone_node] = dfa->nexts[org_node];
1420 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1424 else if (dfa->edests[org_node].nelem == 0)
1426 /* In case of the node can't epsilon-transit, don't duplicate the
1427 destination and store the original destination as the
1428 destination of the node. */
1429 dfa->nexts[clone_node] = dfa->nexts[org_node];
1432 else if (dfa->edests[org_node].nelem == 1)
1434 /* In case of the node can epsilon-transit, and it has only one
1436 org_dest = dfa->edests[org_node].elems[0];
1437 re_node_set_empty (dfa->edests + clone_node);
1438 if (dfa->nodes[org_node].type == ANCHOR)
1440 /* In case of the node has another constraint, append it. */
1441 if (org_node == root_node && clone_node != org_node)
1443 /* ...but if the node is root_node itself, it means the
1444 epsilon closure have a loop, then tie it to the
1445 destination of the root_node. */
1446 ok = re_node_set_insert (dfa->edests + clone_node,
1452 constraint |= dfa->nodes[org_node].opr.ctx_type;
1454 clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 if (BE (clone_dest == REG_MISSING, 0))
1457 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1461 else /* dfa->edests[org_node].nelem == 2 */
1463 /* In case of the node can epsilon-transit, and it has two
1464 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1465 org_dest = dfa->edests[org_node].elems[0];
1466 re_node_set_empty (dfa->edests + clone_node);
1467 /* Search for a duplicated node which satisfies the constraint. */
1468 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1469 if (clone_dest == REG_MISSING)
1471 /* There are no such a duplicated node, create a new one. */
1473 clone_dest = duplicate_node (dfa, org_dest, constraint);
1474 if (BE (clone_dest == REG_MISSING, 0))
1476 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1479 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1480 root_node, constraint);
1481 if (BE (err != REG_NOERROR, 0))
1486 /* There are a duplicated node which satisfy the constraint,
1487 use it to avoid infinite loop. */
1488 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493 org_dest = dfa->edests[org_node].elems[1];
1494 clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 if (BE (clone_dest == REG_MISSING, 0))
1497 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501 org_node = org_dest;
1502 clone_node = clone_dest;
1507 /* Search for a node which is duplicated from the node ORG_NODE, and
1508 satisfies the constraint CONSTRAINT. */
1511 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1512 unsigned int constraint)
1515 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1517 if (org_node == dfa->org_indices[idx]
1518 && constraint == dfa->nodes[idx].constraint)
1519 return idx; /* Found. */
1521 return REG_MISSING; /* Not found. */
1524 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1525 Return the index of the new node, or REG_MISSING if insufficient storage is
1529 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1531 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1532 if (BE (dup_idx != REG_MISSING, 1))
1534 dfa->nodes[dup_idx].constraint = constraint;
1535 if (dfa->nodes[org_idx].type == ANCHOR)
1536 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1537 dfa->nodes[dup_idx].duplicated = 1;
1539 /* Store the index of the original node. */
1540 dfa->org_indices[dup_idx] = org_idx;
1545 static reg_errcode_t
1546 calc_inveclosure (re_dfa_t *dfa)
1550 for (idx = 0; idx < dfa->nodes_len; ++idx)
1551 re_node_set_init_empty (dfa->inveclosures + idx);
1553 for (src = 0; src < dfa->nodes_len; ++src)
1555 Idx *elems = dfa->eclosures[src].elems;
1556 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1558 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1567 /* Calculate "eclosure" for all the node in DFA. */
1569 static reg_errcode_t
1570 calc_eclosure (re_dfa_t *dfa)
1575 assert (dfa->nodes_len > 0);
1578 /* For each nodes, calculate epsilon closure. */
1579 for (node_idx = 0; ; ++node_idx)
1582 re_node_set eclosure_elem;
1583 if (node_idx == dfa->nodes_len)
1592 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1595 /* If we have already calculated, skip it. */
1596 if (dfa->eclosures[node_idx].nelem != 0)
1598 /* Calculate epsilon closure of `node_idx'. */
1599 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1600 if (BE (err != REG_NOERROR, 0))
1603 if (dfa->eclosures[node_idx].nelem == 0)
1606 re_node_set_free (&eclosure_elem);
1612 /* Calculate epsilon closure of NODE. */
1614 static reg_errcode_t
1615 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1618 unsigned int constraint;
1622 re_node_set eclosure;
1624 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1625 if (BE (err != REG_NOERROR, 0))
1628 /* This indicates that we are calculating this node now.
1629 We reference this value to avoid infinite loop. */
1630 dfa->eclosures[node].nelem = REG_MISSING;
1632 constraint = ((dfa->nodes[node].type == ANCHOR)
1633 ? dfa->nodes[node].opr.ctx_type : 0);
1634 /* If the current node has constraints, duplicate all nodes.
1635 Since they must inherit the constraints. */
1637 && dfa->edests[node].nelem
1638 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1640 Idx org_node, cur_node;
1641 org_node = cur_node = node;
1642 err = duplicate_node_closure (dfa, node, node, node, constraint);
1643 if (BE (err != REG_NOERROR, 0))
1647 /* Expand each epsilon destination nodes. */
1648 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1649 for (i = 0; i < dfa->edests[node].nelem; ++i)
1651 re_node_set eclosure_elem;
1652 Idx edest = dfa->edests[node].elems[i];
1653 /* If calculating the epsilon closure of `edest' is in progress,
1654 return intermediate result. */
1655 if (dfa->eclosures[edest].nelem == REG_MISSING)
1660 /* If we haven't calculated the epsilon closure of `edest' yet,
1661 calculate now. Otherwise use calculated epsilon closure. */
1662 if (dfa->eclosures[edest].nelem == 0)
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1665 if (BE (err != REG_NOERROR, 0))
1669 eclosure_elem = dfa->eclosures[edest];
1670 /* Merge the epsilon closure of `edest'. */
1671 re_node_set_merge (&eclosure, &eclosure_elem);
1672 /* If the epsilon closure of `edest' is incomplete,
1673 the epsilon closure of this node is also incomplete. */
1674 if (dfa->eclosures[edest].nelem == 0)
1677 re_node_set_free (&eclosure_elem);
1681 /* Epsilon closures include itself. */
1682 ok = re_node_set_insert (&eclosure, node);
1685 if (incomplete && !root)
1686 dfa->eclosures[node].nelem = 0;
1688 dfa->eclosures[node] = eclosure;
1689 *new_set = eclosure;
1693 /* Functions for token which are used in the parser. */
1695 /* Fetch a token from INPUT.
1696 We must not use this function inside bracket expressions. */
1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1701 re_string_skip_bytes (input, peek_token (result, input, syntax));
1704 /* Peek a token from INPUT, and return the length of the token.
1705 We must not use this function inside bracket expressions. */
1708 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1712 if (re_string_eoi (input))
1714 token->type = END_OF_RE;
1718 c = re_string_peek_byte (input, 0);
1721 token->word_char = 0;
1722 #ifdef RE_ENABLE_I18N
1723 token->mb_partial = 0;
1724 if (input->mb_cur_max > 1 &&
1725 !re_string_first_byte (input, re_string_cur_idx (input)))
1727 token->type = CHARACTER;
1728 token->mb_partial = 1;
1735 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1737 token->type = BACK_SLASH;
1741 c2 = re_string_peek_byte_case (input, 1);
1743 token->type = CHARACTER;
1744 #ifdef RE_ENABLE_I18N
1745 if (input->mb_cur_max > 1)
1747 wint_t wc = re_string_wchar_at (input,
1748 re_string_cur_idx (input) + 1);
1749 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1753 token->word_char = IS_WORD_CHAR (c2) != 0;
1758 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1759 token->type = OP_ALT;
1761 case '1': case '2': case '3': case '4': case '5':
1762 case '6': case '7': case '8': case '9':
1763 if (!(syntax & REG_NO_BK_REFS))
1765 token->type = OP_BACK_REF;
1766 token->opr.idx = c2 - '1';
1770 if (!(syntax & REG_NO_GNU_OPS))
1772 token->type = ANCHOR;
1773 token->opr.ctx_type = WORD_FIRST;
1777 if (!(syntax & REG_NO_GNU_OPS))
1779 token->type = ANCHOR;
1780 token->opr.ctx_type = WORD_LAST;
1784 if (!(syntax & REG_NO_GNU_OPS))
1786 token->type = ANCHOR;
1787 token->opr.ctx_type = WORD_DELIM;
1791 if (!(syntax & REG_NO_GNU_OPS))
1793 token->type = ANCHOR;
1794 token->opr.ctx_type = NOT_WORD_DELIM;
1798 if (!(syntax & REG_NO_GNU_OPS))
1799 token->type = OP_WORD;
1802 if (!(syntax & REG_NO_GNU_OPS))
1803 token->type = OP_NOTWORD;
1806 if (!(syntax & REG_NO_GNU_OPS))
1807 token->type = OP_SPACE;
1810 if (!(syntax & REG_NO_GNU_OPS))
1811 token->type = OP_NOTSPACE;
1814 if (!(syntax & REG_NO_GNU_OPS))
1816 token->type = ANCHOR;
1817 token->opr.ctx_type = BUF_FIRST;
1821 if (!(syntax & REG_NO_GNU_OPS))
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = BUF_LAST;
1828 if (!(syntax & REG_NO_BK_PARENS))
1829 token->type = OP_OPEN_SUBEXP;
1832 if (!(syntax & REG_NO_BK_PARENS))
1833 token->type = OP_CLOSE_SUBEXP;
1836 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1837 token->type = OP_DUP_PLUS;
1840 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1841 token->type = OP_DUP_QUESTION;
1844 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1845 token->type = OP_OPEN_DUP_NUM;
1848 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1849 token->type = OP_CLOSE_DUP_NUM;
1857 token->type = CHARACTER;
1858 #ifdef RE_ENABLE_I18N
1859 if (input->mb_cur_max > 1)
1861 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1862 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1866 token->word_char = IS_WORD_CHAR (token->opr.c);
1871 if (syntax & REG_NEWLINE_ALT)
1872 token->type = OP_ALT;
1875 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1876 token->type = OP_ALT;
1879 token->type = OP_DUP_ASTERISK;
1882 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1883 token->type = OP_DUP_PLUS;
1886 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1887 token->type = OP_DUP_QUESTION;
1890 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1891 token->type = OP_OPEN_DUP_NUM;
1894 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1895 token->type = OP_CLOSE_DUP_NUM;
1898 if (syntax & REG_NO_BK_PARENS)
1899 token->type = OP_OPEN_SUBEXP;
1902 if (syntax & REG_NO_BK_PARENS)
1903 token->type = OP_CLOSE_SUBEXP;
1906 token->type = OP_OPEN_BRACKET;
1909 token->type = OP_PERIOD;
1912 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1913 re_string_cur_idx (input) != 0)
1915 char prev = re_string_peek_byte (input, -1);
1916 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = LINE_FIRST;
1923 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1924 re_string_cur_idx (input) + 1 != re_string_length (input))
1927 re_string_skip_bytes (input, 1);
1928 peek_token (&next, input, syntax);
1929 re_string_skip_bytes (input, -1);
1930 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1933 token->type = ANCHOR;
1934 token->opr.ctx_type = LINE_LAST;
1942 /* Peek a token from INPUT, and return the length of the token.
1943 We must not use this function out of bracket expressions. */
1946 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1949 if (re_string_eoi (input))
1951 token->type = END_OF_RE;
1954 c = re_string_peek_byte (input, 0);
1957 #ifdef RE_ENABLE_I18N
1958 if (input->mb_cur_max > 1 &&
1959 !re_string_first_byte (input, re_string_cur_idx (input)))
1961 token->type = CHARACTER;
1964 #endif /* RE_ENABLE_I18N */
1966 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1967 && re_string_cur_idx (input) + 1 < re_string_length (input))
1969 /* In this case, '\' escape a character. */
1971 re_string_skip_bytes (input, 1);
1972 c2 = re_string_peek_byte (input, 0);
1974 token->type = CHARACTER;
1977 if (c == '[') /* '[' is a special char in a bracket exps. */
1981 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1982 c2 = re_string_peek_byte (input, 1);
1990 token->type = OP_OPEN_COLL_ELEM;
1993 token->type = OP_OPEN_EQUIV_CLASS;
1996 if (syntax & REG_CHAR_CLASSES)
1998 token->type = OP_OPEN_CHAR_CLASS;
2001 /* else fall through. */
2003 token->type = CHARACTER;
2013 token->type = OP_CHARSET_RANGE;
2016 token->type = OP_CLOSE_BRACKET;
2019 token->type = OP_NON_MATCH_LIST;
2022 token->type = CHARACTER;
2027 /* Functions for parser. */
2029 /* Entry point of the parser.
2030 Parse the regular expression REGEXP and return the structure tree.
2031 If an error is occured, ERR is set by error code, and return NULL.
2032 This function build the following tree, from regular expression <reg_exp>:
2038 CAT means concatenation.
2039 EOR means end of regular expression. */
2042 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2046 bin_tree_t *tree, *eor, *root;
2047 re_token_t current_token;
2048 dfa->syntax = syntax;
2049 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2050 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2051 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2053 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2055 root = create_tree (dfa, tree, eor, CONCAT);
2058 if (BE (eor == NULL || root == NULL, 0))
2066 /* This function build the following tree, from regular expression
2067 <branch1>|<branch2>:
2073 ALT means alternative, which represents the operator `|'. */
2076 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2077 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2079 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2080 bin_tree_t *tree, *branch = NULL;
2081 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2082 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085 while (token->type == OP_ALT)
2087 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2088 if (token->type != OP_ALT && token->type != END_OF_RE
2089 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2091 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2092 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2097 tree = create_tree (dfa, tree, branch, OP_ALT);
2098 if (BE (tree == NULL, 0))
2107 /* This function build the following tree, from regular expression
2114 CAT means concatenation. */
2117 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2118 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2120 bin_tree_t *tree, *exp;
2121 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2122 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 while (token->type != OP_ALT && token->type != END_OF_RE
2127 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2129 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2130 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2134 if (tree != NULL && exp != NULL)
2136 tree = create_tree (dfa, tree, exp, CONCAT);
2143 else if (tree == NULL)
2145 /* Otherwise exp == NULL, we don't need to create new tree. */
2150 /* This function build the following tree, from regular expression a*:
2157 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2158 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2160 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2162 switch (token->type)
2165 tree = create_token_tree (dfa, NULL, NULL, token);
2166 if (BE (tree == NULL, 0))
2171 #ifdef RE_ENABLE_I18N
2172 if (dfa->mb_cur_max > 1)
2174 while (!re_string_eoi (regexp)
2175 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2177 bin_tree_t *mbc_remain;
2178 fetch_token (token, regexp, syntax);
2179 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2180 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2181 if (BE (mbc_remain == NULL || tree == NULL, 0))
2190 case OP_OPEN_SUBEXP:
2191 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 case OP_OPEN_BRACKET:
2196 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2201 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2206 dfa->used_bkref_map |= 1 << token->opr.idx;
2207 tree = create_token_tree (dfa, NULL, NULL, token);
2208 if (BE (tree == NULL, 0))
2214 dfa->has_mb_node = 1;
2216 case OP_OPEN_DUP_NUM:
2217 if (syntax & REG_CONTEXT_INVALID_DUP)
2223 case OP_DUP_ASTERISK:
2225 case OP_DUP_QUESTION:
2226 if (syntax & REG_CONTEXT_INVALID_OPS)
2231 else if (syntax & REG_CONTEXT_INDEP_OPS)
2233 fetch_token (token, regexp, syntax);
2234 return parse_expression (regexp, preg, token, syntax, nest, err);
2236 /* else fall through */
2237 case OP_CLOSE_SUBEXP:
2238 if ((token->type == OP_CLOSE_SUBEXP) &&
2239 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2244 /* else fall through */
2245 case OP_CLOSE_DUP_NUM:
2246 /* We treat it as a normal character. */
2248 /* Then we can these characters as normal characters. */
2249 token->type = CHARACTER;
2250 /* mb_partial and word_char bits should be initialized already
2252 tree = create_token_tree (dfa, NULL, NULL, token);
2253 if (BE (tree == NULL, 0))
2260 if ((token->opr.ctx_type
2261 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2262 && dfa->word_ops_used == 0)
2263 init_word_char (dfa);
2264 if (token->opr.ctx_type == WORD_DELIM
2265 || token->opr.ctx_type == NOT_WORD_DELIM)
2267 bin_tree_t *tree_first, *tree_last;
2268 if (token->opr.ctx_type == WORD_DELIM)
2270 token->opr.ctx_type = WORD_FIRST;
2271 tree_first = create_token_tree (dfa, NULL, NULL, token);
2272 token->opr.ctx_type = WORD_LAST;
2276 token->opr.ctx_type = INSIDE_WORD;
2277 tree_first = create_token_tree (dfa, NULL, NULL, token);
2278 token->opr.ctx_type = INSIDE_NOTWORD;
2280 tree_last = create_token_tree (dfa, NULL, NULL, token);
2281 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2282 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2290 tree = create_token_tree (dfa, NULL, NULL, token);
2291 if (BE (tree == NULL, 0))
2297 /* We must return here, since ANCHORs can't be followed
2298 by repetition operators.
2299 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2300 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2301 fetch_token (token, regexp, syntax);
2304 tree = create_token_tree (dfa, NULL, NULL, token);
2305 if (BE (tree == NULL, 0))
2310 if (dfa->mb_cur_max > 1)
2311 dfa->has_mb_node = 1;
2315 tree = build_charclass_op (dfa, regexp->trans,
2316 (const unsigned char *) "alnum",
2317 (const unsigned char *) "_",
2318 token->type == OP_NOTWORD, err);
2319 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 tree = build_charclass_op (dfa, regexp->trans,
2325 (const unsigned char *) "space",
2326 (const unsigned char *) "",
2327 token->type == OP_NOTSPACE, err);
2328 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2338 /* Must not happen? */
2344 fetch_token (token, regexp, syntax);
2346 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2347 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2349 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2350 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352 /* In BRE consecutive duplications are not allowed. */
2353 if ((syntax & REG_CONTEXT_INVALID_DUP)
2354 && (token->type == OP_DUP_ASTERISK
2355 || token->type == OP_OPEN_DUP_NUM))
2365 /* This function build the following tree, from regular expression
2373 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2374 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2376 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2379 cur_nsub = preg->re_nsub++;
2381 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2383 /* The subexpression may be a null string. */
2384 if (token->type == OP_CLOSE_SUBEXP)
2388 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2389 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2391 if (BE (*err != REG_NOERROR, 0))
2395 if (cur_nsub <= '9' - '1')
2396 dfa->completed_bkref_map |= 1 << cur_nsub;
2398 tree = create_tree (dfa, tree, NULL, SUBEXP);
2399 if (BE (tree == NULL, 0))
2404 tree->token.opr.idx = cur_nsub;
2408 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2411 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2412 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2414 bin_tree_t *tree = NULL, *old_tree = NULL;
2415 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2416 re_token_t start_token = *token;
2418 if (token->type == OP_OPEN_DUP_NUM)
2421 start = fetch_number (regexp, token, syntax);
2422 if (start == REG_MISSING)
2424 if (token->type == CHARACTER && token->opr.c == ',')
2425 start = 0; /* We treat "{,m}" as "{0,m}". */
2428 *err = REG_BADBR; /* <re>{} is invalid. */
2432 if (BE (start != REG_ERROR, 1))
2434 /* We treat "{n}" as "{n,n}". */
2435 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2436 : ((token->type == CHARACTER && token->opr.c == ',')
2437 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2439 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2441 /* Invalid sequence. */
2442 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2444 if (token->type == END_OF_RE)
2452 /* If the syntax bit is set, rollback. */
2453 re_string_set_index (regexp, start_idx);
2454 *token = start_token;
2455 token->type = CHARACTER;
2456 /* mb_partial and word_char bits should be already initialized by
2461 if (BE (end != REG_MISSING && start > end, 0))
2463 /* First number greater than second. */
2470 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2471 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2474 fetch_token (token, regexp, syntax);
2476 if (BE (elem == NULL, 0))
2478 if (BE (start == 0 && end == 0, 0))
2480 postorder (elem, free_tree, NULL);
2484 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2485 if (BE (start > 0, 0))
2488 for (i = 2; i <= start; ++i)
2490 elem = duplicate_tree (elem, dfa);
2491 tree = create_tree (dfa, tree, elem, CONCAT);
2492 if (BE (elem == NULL || tree == NULL, 0))
2493 goto parse_dup_op_espace;
2499 /* Duplicate ELEM before it is marked optional. */
2500 elem = duplicate_tree (elem, dfa);
2506 if (elem->token.type == SUBEXP)
2507 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2509 tree = create_tree (dfa, elem, NULL,
2510 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2511 if (BE (tree == NULL, 0))
2512 goto parse_dup_op_espace;
2514 /* This loop is actually executed only when end != REG_MISSING,
2515 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2516 already created the start+1-th copy. */
2517 if ((Idx) -1 < 0 || end != REG_MISSING)
2518 for (i = start + 2; i <= end; ++i)
2520 elem = duplicate_tree (elem, dfa);
2521 tree = create_tree (dfa, tree, elem, CONCAT);
2522 if (BE (elem == NULL || tree == NULL, 0))
2523 goto parse_dup_op_espace;
2525 tree = create_tree (dfa, tree, NULL, OP_ALT);
2526 if (BE (tree == NULL, 0))
2527 goto parse_dup_op_espace;
2531 tree = create_tree (dfa, old_tree, tree, CONCAT);
2535 parse_dup_op_espace:
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541 I'm not sure, but maybe enough. */
2542 #define BRACKET_NAME_BUF_SIZE 32
2545 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546 Build the range expression which starts from START_ELEM, and ends
2547 at END_ELEM. The result are written to MBCSET and SBCSET.
2548 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549 mbcset->range_ends, is a pointer argument sinse we may
2552 static reg_errcode_t
2553 build_range_exp (re_bitset_ptr_t sbcset,
2554 # ifdef RE_ENABLE_I18N
2555 re_charset_t *mbcset, Idx *range_alloc,
2557 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2559 unsigned int start_ch, end_ch;
2560 /* Equivalence Classes and Character Classes can't be a range start/end. */
2561 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2562 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2566 /* We can handle no multi character collating elements without libc
2568 if (BE ((start_elem->type == COLL_SYM
2569 && strlen ((char *) start_elem->opr.name) > 1)
2570 || (end_elem->type == COLL_SYM
2571 && strlen ((char *) end_elem->opr.name) > 1), 0))
2572 return REG_ECOLLATE;
2574 # ifdef RE_ENABLE_I18N
2577 wint_t start_wc, end_wc;
2578 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2580 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2581 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2583 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2584 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2586 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2587 ? __btowc (start_ch) : start_elem->opr.wch);
2588 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2589 ? __btowc (end_ch) : end_elem->opr.wch);
2590 if (start_wc == WEOF || end_wc == WEOF)
2591 return REG_ECOLLATE;
2592 cmp_buf[0] = start_wc;
2593 cmp_buf[4] = end_wc;
2594 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2597 /* Got valid collation sequence values, add them as a new entry.
2598 However, for !_LIBC we have no collation elements: if the
2599 character set is single byte, the single byte character set
2600 that we build below suffices. parse_bracket_exp passes
2601 no MBCSET if dfa->mb_cur_max == 1. */
2604 /* Check the space of the arrays. */
2605 if (BE (*range_alloc == mbcset->nranges, 0))
2607 /* There is not enough space, need realloc. */
2608 wchar_t *new_array_start, *new_array_end;
2611 /* +1 in case of mbcset->nranges is 0. */
2612 new_nranges = 2 * mbcset->nranges + 1;
2613 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2614 are NULL if *range_alloc == 0. */
2615 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2617 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2620 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2623 mbcset->range_starts = new_array_start;
2624 mbcset->range_ends = new_array_end;
2625 *range_alloc = new_nranges;
2628 mbcset->range_starts[mbcset->nranges] = start_wc;
2629 mbcset->range_ends[mbcset->nranges++] = end_wc;
2632 /* Build the table for single byte characters. */
2633 for (wc = 0; wc < SBC_MAX; ++wc)
2636 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2637 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2638 bitset_set (sbcset, wc);
2641 # else /* not RE_ENABLE_I18N */
2644 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2645 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2647 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2648 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2650 if (start_ch > end_ch)
2652 /* Build the table for single byte characters. */
2653 for (ch = 0; ch < SBC_MAX; ++ch)
2654 if (start_ch <= ch && ch <= end_ch)
2655 bitset_set (sbcset, ch);
2657 # endif /* not RE_ENABLE_I18N */
2660 #endif /* not _LIBC */
2663 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2664 Build the collating element which is represented by NAME.
2665 The result are written to MBCSET and SBCSET.
2666 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2667 pointer argument since we may update it. */
2669 static reg_errcode_t
2670 build_collating_symbol (re_bitset_ptr_t sbcset,
2671 # ifdef RE_ENABLE_I18N
2672 re_charset_t *mbcset, Idx *coll_sym_alloc,
2674 const unsigned char *name)
2676 size_t name_len = strlen ((const char *) name);
2677 if (BE (name_len != 1, 0))
2678 return REG_ECOLLATE;
2681 bitset_set (sbcset, name[0]);
2685 #endif /* not _LIBC */
2687 /* This function parse bracket expression like "[abc]", "[a-c]",
2691 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2692 reg_syntax_t syntax, reg_errcode_t *err)
2695 const unsigned char *collseqmb;
2696 const char *collseqwc;
2699 const int32_t *symb_table;
2700 const unsigned char *extra;
2702 /* Local function for parse_bracket_exp used in _LIBC environement.
2703 Seek the collating symbol entry correspondings to NAME.
2704 Return the index of the symbol in the SYMB_TABLE. */
2707 __attribute ((always_inline))
2708 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2710 int32_t hash = elem_hash ((const char *) name, name_len);
2711 int32_t elem = hash % table_size;
2712 int32_t second = hash % (table_size - 2);
2713 while (symb_table[2 * elem] != 0)
2715 /* First compare the hashing value. */
2716 if (symb_table[2 * elem] == hash
2717 /* Compare the length of the name. */
2718 && name_len == extra[symb_table[2 * elem + 1]]
2719 /* Compare the name. */
2720 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2723 /* Yep, this is the entry. */
2733 /* Local function for parse_bracket_exp used in _LIBC environement.
2734 Look up the collation sequence value of BR_ELEM.
2735 Return the value if succeeded, UINT_MAX otherwise. */
2737 auto inline unsigned int
2738 __attribute ((always_inline))
2739 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2741 if (br_elem->type == SB_CHAR)
2744 if (MB_CUR_MAX == 1)
2747 return collseqmb[br_elem->opr.ch];
2750 wint_t wc = __btowc (br_elem->opr.ch);
2751 return __collseq_table_lookup (collseqwc, wc);
2754 else if (br_elem->type == MB_CHAR)
2756 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2758 else if (br_elem->type == COLL_SYM)
2760 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2764 elem = seek_collating_symbol_entry (br_elem->opr.name,
2766 if (symb_table[2 * elem] != 0)
2768 /* We found the entry. */
2769 idx = symb_table[2 * elem + 1];
2770 /* Skip the name of collating element name. */
2771 idx += 1 + extra[idx];
2772 /* Skip the byte sequence of the collating element. */
2773 idx += 1 + extra[idx];
2774 /* Adjust for the alignment. */
2775 idx = (idx + 3) & ~3;
2776 /* Skip the multibyte collation sequence value. */
2777 idx += sizeof (unsigned int);
2778 /* Skip the wide char sequence of the collating element. */
2779 idx += sizeof (unsigned int) *
2780 (1 + *(unsigned int *) (extra + idx));
2781 /* Return the collation sequence value. */
2782 return *(unsigned int *) (extra + idx);
2784 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2786 /* No valid character. Match it as a single byte
2788 return collseqmb[br_elem->opr.name[0]];
2791 else if (sym_name_len == 1)
2792 return collseqmb[br_elem->opr.name[0]];
2797 /* Local function for parse_bracket_exp used in _LIBC environement.
2798 Build the range expression which starts from START_ELEM, and ends
2799 at END_ELEM. The result are written to MBCSET and SBCSET.
2800 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2801 mbcset->range_ends, is a pointer argument sinse we may
2804 auto inline reg_errcode_t
2805 __attribute ((always_inline))
2806 build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2808 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2811 uint32_t start_collseq;
2812 uint32_t end_collseq;
2814 /* Equivalence Classes and Character Classes can't be a range
2816 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2817 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2821 start_collseq = lookup_collation_sequence_value (start_elem);
2822 end_collseq = lookup_collation_sequence_value (end_elem);
2823 /* Check start/end collation sequence values. */
2824 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2825 return REG_ECOLLATE;
2826 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2829 /* Got valid collation sequence values, add them as a new entry.
2830 However, if we have no collation elements, and the character set
2831 is single byte, the single byte character set that we
2832 build below suffices. */
2833 if (nrules > 0 || dfa->mb_cur_max > 1)
2835 /* Check the space of the arrays. */
2836 if (BE (*range_alloc == mbcset->nranges, 0))
2838 /* There is not enough space, need realloc. */
2839 uint32_t *new_array_start;
2840 uint32_t *new_array_end;
2843 /* +1 in case of mbcset->nranges is 0. */
2844 new_nranges = 2 * mbcset->nranges + 1;
2845 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2847 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2850 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2853 mbcset->range_starts = new_array_start;
2854 mbcset->range_ends = new_array_end;
2855 *range_alloc = new_nranges;
2858 mbcset->range_starts[mbcset->nranges] = start_collseq;
2859 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2862 /* Build the table for single byte characters. */
2863 for (ch = 0; ch < SBC_MAX; ch++)
2865 uint32_t ch_collseq;
2867 if (MB_CUR_MAX == 1)
2870 ch_collseq = collseqmb[ch];
2872 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2873 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2874 bitset_set (sbcset, ch);
2879 /* Local function for parse_bracket_exp used in _LIBC environement.
2880 Build the collating element which is represented by NAME.
2881 The result are written to MBCSET and SBCSET.
2882 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2883 pointer argument sinse we may update it. */
2885 auto inline reg_errcode_t
2886 __attribute ((always_inline))
2887 build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2888 Idx *coll_sym_alloc, const unsigned char *name)
2891 size_t name_len = strlen ((const char *) name);
2894 elem = seek_collating_symbol_entry (name, name_len);
2895 if (symb_table[2 * elem] != 0)
2897 /* We found the entry. */
2898 idx = symb_table[2 * elem + 1];
2899 /* Skip the name of collating element name. */
2900 idx += 1 + extra[idx];
2902 else if (symb_table[2 * elem] == 0 && name_len == 1)
2904 /* No valid character, treat it as a normal
2906 bitset_set (sbcset, name[0]);
2910 return REG_ECOLLATE;
2912 /* Got valid collation sequence, add it as a new entry. */
2913 /* Check the space of the arrays. */
2914 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2916 /* Not enough, realloc it. */
2917 /* +1 in case of mbcset->ncoll_syms is 0. */
2918 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2919 /* Use realloc since mbcset->coll_syms is NULL
2921 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2922 new_coll_sym_alloc);
2923 if (BE (new_coll_syms == NULL, 0))
2925 mbcset->coll_syms = new_coll_syms;
2926 *coll_sym_alloc = new_coll_sym_alloc;
2928 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2933 if (BE (name_len != 1, 0))
2934 return REG_ECOLLATE;
2937 bitset_set (sbcset, name[0]);
2944 re_token_t br_token;
2945 re_bitset_ptr_t sbcset;
2946 #ifdef RE_ENABLE_I18N
2947 re_charset_t *mbcset;
2948 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2949 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2950 #endif /* not RE_ENABLE_I18N */
2951 bool non_match = false;
2952 bin_tree_t *work_tree;
2954 bool first_round = true;
2956 collseqmb = (const unsigned char *)
2957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2958 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2964 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2965 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2966 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2967 _NL_COLLATE_SYMB_TABLEMB);
2968 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2969 _NL_COLLATE_SYMB_EXTRAMB);
2972 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2973 #ifdef RE_ENABLE_I18N
2974 mbcset = re_calloc (re_charset_t, 1);
2975 #endif /* RE_ENABLE_I18N */
2976 #ifdef RE_ENABLE_I18N
2977 if (BE (sbcset == NULL || mbcset == NULL, 0))
2979 if (BE (sbcset == NULL, 0))
2980 #endif /* RE_ENABLE_I18N */
2986 token_len = peek_token_bracket (token, regexp, syntax);
2987 if (BE (token->type == END_OF_RE, 0))
2990 goto parse_bracket_exp_free_return;
2992 if (token->type == OP_NON_MATCH_LIST)
2994 #ifdef RE_ENABLE_I18N
2995 mbcset->non_match = 1;
2996 #endif /* not RE_ENABLE_I18N */
2998 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2999 bitset_set (sbcset, '\0');
3000 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3001 token_len = peek_token_bracket (token, regexp, syntax);
3002 if (BE (token->type == END_OF_RE, 0))
3005 goto parse_bracket_exp_free_return;
3009 /* We treat the first ']' as a normal character. */
3010 if (token->type == OP_CLOSE_BRACKET)
3011 token->type = CHARACTER;
3015 bracket_elem_t start_elem, end_elem;
3016 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3017 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3020 bool is_range_exp = false;
3023 start_elem.opr.name = start_name_buf;
3024 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3025 syntax, first_round);
3026 if (BE (ret != REG_NOERROR, 0))
3029 goto parse_bracket_exp_free_return;
3031 first_round = false;
3033 /* Get information about the next token. We need it in any case. */
3034 token_len = peek_token_bracket (token, regexp, syntax);
3036 /* Do not check for ranges if we know they are not allowed. */
3037 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3039 if (BE (token->type == END_OF_RE, 0))
3042 goto parse_bracket_exp_free_return;
3044 if (token->type == OP_CHARSET_RANGE)
3046 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3047 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3048 if (BE (token2.type == END_OF_RE, 0))
3051 goto parse_bracket_exp_free_return;
3053 if (token2.type == OP_CLOSE_BRACKET)
3055 /* We treat the last '-' as a normal character. */
3056 re_string_skip_bytes (regexp, -token_len);
3057 token->type = CHARACTER;
3060 is_range_exp = true;
3064 if (is_range_exp == true)
3066 end_elem.opr.name = end_name_buf;
3067 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3069 if (BE (ret != REG_NOERROR, 0))
3072 goto parse_bracket_exp_free_return;
3075 token_len = peek_token_bracket (token, regexp, syntax);
3078 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3079 &start_elem, &end_elem);
3081 # ifdef RE_ENABLE_I18N
3082 *err = build_range_exp (sbcset,
3083 dfa->mb_cur_max > 1 ? mbcset : NULL,
3084 &range_alloc, &start_elem, &end_elem);
3086 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3088 #endif /* RE_ENABLE_I18N */
3089 if (BE (*err != REG_NOERROR, 0))
3090 goto parse_bracket_exp_free_return;
3094 switch (start_elem.type)
3097 bitset_set (sbcset, start_elem.opr.ch);
3099 #ifdef RE_ENABLE_I18N
3101 /* Check whether the array has enough space. */
3102 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3104 wchar_t *new_mbchars;
3105 /* Not enough, realloc it. */
3106 /* +1 in case of mbcset->nmbchars is 0. */
3107 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3108 /* Use realloc since array is NULL if *alloc == 0. */
3109 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3111 if (BE (new_mbchars == NULL, 0))
3112 goto parse_bracket_exp_espace;
3113 mbcset->mbchars = new_mbchars;
3115 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3117 #endif /* RE_ENABLE_I18N */
3119 *err = build_equiv_class (sbcset,
3120 #ifdef RE_ENABLE_I18N
3121 mbcset, &equiv_class_alloc,
3122 #endif /* RE_ENABLE_I18N */
3123 start_elem.opr.name);
3124 if (BE (*err != REG_NOERROR, 0))
3125 goto parse_bracket_exp_free_return;
3128 *err = build_collating_symbol (sbcset,
3129 #ifdef RE_ENABLE_I18N
3130 mbcset, &coll_sym_alloc,
3131 #endif /* RE_ENABLE_I18N */
3132 start_elem.opr.name);
3133 if (BE (*err != REG_NOERROR, 0))
3134 goto parse_bracket_exp_free_return;
3137 *err = build_charclass (regexp->trans, sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset, &char_class_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 start_elem.opr.name, syntax);
3142 if (BE (*err != REG_NOERROR, 0))
3143 goto parse_bracket_exp_free_return;
3150 if (BE (token->type == END_OF_RE, 0))
3153 goto parse_bracket_exp_free_return;
3155 if (token->type == OP_CLOSE_BRACKET)
3159 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161 /* If it is non-matching list. */
3163 bitset_not (sbcset);
3165 #ifdef RE_ENABLE_I18N
3166 /* Ensure only single byte characters are set. */
3167 if (dfa->mb_cur_max > 1)
3168 bitset_mask (sbcset, dfa->sb_char);
3170 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3171 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3172 || mbcset->non_match)))
3174 bin_tree_t *mbc_tree;
3176 /* Build a tree for complex bracket. */
3177 dfa->has_mb_node = 1;
3178 br_token.type = COMPLEX_BRACKET;
3179 br_token.opr.mbcset = mbcset;
3180 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3181 if (BE (mbc_tree == NULL, 0))
3182 goto parse_bracket_exp_espace;
3183 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3184 if (sbcset[sbc_idx])
3186 /* If there are no bits set in sbcset, there is no point
3187 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3188 if (sbc_idx < BITSET_UINTS)
3190 /* Build a tree for simple bracket. */
3191 br_token.type = SIMPLE_BRACKET;
3192 br_token.opr.sbcset = sbcset;
3193 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3194 if (BE (work_tree == NULL, 0))
3195 goto parse_bracket_exp_espace;
3197 /* Then join them by ALT node. */
3198 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3199 if (BE (work_tree == NULL, 0))
3200 goto parse_bracket_exp_espace;
3205 work_tree = mbc_tree;
3209 #endif /* not RE_ENABLE_I18N */
3211 #ifdef RE_ENABLE_I18N
3212 free_charset (mbcset);
3214 /* Build a tree for simple bracket. */
3215 br_token.type = SIMPLE_BRACKET;
3216 br_token.opr.sbcset = sbcset;
3217 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3223 parse_bracket_exp_espace:
3225 parse_bracket_exp_free_return:
3227 #ifdef RE_ENABLE_I18N
3228 free_charset (mbcset);
3229 #endif /* RE_ENABLE_I18N */
3233 /* Parse an element in the bracket expression. */
3235 static reg_errcode_t
3236 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3237 re_token_t *token, int token_len, re_dfa_t *dfa,
3238 reg_syntax_t syntax, bool accept_hyphen)
3240 #ifdef RE_ENABLE_I18N
3242 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3243 if (cur_char_size > 1)
3245 elem->type = MB_CHAR;
3246 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3247 re_string_skip_bytes (regexp, cur_char_size);
3250 #endif /* RE_ENABLE_I18N */
3251 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3252 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3253 || token->type == OP_OPEN_EQUIV_CLASS)
3254 return parse_bracket_symbol (elem, regexp, token);
3255 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3257 /* A '-' must only appear as anything but a range indicator before
3258 the closing bracket. Everything else is an error. */
3260 (void) peek_token_bracket (&token2, regexp, syntax);
3261 if (token2.type != OP_CLOSE_BRACKET)
3262 /* The actual error value is not standardized since this whole
3263 case is undefined. But ERANGE makes good sense. */
3266 elem->type = SB_CHAR;
3267 elem->opr.ch = token->opr.c;
3271 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3272 such as [:<character_class>:], [.<collating_element>.], and
3273 [=<equivalent_class>=]. */
3275 static reg_errcode_t
3276 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3279 unsigned char ch, delim = token->opr.c;
3281 if (re_string_eoi(regexp))
3285 if (i >= BRACKET_NAME_BUF_SIZE)
3287 if (token->type == OP_OPEN_CHAR_CLASS)
3288 ch = re_string_fetch_byte_case (regexp);
3290 ch = re_string_fetch_byte (regexp);
3291 if (re_string_eoi(regexp))
3293 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3295 elem->opr.name[i] = ch;
3297 re_string_skip_bytes (regexp, 1);
3298 elem->opr.name[i] = '\0';
3299 switch (token->type)
3301 case OP_OPEN_COLL_ELEM:
3302 elem->type = COLL_SYM;
3304 case OP_OPEN_EQUIV_CLASS:
3305 elem->type = EQUIV_CLASS;
3307 case OP_OPEN_CHAR_CLASS:
3308 elem->type = CHAR_CLASS;
3316 /* Helper function for parse_bracket_exp.
3317 Build the equivalence class which is represented by NAME.
3318 The result are written to MBCSET and SBCSET.
3319 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3320 is a pointer argument sinse we may update it. */
3322 static reg_errcode_t
3323 build_equiv_class (re_bitset_ptr_t sbcset,
3324 #ifdef RE_ENABLE_I18N
3325 re_charset_t *mbcset, Idx *equiv_class_alloc,
3327 const unsigned char *name)
3330 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3333 const int32_t *table, *indirect;
3334 const unsigned char *weights, *extra, *cp;
3335 unsigned char char_buf[2];
3339 /* This #include defines a local function! */
3340 # include <locale/weight.h>
3341 /* Calculate the index for equivalence class. */
3343 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3344 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3345 _NL_COLLATE_WEIGHTMB);
3346 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3347 _NL_COLLATE_EXTRAMB);
3348 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3349 _NL_COLLATE_INDIRECTMB);
3350 idx1 = findidx (&cp);
3351 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3352 /* This isn't a valid character. */
3353 return REG_ECOLLATE;
3355 /* Build single byte matcing table for this equivalence class. */
3356 char_buf[1] = (unsigned char) '\0';
3357 len = weights[idx1];
3358 for (ch = 0; ch < SBC_MAX; ++ch)
3362 idx2 = findidx (&cp);
3367 /* This isn't a valid character. */
3369 if (len == weights[idx2])
3372 while (cnt <= len &&
3373 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3377 bitset_set (sbcset, ch);
3380 /* Check whether the array has enough space. */
3381 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3383 /* Not enough, realloc it. */
3384 /* +1 in case of mbcset->nequiv_classes is 0. */
3385 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3386 /* Use realloc since the array is NULL if *alloc == 0. */
3387 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3389 new_equiv_class_alloc);
3390 if (BE (new_equiv_classes == NULL, 0))
3392 mbcset->equiv_classes = new_equiv_classes;
3393 *equiv_class_alloc = new_equiv_class_alloc;
3395 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3400 if (BE (strlen ((const char *) name) != 1, 0))
3401 return REG_ECOLLATE;
3402 bitset_set (sbcset, *name);
3407 /* Helper function for parse_bracket_exp.
3408 Build the character class which is represented by NAME.
3409 The result are written to MBCSET and SBCSET.
3410 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3411 is a pointer argument sinse we may update it. */
3413 static reg_errcode_t
3414 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3415 #ifdef RE_ENABLE_I18N
3416 re_charset_t *mbcset, Idx *char_class_alloc,
3418 const unsigned char *class_name, reg_syntax_t syntax)
3421 const char *name = (const char *) class_name;
3423 /* In case of REG_ICASE "upper" and "lower" match the both of
3424 upper and lower cases. */
3425 if ((syntax & REG_IGNORE_CASE)
3426 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3429 #ifdef RE_ENABLE_I18N
3430 /* Check the space of the arrays. */
3431 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3433 /* Not enough, realloc it. */
3434 /* +1 in case of mbcset->nchar_classes is 0. */
3435 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3436 /* Use realloc since array is NULL if *alloc == 0. */
3437 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3438 new_char_class_alloc);
3439 if (BE (new_char_classes == NULL, 0))
3441 mbcset->char_classes = new_char_classes;
3442 *char_class_alloc = new_char_class_alloc;
3444 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3445 #endif /* RE_ENABLE_I18N */
3447 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3448 for (i = 0; i < SBC_MAX; ++i) \
3450 if (ctype_func (i)) \
3452 int ch = trans ? trans[i] : i; \
3453 bitset_set (sbcset, ch); \
3457 if (strcmp (name, "alnum") == 0)
3458 BUILD_CHARCLASS_LOOP (isalnum)
3459 else if (strcmp (name, "cntrl") == 0)
3460 BUILD_CHARCLASS_LOOP (iscntrl)
3461 else if (strcmp (name, "lower") == 0)
3462 BUILD_CHARCLASS_LOOP (islower)
3463 else if (strcmp (name, "space") == 0)
3464 BUILD_CHARCLASS_LOOP (isspace)
3465 else if (strcmp (name, "alpha") == 0)
3466 BUILD_CHARCLASS_LOOP (isalpha)
3467 else if (strcmp (name, "digit") == 0)
3468 BUILD_CHARCLASS_LOOP (isdigit)
3469 else if (strcmp (name, "print") == 0)
3470 BUILD_CHARCLASS_LOOP (isprint)
3471 else if (strcmp (name, "upper") == 0)
3472 BUILD_CHARCLASS_LOOP (isupper)
3473 else if (strcmp (name, "blank") == 0)
3474 BUILD_CHARCLASS_LOOP (isblank)
3475 else if (strcmp (name, "graph") == 0)
3476 BUILD_CHARCLASS_LOOP (isgraph)
3477 else if (strcmp (name, "punct") == 0)
3478 BUILD_CHARCLASS_LOOP (ispunct)
3479 else if (strcmp (name, "xdigit") == 0)
3480 BUILD_CHARCLASS_LOOP (isxdigit)
3488 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3489 const unsigned char *class_name,
3490 const unsigned char *extra,
3491 bool non_match, reg_errcode_t *err)
3493 re_bitset_ptr_t sbcset;
3494 #ifdef RE_ENABLE_I18N
3495 re_charset_t *mbcset;
3497 #endif /* not RE_ENABLE_I18N */
3499 re_token_t br_token;
3502 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3503 #ifdef RE_ENABLE_I18N
3504 mbcset = re_calloc (re_charset_t, 1);
3505 #endif /* RE_ENABLE_I18N */
3507 #ifdef RE_ENABLE_I18N
3508 if (BE (sbcset == NULL || mbcset == NULL, 0))
3509 #else /* not RE_ENABLE_I18N */
3510 if (BE (sbcset == NULL, 0))
3511 #endif /* not RE_ENABLE_I18N */
3519 #ifdef RE_ENABLE_I18N
3521 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3522 bitset_set(cset->sbcset, '\0');
3524 mbcset->non_match = 1;
3525 #endif /* not RE_ENABLE_I18N */
3528 /* We don't care the syntax in this case. */
3529 ret = build_charclass (trans, sbcset,
3530 #ifdef RE_ENABLE_I18N
3532 #endif /* RE_ENABLE_I18N */
3535 if (BE (ret != REG_NOERROR, 0))
3538 #ifdef RE_ENABLE_I18N
3539 free_charset (mbcset);
3540 #endif /* RE_ENABLE_I18N */
3544 /* \w match '_' also. */
3545 for (; *extra; extra++)
3546 bitset_set (sbcset, *extra);
3548 /* If it is non-matching list. */
3550 bitset_not (sbcset);
3552 #ifdef RE_ENABLE_I18N
3553 /* Ensure only single byte characters are set. */
3554 if (dfa->mb_cur_max > 1)
3555 bitset_mask (sbcset, dfa->sb_char);
3558 /* Build a tree for simple bracket. */
3559 br_token.type = SIMPLE_BRACKET;
3560 br_token.opr.sbcset = sbcset;
3561 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3562 if (BE (tree == NULL, 0))
3563 goto build_word_op_espace;
3565 #ifdef RE_ENABLE_I18N
3566 if (dfa->mb_cur_max > 1)
3568 bin_tree_t *mbc_tree;
3569 /* Build a tree for complex bracket. */
3570 br_token.type = COMPLEX_BRACKET;
3571 br_token.opr.mbcset = mbcset;
3572 dfa->has_mb_node = 1;
3573 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3574 if (BE (mbc_tree == NULL, 0))
3575 goto build_word_op_espace;
3576 /* Then join them by ALT node. */
3577 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3578 if (BE (mbc_tree != NULL, 1))
3583 free_charset (mbcset);
3586 #else /* not RE_ENABLE_I18N */
3588 #endif /* not RE_ENABLE_I18N */
3590 build_word_op_espace:
3592 #ifdef RE_ENABLE_I18N
3593 free_charset (mbcset);
3594 #endif /* RE_ENABLE_I18N */
3599 /* This is intended for the expressions like "a{1,3}".
3600 Fetch a number from `input', and return the number.
3601 Return REG_MISSING if the number field is empty like "{,1}".
3602 Return REG_ERROR if an error occurred. */
3605 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3607 Idx num = REG_MISSING;
3611 fetch_token (token, input, syntax);
3613 if (BE (token->type == END_OF_RE, 0))
3615 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3617 num = ((token->type != CHARACTER || c < '0' || '9' < c
3618 || num == REG_ERROR)
3620 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3621 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3626 #ifdef RE_ENABLE_I18N
3628 free_charset (re_charset_t *cset)
3630 re_free (cset->mbchars);
3632 re_free (cset->coll_syms);
3633 re_free (cset->equiv_classes);
3634 re_free (cset->range_starts);
3635 re_free (cset->range_ends);
3637 re_free (cset->char_classes);
3640 #endif /* RE_ENABLE_I18N */
3642 /* Functions for binary tree operation. */
3644 /* Create a tree node. */
3647 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3648 re_token_type_t type)
3652 return create_token_tree (dfa, left, right, &t);
3656 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657 const re_token_t *token)
3660 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3662 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3664 if (storage == NULL)
3666 storage->next = dfa->str_tree_storage;
3667 dfa->str_tree_storage = storage;
3668 dfa->str_tree_storage_idx = 0;
3670 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3672 tree->parent = NULL;
3674 tree->right = right;
3675 tree->token = *token;
3676 tree->token.duplicated = 0;
3677 tree->token.opt_subexp = 0;
3680 tree->node_idx = REG_MISSING;
3683 left->parent = tree;
3685 right->parent = tree;
3689 /* Mark the tree SRC as an optional subexpression.
3690 To be called from preorder or postorder. */
3692 static reg_errcode_t
3693 mark_opt_subexp (void *extra, bin_tree_t *node)
3695 Idx idx = (Idx) (long) extra;
3696 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3697 node->token.opt_subexp = 1;
3702 /* Free the allocated memory inside NODE. */
3705 free_token (re_token_t *node)
3707 #ifdef RE_ENABLE_I18N
3708 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3709 free_charset (node->opr.mbcset);
3711 #endif /* RE_ENABLE_I18N */
3712 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3713 re_free (node->opr.sbcset);
3716 /* Worker function for tree walking. Free the allocated memory inside NODE
3717 and its children. */
3719 static reg_errcode_t
3720 free_tree (void *extra, bin_tree_t *node)
3722 free_token (&node->token);
3727 /* Duplicate the node SRC, and return new node. This is a preorder
3728 visit similar to the one implemented by the generic visitor, but
3729 we need more infrastructure to maintain two parallel trees --- so,
3730 it's easier to duplicate. */
3733 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3735 const bin_tree_t *node;
3736 bin_tree_t *dup_root;
3737 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3739 for (node = root; ; )
3741 /* Create a new tree and link it back to the current parent. */
3742 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3745 (*p_new)->parent = dup_node;
3746 (*p_new)->token.duplicated = 1;
3749 /* Go to the left node, or up and to the right. */
3753 p_new = &dup_node->left;
3757 const bin_tree_t *prev = NULL;
3758 while (node->right == prev || node->right == NULL)
3761 node = node->parent;
3762 dup_node = dup_node->parent;
3767 p_new = &dup_node->right;