1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 int node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, 0, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
650 if (!re_comp_buf.re_buffer)
651 return gettext ("No previous regular expression");
655 if (re_comp_buf.re_buffer)
657 fastmap = re_comp_buf.re_fastmap;
658 re_comp_buf.re_fastmap = NULL;
659 __regfree (&re_comp_buf);
660 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
661 re_comp_buf.re_fastmap = fastmap;
664 if (re_comp_buf.re_fastmap == NULL)
666 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
667 if (re_comp_buf.re_fastmap == NULL)
668 return (char *) gettext (__re_error_msgid
669 + __re_error_msgid_idx[(int) REG_ESPACE]);
672 /* Since `re_exec' always passes NULL for the `regs' argument, we
673 don't need to initialize the pattern buffer fields which affect it. */
675 /* Match anchors at newlines. */
676 re_comp_buf.re_newline_anchor = 1;
678 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
683 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
684 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
688 libc_freeres_fn (free_mem)
690 __regfree (&re_comp_buf);
694 #endif /* _REGEX_RE_COMP */
696 /* Internal entry point.
697 Compile the regular expression PATTERN, whose length is LENGTH.
698 SYNTAX indicate regular expression's syntax. */
701 re_compile_internal (regex_t *preg, const char * pattern, int length,
704 reg_errcode_t err = REG_NOERROR;
708 /* Initialize the pattern buffer. */
709 preg->re_fastmap_accurate = 0;
710 preg->re_syntax = syntax;
711 preg->re_not_bol = preg->re_not_eol = 0;
714 preg->re_can_be_null = 0;
715 preg->re_regs_allocated = REG_UNALLOCATED;
717 /* Initialize the dfa. */
718 dfa = (re_dfa_t *) preg->re_buffer;
719 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
721 /* If zero allocated, but buffer is non-null, try to realloc
722 enough space. This loses if buffer's address is bogus, but
723 that is the user's responsibility. If buffer is null this
724 is a simple allocation. */
725 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
728 preg->re_allocated = sizeof (re_dfa_t);
729 preg->re_buffer = (unsigned char *) dfa;
731 preg->re_used = sizeof (re_dfa_t);
733 __libc_lock_init (dfa->lock);
735 err = init_dfa (dfa, length);
736 if (BE (err != REG_NOERROR, 0))
738 free_dfa_content (dfa);
739 preg->re_buffer = NULL;
740 preg->re_allocated = 0;
744 dfa->re_str = re_malloc (char, length + 1);
745 strncpy (dfa->re_str, pattern, length + 1);
748 err = re_string_construct (®exp, pattern, length, preg->re_translate,
749 syntax & REG_IGNORE_CASE, dfa);
750 if (BE (err != REG_NOERROR, 0))
752 re_compile_internal_free_return:
753 free_workarea_compile (preg);
754 re_string_destruct (®exp);
755 free_dfa_content (dfa);
756 preg->re_buffer = NULL;
757 preg->re_allocated = 0;
761 /* Parse the regular expression, and build a structure tree. */
763 dfa->str_tree = parse (®exp, preg, syntax, &err);
764 if (BE (dfa->str_tree == NULL, 0))
765 goto re_compile_internal_free_return;
767 /* Analyze the tree and create the nfa. */
768 err = analyze (preg);
769 if (BE (err != REG_NOERROR, 0))
770 goto re_compile_internal_free_return;
772 #ifdef RE_ENABLE_I18N
773 /* If possible, do searching in single byte encoding to speed things up. */
774 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
778 /* Then create the initial state of the dfa. */
779 err = create_initial_state (dfa);
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (®exp);
785 if (BE (err != REG_NOERROR, 0))
787 free_dfa_content (dfa);
788 preg->re_buffer = NULL;
789 preg->re_allocated = 0;
795 /* Initialize DFA. We use the length of the regular expression PAT_LEN
796 as the initial length of some arrays. */
799 init_dfa (re_dfa_t *dfa, int pat_len)
806 memset (dfa, '\0', sizeof (re_dfa_t));
808 /* Force allocation of str_tree_storage the first time. */
809 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
811 dfa->nodes_alloc = pat_len + 1;
812 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
814 dfa->states_alloc = pat_len + 1;
816 /* table_size = 2 ^ ceil(log pat_len) */
817 for (table_size = 1; table_size > 0; table_size <<= 1)
818 if (table_size > pat_len)
821 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
822 dfa->state_hash_mask = table_size - 1;
824 dfa->mb_cur_max = MB_CUR_MAX;
826 if (dfa->mb_cur_max == 6
827 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
829 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
832 # ifdef HAVE_LANGINFO_CODESET
833 codeset_name = nl_langinfo (CODESET);
835 codeset_name = getenv ("LC_ALL");
836 if (codeset_name == NULL || codeset_name[0] == '\0')
837 codeset_name = getenv ("LC_CTYPE");
838 if (codeset_name == NULL || codeset_name[0] == '\0')
839 codeset_name = getenv ("LANG");
840 if (codeset_name == NULL)
842 else if (strchr (codeset_name, '.') != NULL)
843 codeset_name = strchr (codeset_name, '.') + 1;
846 if (strcasecmp (codeset_name, "UTF-8") == 0
847 || strcasecmp (codeset_name, "UTF8") == 0)
850 /* We check exhaustively in the loop below if this charset is a
851 superset of ASCII. */
852 dfa->map_notascii = 0;
855 #ifdef RE_ENABLE_I18N
856 if (dfa->mb_cur_max > 1)
859 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
864 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
865 if (BE (dfa->sb_char == NULL, 0))
868 /* Clear all bits by, then set those corresponding to single
870 bitset_empty (dfa->sb_char);
872 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
873 for (j = 0; j < UINT_BITS; ++j, ++ch)
875 wint_t wch = __btowc (ch);
877 dfa->sb_char[i] |= 1 << j;
879 if (isascii (ch) && wch != ch)
880 dfa->map_notascii = 1;
887 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
892 /* Initialize WORD_CHAR table, which indicate which character is
893 "word". In this case "word" means that it is the word construction
894 character used by some operators like "\<", "\>", etc. */
897 init_word_char (re_dfa_t *dfa)
900 dfa->word_ops_used = 1;
901 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
902 for (j = 0; j < UINT_BITS; ++j, ++ch)
903 if (isalnum (ch) || ch == '_')
904 dfa->word_char[i] |= 1 << j;
907 /* Free the work area which are only used while compiling. */
910 free_workarea_compile (regex_t *preg)
912 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
913 bin_tree_storage_t *storage, *next;
914 for (storage = dfa->str_tree_storage; storage; storage = next)
916 next = storage->next;
919 dfa->str_tree_storage = NULL;
920 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
921 dfa->str_tree = NULL;
922 re_free (dfa->org_indices);
923 dfa->org_indices = NULL;
926 /* Create initial states for all contexts. */
929 create_initial_state (re_dfa_t *dfa)
933 re_node_set init_nodes;
935 /* Initial states have the epsilon closure of the node which is
936 the first node of the regular expression. */
937 first = dfa->str_tree->first->node_idx;
938 dfa->init_node = first;
939 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
940 if (BE (err != REG_NOERROR, 0))
943 /* The back-references which are in initial states can epsilon transit,
944 since in this case all of the subexpressions can be null.
945 Then we add epsilon closures of the nodes which are the next nodes of
946 the back-references. */
947 if (dfa->nbackref > 0)
948 for (i = 0; i < init_nodes.nelem; ++i)
950 int node_idx = init_nodes.elems[i];
951 re_token_type_t type = dfa->nodes[node_idx].type;
954 if (type != OP_BACK_REF)
956 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
958 re_token_t *clexp_node;
959 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
960 if (clexp_node->type == OP_CLOSE_SUBEXP
961 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
964 if (clexp_idx == init_nodes.nelem)
967 if (type == OP_BACK_REF)
969 int dest_idx = dfa->edests[node_idx].elems[0];
970 if (!re_node_set_contains (&init_nodes, dest_idx))
972 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
978 /* It must be the first time to invoke acquire_state. */
979 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
980 /* We don't check ERR here, since the initial state must not be NULL. */
981 if (BE (dfa->init_state == NULL, 0))
983 if (dfa->init_state->has_constraint)
985 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
987 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
989 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
993 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
994 || dfa->init_state_begbuf == NULL, 0))
998 dfa->init_state_word = dfa->init_state_nl
999 = dfa->init_state_begbuf = dfa->init_state;
1001 re_node_set_free (&init_nodes);
1005 #ifdef RE_ENABLE_I18N
1006 /* If it is possible to do searching in single byte encoding instead of UTF-8
1007 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1008 DFA nodes where needed. */
1011 optimize_utf8 (re_dfa_t *dfa)
1013 int node, i, mb_chars = 0, has_period = 0;
1015 for (node = 0; node < dfa->nodes_len; ++node)
1016 switch (dfa->nodes[node].type)
1019 if (dfa->nodes[node].opr.c >= 0x80)
1023 switch (dfa->nodes[node].opr.idx)
1031 /* Word anchors etc. cannot be handled. */
1041 case OP_DUP_ASTERISK:
1042 case OP_OPEN_SUBEXP:
1043 case OP_CLOSE_SUBEXP:
1045 case COMPLEX_BRACKET:
1047 case SIMPLE_BRACKET:
1048 /* Just double check. */
1049 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1050 if (dfa->nodes[node].opr.sbcset[i])
1057 if (mb_chars || has_period)
1058 for (node = 0; node < dfa->nodes_len; ++node)
1060 if (dfa->nodes[node].type == CHARACTER
1061 && dfa->nodes[node].opr.c >= 0x80)
1062 dfa->nodes[node].mb_partial = 0;
1063 else if (dfa->nodes[node].type == OP_PERIOD)
1064 dfa->nodes[node].type = OP_UTF8_PERIOD;
1067 /* The search can be in single byte locale. */
1068 dfa->mb_cur_max = 1;
1070 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075 "eclosure", and "inveclosure". */
1077 static reg_errcode_t
1078 analyze (regex_t *preg)
1080 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1083 /* Allocate arrays. */
1084 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1085 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1086 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089 || dfa->eclosures == NULL, 0))
1092 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1093 if (dfa->subexp_map != NULL)
1096 for (i = 0; i < preg->re_nsub; i++)
1097 dfa->subexp_map[i] = i;
1098 preorder (dfa->str_tree, optimize_subexps, dfa);
1099 for (i = 0; i < preg->re_nsub; i++)
1100 if (dfa->subexp_map[i] != i)
1102 if (i == preg->re_nsub)
1104 free (dfa->subexp_map);
1105 dfa->subexp_map = NULL;
1109 ret = postorder (dfa->str_tree, lower_subexps, preg);
1110 if (BE (ret != REG_NOERROR, 0))
1112 ret = postorder (dfa->str_tree, calc_first, dfa);
1113 if (BE (ret != REG_NOERROR, 0))
1115 preorder (dfa->str_tree, calc_next, dfa);
1116 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1119 ret = calc_eclosure (dfa);
1120 if (BE (ret != REG_NOERROR, 0))
1123 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1125 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1128 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129 if (BE (dfa->inveclosures == NULL, 0))
1131 ret = calc_inveclosure (dfa);
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138 implement parse tree visits. Instead, we use parent pointers and
1139 some hairy code in these two functions. */
1140 static reg_errcode_t
1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1144 bin_tree_t *node, *prev;
1146 for (node = root; ; )
1148 /* Descend down the tree, preferably to the left (or to the right
1149 if that's the only child). */
1150 while (node->left || node->right)
1158 reg_errcode_t err = fn (extra, node);
1159 if (BE (err != REG_NOERROR, 0))
1161 if (node->parent == NULL)
1164 node = node->parent;
1166 /* Go up while we have a node that is reached from the right. */
1167 while (node->right == prev || node->right == NULL);
1172 static reg_errcode_t
1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1178 for (node = root; ; )
1180 reg_errcode_t err = fn (extra, node);
1181 if (BE (err != REG_NOERROR, 0))
1184 /* Go to the left node, or up and to the right. */
1189 bin_tree_t *prev = NULL;
1190 while (node->right == prev || node->right == NULL)
1193 node = node->parent;
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1204 backreferences as well. Requires a preorder visit. */
1205 static reg_errcode_t
1206 optimize_subexps (void *extra, bin_tree_t *node)
1208 re_dfa_t *dfa = (re_dfa_t *) extra;
1210 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1212 int idx = node->token.opr.idx;
1213 node->token.opr.idx = dfa->subexp_map[idx];
1214 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1217 else if (node->token.type == SUBEXP
1218 && node->left && node->left->token.type == SUBEXP)
1220 int other_idx = node->left->token.opr.idx;
1222 node->left = node->left->left;
1224 node->left->parent = node;
1226 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1228 dfa->used_bkref_map &= ~(1 << other_idx);
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1236 static reg_errcode_t
1237 lower_subexps (void *extra, bin_tree_t *node)
1239 regex_t *preg = (regex_t *) extra;
1240 reg_errcode_t err = REG_NOERROR;
1242 if (node->left && node->left->token.type == SUBEXP)
1244 node->left = lower_subexp (&err, preg, node->left);
1246 node->left->parent = node;
1248 if (node->right && node->right->token.type == SUBEXP)
1250 node->right = lower_subexp (&err, preg, node->right);
1252 node->right->parent = node;
1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1261 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1262 bin_tree_t *body = node->left;
1263 bin_tree_t *op, *cls, *tree1, *tree;
1266 /* We do not optimize empty subexpressions, because otherwise we may
1267 have bad CONCAT nodes with NULL children. This is obviously not
1268 very common, so we do not lose much. An example that triggers
1269 this case is the sed "script" /\(\)/x. */
1270 && node->left != NULL
1271 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1272 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1275 /* Convert the SUBEXP node to the concatenation of an
1276 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1277 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1278 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1279 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1280 tree = create_tree (dfa, op, tree1, CONCAT);
1281 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1287 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1288 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1292 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1293 nodes. Requires a postorder visit. */
1294 static reg_errcode_t
1295 calc_first (void *extra, bin_tree_t *node)
1297 re_dfa_t *dfa = (re_dfa_t *) extra;
1298 if (node->token.type == CONCAT)
1300 node->first = node->left->first;
1301 node->node_idx = node->left->node_idx;
1306 node->node_idx = re_dfa_add_node (dfa, node->token);
1307 if (BE (node->node_idx == -1, 0))
1313 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1314 static reg_errcode_t
1315 calc_next (void *extra, bin_tree_t *node)
1317 switch (node->token.type)
1319 case OP_DUP_ASTERISK:
1320 node->left->next = node;
1323 node->left->next = node->right->first;
1324 node->right->next = node->next;
1328 node->left->next = node->next;
1330 node->right->next = node->next;
1336 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1337 static reg_errcode_t
1338 link_nfa_nodes (void *extra, bin_tree_t *node)
1340 re_dfa_t *dfa = (re_dfa_t *) extra;
1341 int idx = node->node_idx;
1342 reg_errcode_t err = REG_NOERROR;
1344 switch (node->token.type)
1350 assert (node->next == NULL);
1353 case OP_DUP_ASTERISK:
1357 dfa->has_plural_match = 1;
1358 if (node->left != NULL)
1359 left = node->left->first->node_idx;
1361 left = node->next->node_idx;
1362 if (node->right != NULL)
1363 right = node->right->first->node_idx;
1365 right = node->next->node_idx;
1367 assert (right > -1);
1368 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1373 case OP_OPEN_SUBEXP:
1374 case OP_CLOSE_SUBEXP:
1375 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1379 dfa->nexts[idx] = node->next->node_idx;
1380 if (node->token.type == OP_BACK_REF)
1381 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1385 assert (!IS_EPSILON_NODE (node->token.type));
1386 dfa->nexts[idx] = node->next->node_idx;
1393 /* Duplicate the epsilon closure of the node ROOT_NODE.
1394 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1395 to their own constraint. */
1397 static reg_errcode_t
1398 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1399 int root_node, unsigned int init_constraint)
1401 int org_node, clone_node, ret;
1402 unsigned int constraint = init_constraint;
1403 for (org_node = top_org_node, clone_node = top_clone_node;;)
1405 int org_dest, clone_dest;
1406 if (dfa->nodes[org_node].type == OP_BACK_REF)
1408 /* If the back reference epsilon-transit, its destination must
1409 also have the constraint. Then duplicate the epsilon closure
1410 of the destination of the back reference, and store it in
1411 edests of the back reference. */
1412 org_dest = dfa->nexts[org_node];
1413 re_node_set_empty (dfa->edests + clone_node);
1414 clone_dest = duplicate_node (dfa, org_dest, constraint);
1415 if (BE (clone_dest == -1, 0))
1417 dfa->nexts[clone_node] = dfa->nexts[org_node];
1418 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1419 if (BE (ret < 0, 0))
1422 else if (dfa->edests[org_node].nelem == 0)
1424 /* In case of the node can't epsilon-transit, don't duplicate the
1425 destination and store the original destination as the
1426 destination of the node. */
1427 dfa->nexts[clone_node] = dfa->nexts[org_node];
1430 else if (dfa->edests[org_node].nelem == 1)
1432 /* In case of the node can epsilon-transit, and it has only one
1434 org_dest = dfa->edests[org_node].elems[0];
1435 re_node_set_empty (dfa->edests + clone_node);
1436 if (dfa->nodes[org_node].type == ANCHOR)
1438 /* In case of the node has another constraint, append it. */
1439 if (org_node == root_node && clone_node != org_node)
1441 /* ...but if the node is root_node itself, it means the
1442 epsilon closure have a loop, then tie it to the
1443 destination of the root_node. */
1444 ret = re_node_set_insert (dfa->edests + clone_node,
1446 if (BE (ret < 0, 0))
1450 constraint |= dfa->nodes[org_node].opr.ctx_type;
1452 clone_dest = duplicate_node (dfa, org_dest, constraint);
1453 if (BE (clone_dest == -1, 0))
1455 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1456 if (BE (ret < 0, 0))
1459 else /* dfa->edests[org_node].nelem == 2 */
1461 /* In case of the node can epsilon-transit, and it has two
1462 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1463 org_dest = dfa->edests[org_node].elems[0];
1464 re_node_set_empty (dfa->edests + clone_node);
1465 /* Search for a duplicated node which satisfies the constraint. */
1466 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1467 if (clone_dest == -1)
1469 /* There are no such a duplicated node, create a new one. */
1471 clone_dest = duplicate_node (dfa, org_dest, constraint);
1472 if (BE (clone_dest == -1, 0))
1474 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1475 if (BE (ret < 0, 0))
1477 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1478 root_node, constraint);
1479 if (BE (err != REG_NOERROR, 0))
1484 /* There are a duplicated node which satisfy the constraint,
1485 use it to avoid infinite loop. */
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1491 org_dest = dfa->edests[org_node].elems[1];
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == -1, 0))
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1499 org_node = org_dest;
1500 clone_node = clone_dest;
1505 /* Search for a node which is duplicated from the node ORG_NODE, and
1506 satisfies the constraint CONSTRAINT. */
1509 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1512 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1514 if (org_node == dfa->org_indices[idx]
1515 && constraint == dfa->nodes[idx].constraint)
1516 return idx; /* Found. */
1518 return -1; /* Not found. */
1521 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1522 Return the index of the new node, or -1 if insufficient storage is
1526 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1528 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1529 if (BE (dup_idx != -1, 1))
1531 dfa->nodes[dup_idx].constraint = constraint;
1532 if (dfa->nodes[org_idx].type == ANCHOR)
1533 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1534 dfa->nodes[dup_idx].duplicated = 1;
1536 /* Store the index of the original node. */
1537 dfa->org_indices[dup_idx] = org_idx;
1542 static reg_errcode_t
1543 calc_inveclosure (re_dfa_t *dfa)
1546 for (idx = 0; idx < dfa->nodes_len; ++idx)
1547 re_node_set_init_empty (dfa->inveclosures + idx);
1549 for (src = 0; src < dfa->nodes_len; ++src)
1551 int *elems = dfa->eclosures[src].elems;
1552 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1554 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1555 if (BE (ret == -1, 0))
1563 /* Calculate "eclosure" for all the node in DFA. */
1565 static reg_errcode_t
1566 calc_eclosure (re_dfa_t *dfa)
1568 int node_idx, incomplete;
1570 assert (dfa->nodes_len > 0);
1573 /* For each nodes, calculate epsilon closure. */
1574 for (node_idx = 0; ; ++node_idx)
1577 re_node_set eclosure_elem;
1578 if (node_idx == dfa->nodes_len)
1587 assert (dfa->eclosures[node_idx].nelem != -1);
1590 /* If we have already calculated, skip it. */
1591 if (dfa->eclosures[node_idx].nelem != 0)
1593 /* Calculate epsilon closure of `node_idx'. */
1594 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1595 if (BE (err != REG_NOERROR, 0))
1598 if (dfa->eclosures[node_idx].nelem == 0)
1601 re_node_set_free (&eclosure_elem);
1607 /* Calculate epsilon closure of NODE. */
1609 static reg_errcode_t
1610 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1613 unsigned int constraint;
1615 re_node_set eclosure;
1617 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1618 if (BE (err != REG_NOERROR, 0))
1621 /* This indicates that we are calculating this node now.
1622 We reference this value to avoid infinite loop. */
1623 dfa->eclosures[node].nelem = -1;
1625 constraint = ((dfa->nodes[node].type == ANCHOR)
1626 ? dfa->nodes[node].opr.ctx_type : 0);
1627 /* If the current node has constraints, duplicate all nodes.
1628 Since they must inherit the constraints. */
1630 && dfa->edests[node].nelem
1631 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1633 int org_node, cur_node;
1634 org_node = cur_node = node;
1635 err = duplicate_node_closure (dfa, node, node, node, constraint);
1636 if (BE (err != REG_NOERROR, 0))
1640 /* Expand each epsilon destination nodes. */
1641 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1642 for (i = 0; i < dfa->edests[node].nelem; ++i)
1644 re_node_set eclosure_elem;
1645 int edest = dfa->edests[node].elems[i];
1646 /* If calculating the epsilon closure of `edest' is in progress,
1647 return intermediate result. */
1648 if (dfa->eclosures[edest].nelem == -1)
1653 /* If we haven't calculated the epsilon closure of `edest' yet,
1654 calculate now. Otherwise use calculated epsilon closure. */
1655 if (dfa->eclosures[edest].nelem == 0)
1657 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1658 if (BE (err != REG_NOERROR, 0))
1662 eclosure_elem = dfa->eclosures[edest];
1663 /* Merge the epsilon closure of `edest'. */
1664 re_node_set_merge (&eclosure, &eclosure_elem);
1665 /* If the epsilon closure of `edest' is incomplete,
1666 the epsilon closure of this node is also incomplete. */
1667 if (dfa->eclosures[edest].nelem == 0)
1670 re_node_set_free (&eclosure_elem);
1674 /* Epsilon closures include itself. */
1675 re_node_set_insert (&eclosure, node);
1676 if (incomplete && !root)
1677 dfa->eclosures[node].nelem = 0;
1679 dfa->eclosures[node] = eclosure;
1680 *new_set = eclosure;
1684 /* Functions for token which are used in the parser. */
1686 /* Fetch a token from INPUT.
1687 We must not use this function inside bracket expressions. */
1690 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1692 re_string_skip_bytes (input, peek_token (result, input, syntax));
1695 /* Peek a token from INPUT, and return the length of the token.
1696 We must not use this function inside bracket expressions. */
1699 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1703 if (re_string_eoi (input))
1705 token->type = END_OF_RE;
1709 c = re_string_peek_byte (input, 0);
1712 token->word_char = 0;
1713 #ifdef RE_ENABLE_I18N
1714 token->mb_partial = 0;
1715 if (input->mb_cur_max > 1 &&
1716 !re_string_first_byte (input, re_string_cur_idx (input)))
1718 token->type = CHARACTER;
1719 token->mb_partial = 1;
1726 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1728 token->type = BACK_SLASH;
1732 c2 = re_string_peek_byte_case (input, 1);
1734 token->type = CHARACTER;
1735 #ifdef RE_ENABLE_I18N
1736 if (input->mb_cur_max > 1)
1738 wint_t wc = re_string_wchar_at (input,
1739 re_string_cur_idx (input) + 1);
1740 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1744 token->word_char = IS_WORD_CHAR (c2) != 0;
1749 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1750 token->type = OP_ALT;
1752 case '1': case '2': case '3': case '4': case '5':
1753 case '6': case '7': case '8': case '9':
1754 if (!(syntax & REG_NO_BK_REFS))
1756 token->type = OP_BACK_REF;
1757 token->opr.idx = c2 - '1';
1761 if (!(syntax & REG_NO_GNU_OPS))
1763 token->type = ANCHOR;
1764 token->opr.ctx_type = WORD_FIRST;
1768 if (!(syntax & REG_NO_GNU_OPS))
1770 token->type = ANCHOR;
1771 token->opr.ctx_type = WORD_LAST;
1775 if (!(syntax & REG_NO_GNU_OPS))
1777 token->type = ANCHOR;
1778 token->opr.ctx_type = WORD_DELIM;
1782 if (!(syntax & REG_NO_GNU_OPS))
1784 token->type = ANCHOR;
1785 token->opr.ctx_type = NOT_WORD_DELIM;
1789 if (!(syntax & REG_NO_GNU_OPS))
1790 token->type = OP_WORD;
1793 if (!(syntax & REG_NO_GNU_OPS))
1794 token->type = OP_NOTWORD;
1797 if (!(syntax & REG_NO_GNU_OPS))
1798 token->type = OP_SPACE;
1801 if (!(syntax & REG_NO_GNU_OPS))
1802 token->type = OP_NOTSPACE;
1805 if (!(syntax & REG_NO_GNU_OPS))
1807 token->type = ANCHOR;
1808 token->opr.ctx_type = BUF_FIRST;
1812 if (!(syntax & REG_NO_GNU_OPS))
1814 token->type = ANCHOR;
1815 token->opr.ctx_type = BUF_LAST;
1819 if (!(syntax & REG_NO_BK_PARENS))
1820 token->type = OP_OPEN_SUBEXP;
1823 if (!(syntax & REG_NO_BK_PARENS))
1824 token->type = OP_CLOSE_SUBEXP;
1827 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1828 token->type = OP_DUP_PLUS;
1831 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1832 token->type = OP_DUP_QUESTION;
1835 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1836 token->type = OP_OPEN_DUP_NUM;
1839 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1840 token->type = OP_CLOSE_DUP_NUM;
1848 token->type = CHARACTER;
1849 #ifdef RE_ENABLE_I18N
1850 if (input->mb_cur_max > 1)
1852 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1853 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1857 token->word_char = IS_WORD_CHAR (token->opr.c);
1862 if (syntax & REG_NEWLINE_ALT)
1863 token->type = OP_ALT;
1866 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1867 token->type = OP_ALT;
1870 token->type = OP_DUP_ASTERISK;
1873 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1874 token->type = OP_DUP_PLUS;
1877 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1878 token->type = OP_DUP_QUESTION;
1881 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1882 token->type = OP_OPEN_DUP_NUM;
1885 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1886 token->type = OP_CLOSE_DUP_NUM;
1889 if (syntax & REG_NO_BK_PARENS)
1890 token->type = OP_OPEN_SUBEXP;
1893 if (syntax & REG_NO_BK_PARENS)
1894 token->type = OP_CLOSE_SUBEXP;
1897 token->type = OP_OPEN_BRACKET;
1900 token->type = OP_PERIOD;
1903 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1904 re_string_cur_idx (input) != 0)
1906 char prev = re_string_peek_byte (input, -1);
1907 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1910 token->type = ANCHOR;
1911 token->opr.ctx_type = LINE_FIRST;
1914 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1915 re_string_cur_idx (input) + 1 != re_string_length (input))
1918 re_string_skip_bytes (input, 1);
1919 peek_token (&next, input, syntax);
1920 re_string_skip_bytes (input, -1);
1921 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1924 token->type = ANCHOR;
1925 token->opr.ctx_type = LINE_LAST;
1933 /* Peek a token from INPUT, and return the length of the token.
1934 We must not use this function out of bracket expressions. */
1937 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1940 if (re_string_eoi (input))
1942 token->type = END_OF_RE;
1945 c = re_string_peek_byte (input, 0);
1948 #ifdef RE_ENABLE_I18N
1949 if (input->mb_cur_max > 1 &&
1950 !re_string_first_byte (input, re_string_cur_idx (input)))
1952 token->type = CHARACTER;
1955 #endif /* RE_ENABLE_I18N */
1957 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1958 && re_string_cur_idx (input) + 1 < re_string_length (input))
1960 /* In this case, '\' escape a character. */
1962 re_string_skip_bytes (input, 1);
1963 c2 = re_string_peek_byte (input, 0);
1965 token->type = CHARACTER;
1968 if (c == '[') /* '[' is a special char in a bracket exps. */
1972 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1973 c2 = re_string_peek_byte (input, 1);
1981 token->type = OP_OPEN_COLL_ELEM;
1984 token->type = OP_OPEN_EQUIV_CLASS;
1987 if (syntax & REG_CHAR_CLASSES)
1989 token->type = OP_OPEN_CHAR_CLASS;
1992 /* else fall through. */
1994 token->type = CHARACTER;
2004 token->type = OP_CHARSET_RANGE;
2007 token->type = OP_CLOSE_BRACKET;
2010 token->type = OP_NON_MATCH_LIST;
2013 token->type = CHARACTER;
2018 /* Functions for parser. */
2020 /* Entry point of the parser.
2021 Parse the regular expression REGEXP and return the structure tree.
2022 If an error is occured, ERR is set by error code, and return NULL.
2023 This function build the following tree, from regular expression <reg_exp>:
2029 CAT means concatenation.
2030 EOR means end of regular expression. */
2033 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2036 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2037 bin_tree_t *tree, *eor, *root;
2038 re_token_t current_token;
2039 dfa->syntax = syntax;
2040 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2041 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2042 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2044 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2046 root = create_tree (dfa, tree, eor, CONCAT);
2049 if (BE (eor == NULL || root == NULL, 0))
2057 /* This function build the following tree, from regular expression
2058 <branch1>|<branch2>:
2064 ALT means alternative, which represents the operator `|'. */
2067 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2068 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2070 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2071 bin_tree_t *tree, *branch = NULL;
2072 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2073 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2076 while (token->type == OP_ALT)
2078 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2079 if (token->type != OP_ALT && token->type != END_OF_RE
2080 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2082 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2083 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2088 tree = create_tree (dfa, tree, branch, OP_ALT);
2089 if (BE (tree == NULL, 0))
2098 /* This function build the following tree, from regular expression
2105 CAT means concatenation. */
2108 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2109 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2111 bin_tree_t *tree, *exp;
2112 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2113 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2114 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2117 while (token->type != OP_ALT && token->type != END_OF_RE
2118 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2121 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2125 if (tree != NULL && exp != NULL)
2127 tree = create_tree (dfa, tree, exp, CONCAT);
2134 else if (tree == NULL)
2136 /* Otherwise exp == NULL, we don't need to create new tree. */
2141 /* This function build the following tree, from regular expression a*:
2148 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2151 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2153 switch (token->type)
2156 tree = create_token_tree (dfa, NULL, NULL, token);
2157 if (BE (tree == NULL, 0))
2162 #ifdef RE_ENABLE_I18N
2163 if (dfa->mb_cur_max > 1)
2165 while (!re_string_eoi (regexp)
2166 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2168 bin_tree_t *mbc_remain;
2169 fetch_token (token, regexp, syntax);
2170 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2171 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2172 if (BE (mbc_remain == NULL || tree == NULL, 0))
2181 case OP_OPEN_SUBEXP:
2182 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2183 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186 case OP_OPEN_BRACKET:
2187 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2197 dfa->used_bkref_map |= 1 << token->opr.idx;
2198 tree = create_token_tree (dfa, NULL, NULL, token);
2199 if (BE (tree == NULL, 0))
2205 dfa->has_mb_node = 1;
2207 case OP_OPEN_DUP_NUM:
2208 if (syntax & REG_CONTEXT_INVALID_DUP)
2214 case OP_DUP_ASTERISK:
2216 case OP_DUP_QUESTION:
2217 if (syntax & REG_CONTEXT_INVALID_OPS)
2222 else if (syntax & REG_CONTEXT_INDEP_OPS)
2224 fetch_token (token, regexp, syntax);
2225 return parse_expression (regexp, preg, token, syntax, nest, err);
2227 /* else fall through */
2228 case OP_CLOSE_SUBEXP:
2229 if ((token->type == OP_CLOSE_SUBEXP) &&
2230 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2235 /* else fall through */
2236 case OP_CLOSE_DUP_NUM:
2237 /* We treat it as a normal character. */
2239 /* Then we can these characters as normal characters. */
2240 token->type = CHARACTER;
2241 /* mb_partial and word_char bits should be initialized already
2243 tree = create_token_tree (dfa, NULL, NULL, token);
2244 if (BE (tree == NULL, 0))
2251 if ((token->opr.ctx_type
2252 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2253 && dfa->word_ops_used == 0)
2254 init_word_char (dfa);
2255 if (token->opr.ctx_type == WORD_DELIM
2256 || token->opr.ctx_type == NOT_WORD_DELIM)
2258 bin_tree_t *tree_first, *tree_last;
2259 if (token->opr.ctx_type == WORD_DELIM)
2261 token->opr.ctx_type = WORD_FIRST;
2262 tree_first = create_token_tree (dfa, NULL, NULL, token);
2263 token->opr.ctx_type = WORD_LAST;
2267 token->opr.ctx_type = INSIDE_WORD;
2268 tree_first = create_token_tree (dfa, NULL, NULL, token);
2269 token->opr.ctx_type = INSIDE_NOTWORD;
2271 tree_last = create_token_tree (dfa, NULL, NULL, token);
2272 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2273 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2288 /* We must return here, since ANCHORs can't be followed
2289 by repetition operators.
2290 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2291 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2292 fetch_token (token, regexp, syntax);
2295 tree = create_token_tree (dfa, NULL, NULL, token);
2296 if (BE (tree == NULL, 0))
2301 if (dfa->mb_cur_max > 1)
2302 dfa->has_mb_node = 1;
2306 tree = build_charclass_op (dfa, regexp->trans,
2307 (const unsigned char *) "alnum",
2308 (const unsigned char *) "_",
2309 token->type == OP_NOTWORD, err);
2310 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2315 tree = build_charclass_op (dfa, regexp->trans,
2316 (const unsigned char *) "space",
2317 (const unsigned char *) "",
2318 token->type == OP_NOTSPACE, err);
2319 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2329 /* Must not happen? */
2335 fetch_token (token, regexp, syntax);
2337 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2338 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2340 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2341 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343 /* In BRE consecutive duplications are not allowed. */
2344 if ((syntax & REG_CONTEXT_INVALID_DUP)
2345 && (token->type == OP_DUP_ASTERISK
2346 || token->type == OP_OPEN_DUP_NUM))
2356 /* This function build the following tree, from regular expression
2364 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2365 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2367 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2370 cur_nsub = preg->re_nsub++;
2372 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2374 /* The subexpression may be a null string. */
2375 if (token->type == OP_CLOSE_SUBEXP)
2379 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2380 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2382 if (BE (*err != REG_NOERROR, 0))
2385 dfa->completed_bkref_map |= 1 << cur_nsub;
2387 tree = create_tree (dfa, tree, NULL, SUBEXP);
2388 if (BE (tree == NULL, 0))
2393 tree->token.opr.idx = cur_nsub;
2397 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2400 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2401 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2403 bin_tree_t *tree = NULL, *old_tree = NULL;
2404 int i, start, end, start_idx = re_string_cur_idx (regexp);
2405 re_token_t start_token = *token;
2407 if (token->type == OP_OPEN_DUP_NUM)
2410 start = fetch_number (regexp, token, syntax);
2413 if (token->type == CHARACTER && token->opr.c == ',')
2414 start = 0; /* We treat "{,m}" as "{0,m}". */
2417 *err = REG_BADBR; /* <re>{} is invalid. */
2421 if (BE (start != -2, 1))
2423 /* We treat "{n}" as "{n,n}". */
2424 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2425 : ((token->type == CHARACTER && token->opr.c == ',')
2426 ? fetch_number (regexp, token, syntax) : -2));
2428 if (BE (start == -2 || end == -2, 0))
2430 /* Invalid sequence. */
2431 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2433 if (token->type == END_OF_RE)
2441 /* If the syntax bit is set, rollback. */
2442 re_string_set_index (regexp, start_idx);
2443 *token = start_token;
2444 token->type = CHARACTER;
2445 /* mb_partial and word_char bits should be already initialized by
2450 if (BE (end != -1 && start > end, 0))
2452 /* First number greater than second. */
2459 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2460 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2463 fetch_token (token, regexp, syntax);
2465 if (BE (elem == NULL, 0))
2467 if (BE (start == 0 && end == 0, 0))
2469 postorder (elem, free_tree, NULL);
2473 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2474 if (BE (start > 0, 0))
2477 for (i = 2; i <= start; ++i)
2479 elem = duplicate_tree (elem, dfa);
2480 tree = create_tree (dfa, tree, elem, CONCAT);
2481 if (BE (elem == NULL || tree == NULL, 0))
2482 goto parse_dup_op_espace;
2488 /* Duplicate ELEM before it is marked optional. */
2489 elem = duplicate_tree (elem, dfa);
2495 if (elem->token.type == SUBEXP)
2496 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2498 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2499 if (BE (tree == NULL, 0))
2500 goto parse_dup_op_espace;
2502 /* This loop is actually executed only when end != -1,
2503 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2504 already created the start+1-th copy. */
2505 for (i = start + 2; i <= end; ++i)
2507 elem = duplicate_tree (elem, dfa);
2508 tree = create_tree (dfa, tree, elem, CONCAT);
2509 if (BE (elem == NULL || tree == NULL, 0))
2510 goto parse_dup_op_espace;
2512 tree = create_tree (dfa, tree, NULL, OP_ALT);
2513 if (BE (tree == NULL, 0))
2514 goto parse_dup_op_espace;
2518 tree = create_tree (dfa, old_tree, tree, CONCAT);
2522 parse_dup_op_espace:
2527 /* Size of the names for collating symbol/equivalence_class/character_class.
2528 I'm not sure, but maybe enough. */
2529 #define BRACKET_NAME_BUF_SIZE 32
2532 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2533 Build the range expression which starts from START_ELEM, and ends
2534 at END_ELEM. The result are written to MBCSET and SBCSET.
2535 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2536 mbcset->range_ends, is a pointer argument sinse we may
2539 static reg_errcode_t
2540 build_range_exp (re_bitset_ptr_t sbcset,
2541 # ifdef RE_ENABLE_I18N
2542 re_charset_t *mbcset, int *range_alloc,
2544 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2546 unsigned int start_ch, end_ch;
2547 /* Equivalence Classes and Character Classes can't be a range start/end. */
2548 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2549 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2553 /* We can handle no multi character collating elements without libc
2555 if (BE ((start_elem->type == COLL_SYM
2556 && strlen ((char *) start_elem->opr.name) > 1)
2557 || (end_elem->type == COLL_SYM
2558 && strlen ((char *) end_elem->opr.name) > 1), 0))
2559 return REG_ECOLLATE;
2561 # ifdef RE_ENABLE_I18N
2564 wint_t start_wc, end_wc;
2565 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2567 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2568 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2570 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2571 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2573 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2574 ? __btowc (start_ch) : start_elem->opr.wch);
2575 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2576 ? __btowc (end_ch) : end_elem->opr.wch);
2577 if (start_wc == WEOF || end_wc == WEOF)
2578 return REG_ECOLLATE;
2579 cmp_buf[0] = start_wc;
2580 cmp_buf[4] = end_wc;
2581 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2584 /* Got valid collation sequence values, add them as a new entry.
2585 However, for !_LIBC we have no collation elements: if the
2586 character set is single byte, the single byte character set
2587 that we build below suffices. parse_bracket_exp passes
2588 no MBCSET if dfa->mb_cur_max == 1. */
2591 /* Check the space of the arrays. */
2592 if (BE (*range_alloc == mbcset->nranges, 0))
2594 /* There is not enough space, need realloc. */
2595 wchar_t *new_array_start, *new_array_end;
2598 /* +1 in case of mbcset->nranges is 0. */
2599 new_nranges = 2 * mbcset->nranges + 1;
2600 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2601 are NULL if *range_alloc == 0. */
2602 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2604 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2607 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2610 mbcset->range_starts = new_array_start;
2611 mbcset->range_ends = new_array_end;
2612 *range_alloc = new_nranges;
2615 mbcset->range_starts[mbcset->nranges] = start_wc;
2616 mbcset->range_ends[mbcset->nranges++] = end_wc;
2619 /* Build the table for single byte characters. */
2620 for (wc = 0; wc < SBC_MAX; ++wc)
2623 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2624 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2625 bitset_set (sbcset, wc);
2628 # else /* not RE_ENABLE_I18N */
2631 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2632 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2634 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2635 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2637 if (start_ch > end_ch)
2639 /* Build the table for single byte characters. */
2640 for (ch = 0; ch < SBC_MAX; ++ch)
2641 if (start_ch <= ch && ch <= end_ch)
2642 bitset_set (sbcset, ch);
2644 # endif /* not RE_ENABLE_I18N */
2647 #endif /* not _LIBC */
2650 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2651 Build the collating element which is represented by NAME.
2652 The result are written to MBCSET and SBCSET.
2653 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2654 pointer argument since we may update it. */
2656 static reg_errcode_t
2657 build_collating_symbol (re_bitset_ptr_t sbcset,
2658 # ifdef RE_ENABLE_I18N
2659 re_charset_t *mbcset, int *coll_sym_alloc,
2661 const unsigned char *name)
2663 size_t name_len = strlen ((const char *) name);
2664 if (BE (name_len != 1, 0))
2665 return REG_ECOLLATE;
2668 bitset_set (sbcset, name[0]);
2672 #endif /* not _LIBC */
2674 /* This function parse bracket expression like "[abc]", "[a-c]",
2678 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2679 reg_syntax_t syntax, reg_errcode_t *err)
2682 const unsigned char *collseqmb;
2683 const char *collseqwc;
2686 const int32_t *symb_table;
2687 const unsigned char *extra;
2689 /* Local function for parse_bracket_exp used in _LIBC environement.
2690 Seek the collating symbol entry correspondings to NAME.
2691 Return the index of the symbol in the SYMB_TABLE. */
2694 __attribute ((always_inline))
2695 seek_collating_symbol_entry (name, name_len)
2696 const unsigned char *name;
2699 int32_t hash = elem_hash ((const char *) name, name_len);
2700 int32_t elem = hash % table_size;
2701 int32_t second = hash % (table_size - 2);
2702 while (symb_table[2 * elem] != 0)
2704 /* First compare the hashing value. */
2705 if (symb_table[2 * elem] == hash
2706 /* Compare the length of the name. */
2707 && name_len == extra[symb_table[2 * elem + 1]]
2708 /* Compare the name. */
2709 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2712 /* Yep, this is the entry. */
2722 /* Local function for parse_bracket_exp used in _LIBC environement.
2723 Look up the collation sequence value of BR_ELEM.
2724 Return the value if succeeded, UINT_MAX otherwise. */
2726 auto inline unsigned int
2727 __attribute ((always_inline))
2728 lookup_collation_sequence_value (br_elem)
2729 bracket_elem_t *br_elem;
2731 if (br_elem->type == SB_CHAR)
2734 if (MB_CUR_MAX == 1)
2737 return collseqmb[br_elem->opr.ch];
2740 wint_t wc = __btowc (br_elem->opr.ch);
2741 return __collseq_table_lookup (collseqwc, wc);
2744 else if (br_elem->type == MB_CHAR)
2746 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2748 else if (br_elem->type == COLL_SYM)
2750 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2754 elem = seek_collating_symbol_entry (br_elem->opr.name,
2756 if (symb_table[2 * elem] != 0)
2758 /* We found the entry. */
2759 idx = symb_table[2 * elem + 1];
2760 /* Skip the name of collating element name. */
2761 idx += 1 + extra[idx];
2762 /* Skip the byte sequence of the collating element. */
2763 idx += 1 + extra[idx];
2764 /* Adjust for the alignment. */
2765 idx = (idx + 3) & ~3;
2766 /* Skip the multibyte collation sequence value. */
2767 idx += sizeof (unsigned int);
2768 /* Skip the wide char sequence of the collating element. */
2769 idx += sizeof (unsigned int) *
2770 (1 + *(unsigned int *) (extra + idx));
2771 /* Return the collation sequence value. */
2772 return *(unsigned int *) (extra + idx);
2774 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2776 /* No valid character. Match it as a single byte
2778 return collseqmb[br_elem->opr.name[0]];
2781 else if (sym_name_len == 1)
2782 return collseqmb[br_elem->opr.name[0]];
2787 /* Local function for parse_bracket_exp used in _LIBC environement.
2788 Build the range expression which starts from START_ELEM, and ends
2789 at END_ELEM. The result are written to MBCSET and SBCSET.
2790 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2791 mbcset->range_ends, is a pointer argument sinse we may
2794 auto inline reg_errcode_t
2795 __attribute ((always_inline))
2796 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2797 re_charset_t *mbcset;
2799 re_bitset_ptr_t sbcset;
2800 bracket_elem_t *start_elem, *end_elem;
2803 uint32_t start_collseq;
2804 uint32_t end_collseq;
2806 /* Equivalence Classes and Character Classes can't be a range
2808 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2809 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2813 start_collseq = lookup_collation_sequence_value (start_elem);
2814 end_collseq = lookup_collation_sequence_value (end_elem);
2815 /* Check start/end collation sequence values. */
2816 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2817 return REG_ECOLLATE;
2818 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2821 /* Got valid collation sequence values, add them as a new entry.
2822 However, if we have no collation elements, and the character set
2823 is single byte, the single byte character set that we
2824 build below suffices. */
2825 if (nrules > 0 || dfa->mb_cur_max > 1)
2827 /* Check the space of the arrays. */
2828 if (BE (*range_alloc == mbcset->nranges, 0))
2830 /* There is not enough space, need realloc. */
2831 uint32_t *new_array_start;
2832 uint32_t *new_array_end;
2835 /* +1 in case of mbcset->nranges is 0. */
2836 new_nranges = 2 * mbcset->nranges + 1;
2837 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2839 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2842 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2845 mbcset->range_starts = new_array_start;
2846 mbcset->range_ends = new_array_end;
2847 *range_alloc = new_nranges;
2850 mbcset->range_starts[mbcset->nranges] = start_collseq;
2851 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2854 /* Build the table for single byte characters. */
2855 for (ch = 0; ch < SBC_MAX; ch++)
2857 uint32_t ch_collseq;
2859 if (MB_CUR_MAX == 1)
2862 ch_collseq = collseqmb[ch];
2864 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2865 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2866 bitset_set (sbcset, ch);
2871 /* Local function for parse_bracket_exp used in _LIBC environement.
2872 Build the collating element which is represented by NAME.
2873 The result are written to MBCSET and SBCSET.
2874 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2875 pointer argument sinse we may update it. */
2877 auto inline reg_errcode_t
2878 __attribute ((always_inline))
2879 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2880 re_charset_t *mbcset;
2881 int *coll_sym_alloc;
2882 re_bitset_ptr_t sbcset;
2883 const unsigned char *name;
2886 size_t name_len = strlen ((const char *) name);
2889 elem = seek_collating_symbol_entry (name, name_len);
2890 if (symb_table[2 * elem] != 0)
2892 /* We found the entry. */
2893 idx = symb_table[2 * elem + 1];
2894 /* Skip the name of collating element name. */
2895 idx += 1 + extra[idx];
2897 else if (symb_table[2 * elem] == 0 && name_len == 1)
2899 /* No valid character, treat it as a normal
2901 bitset_set (sbcset, name[0]);
2905 return REG_ECOLLATE;
2907 /* Got valid collation sequence, add it as a new entry. */
2908 /* Check the space of the arrays. */
2909 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2911 /* Not enough, realloc it. */
2912 /* +1 in case of mbcset->ncoll_syms is 0. */
2913 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2914 /* Use realloc since mbcset->coll_syms is NULL
2916 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2917 new_coll_sym_alloc);
2918 if (BE (new_coll_syms == NULL, 0))
2920 mbcset->coll_syms = new_coll_syms;
2921 *coll_sym_alloc = new_coll_sym_alloc;
2923 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2928 if (BE (name_len != 1, 0))
2929 return REG_ECOLLATE;
2932 bitset_set (sbcset, name[0]);
2939 re_token_t br_token;
2940 re_bitset_ptr_t sbcset;
2941 #ifdef RE_ENABLE_I18N
2942 re_charset_t *mbcset;
2943 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2944 int equiv_class_alloc = 0, char_class_alloc = 0;
2945 #endif /* not RE_ENABLE_I18N */
2947 bin_tree_t *work_tree;
2949 int first_round = 1;
2951 collseqmb = (const unsigned char *)
2952 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2953 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2959 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2960 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2961 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2962 _NL_COLLATE_SYMB_TABLEMB);
2963 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2964 _NL_COLLATE_SYMB_EXTRAMB);
2967 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2968 #ifdef RE_ENABLE_I18N
2969 mbcset = re_calloc (re_charset_t, 1);
2970 #endif /* RE_ENABLE_I18N */
2971 #ifdef RE_ENABLE_I18N
2972 if (BE (sbcset == NULL || mbcset == NULL, 0))
2974 if (BE (sbcset == NULL, 0))
2975 #endif /* RE_ENABLE_I18N */
2981 token_len = peek_token_bracket (token, regexp, syntax);
2982 if (BE (token->type == END_OF_RE, 0))
2985 goto parse_bracket_exp_free_return;
2987 if (token->type == OP_NON_MATCH_LIST)
2989 #ifdef RE_ENABLE_I18N
2990 mbcset->non_match = 1;
2991 #endif /* not RE_ENABLE_I18N */
2993 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2994 bitset_set (sbcset, '\0');
2995 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2996 token_len = peek_token_bracket (token, regexp, syntax);
2997 if (BE (token->type == END_OF_RE, 0))
3000 goto parse_bracket_exp_free_return;
3004 /* We treat the first ']' as a normal character. */
3005 if (token->type == OP_CLOSE_BRACKET)
3006 token->type = CHARACTER;
3010 bracket_elem_t start_elem, end_elem;
3011 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3012 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3014 int token_len2 = 0, is_range_exp = 0;
3017 start_elem.opr.name = start_name_buf;
3018 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3019 syntax, first_round);
3020 if (BE (ret != REG_NOERROR, 0))
3023 goto parse_bracket_exp_free_return;
3027 /* Get information about the next token. We need it in any case. */
3028 token_len = peek_token_bracket (token, regexp, syntax);
3030 /* Do not check for ranges if we know they are not allowed. */
3031 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3033 if (BE (token->type == END_OF_RE, 0))
3036 goto parse_bracket_exp_free_return;
3038 if (token->type == OP_CHARSET_RANGE)
3040 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3041 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3042 if (BE (token2.type == END_OF_RE, 0))
3045 goto parse_bracket_exp_free_return;
3047 if (token2.type == OP_CLOSE_BRACKET)
3049 /* We treat the last '-' as a normal character. */
3050 re_string_skip_bytes (regexp, -token_len);
3051 token->type = CHARACTER;
3058 if (is_range_exp == 1)
3060 end_elem.opr.name = end_name_buf;
3061 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3063 if (BE (ret != REG_NOERROR, 0))
3066 goto parse_bracket_exp_free_return;
3069 token_len = peek_token_bracket (token, regexp, syntax);
3072 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3073 &start_elem, &end_elem);
3075 # ifdef RE_ENABLE_I18N
3076 *err = build_range_exp (sbcset,
3077 dfa->mb_cur_max > 1 ? mbcset : NULL,
3078 &range_alloc, &start_elem, &end_elem);
3080 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3082 #endif /* RE_ENABLE_I18N */
3083 if (BE (*err != REG_NOERROR, 0))
3084 goto parse_bracket_exp_free_return;
3088 switch (start_elem.type)
3091 bitset_set (sbcset, start_elem.opr.ch);
3093 #ifdef RE_ENABLE_I18N
3095 /* Check whether the array has enough space. */
3096 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3098 wchar_t *new_mbchars;
3099 /* Not enough, realloc it. */
3100 /* +1 in case of mbcset->nmbchars is 0. */
3101 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3102 /* Use realloc since array is NULL if *alloc == 0. */
3103 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3105 if (BE (new_mbchars == NULL, 0))
3106 goto parse_bracket_exp_espace;
3107 mbcset->mbchars = new_mbchars;
3109 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3111 #endif /* RE_ENABLE_I18N */
3113 *err = build_equiv_class (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115 mbcset, &equiv_class_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117 start_elem.opr.name);
3118 if (BE (*err != REG_NOERROR, 0))
3119 goto parse_bracket_exp_free_return;
3122 *err = build_collating_symbol (sbcset,
3123 #ifdef RE_ENABLE_I18N
3124 mbcset, &coll_sym_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126 start_elem.opr.name);
3127 if (BE (*err != REG_NOERROR, 0))
3128 goto parse_bracket_exp_free_return;
3131 *err = build_charclass (regexp->trans, sbcset,
3132 #ifdef RE_ENABLE_I18N
3133 mbcset, &char_class_alloc,
3134 #endif /* RE_ENABLE_I18N */
3135 start_elem.opr.name, syntax);
3136 if (BE (*err != REG_NOERROR, 0))
3137 goto parse_bracket_exp_free_return;
3144 if (BE (token->type == END_OF_RE, 0))
3147 goto parse_bracket_exp_free_return;
3149 if (token->type == OP_CLOSE_BRACKET)
3153 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3155 /* If it is non-matching list. */
3157 bitset_not (sbcset);
3159 #ifdef RE_ENABLE_I18N
3160 /* Ensure only single byte characters are set. */
3161 if (dfa->mb_cur_max > 1)
3162 bitset_mask (sbcset, dfa->sb_char);
3164 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166 || mbcset->non_match)))
3168 bin_tree_t *mbc_tree;
3170 /* Build a tree for complex bracket. */
3171 dfa->has_mb_node = 1;
3172 br_token.type = COMPLEX_BRACKET;
3173 br_token.opr.mbcset = mbcset;
3174 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3175 if (BE (mbc_tree == NULL, 0))
3176 goto parse_bracket_exp_espace;
3177 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3178 if (sbcset[sbc_idx])
3180 /* If there are no bits set in sbcset, there is no point
3181 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3182 if (sbc_idx < BITSET_UINTS)
3184 /* Build a tree for simple bracket. */
3185 br_token.type = SIMPLE_BRACKET;
3186 br_token.opr.sbcset = sbcset;
3187 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3188 if (BE (work_tree == NULL, 0))
3189 goto parse_bracket_exp_espace;
3191 /* Then join them by ALT node. */
3192 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3193 if (BE (work_tree == NULL, 0))
3194 goto parse_bracket_exp_espace;
3199 work_tree = mbc_tree;
3203 #endif /* not RE_ENABLE_I18N */
3205 #ifdef RE_ENABLE_I18N
3206 free_charset (mbcset);
3208 /* Build a tree for simple bracket. */
3209 br_token.type = SIMPLE_BRACKET;
3210 br_token.opr.sbcset = sbcset;
3211 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212 if (BE (work_tree == NULL, 0))
3213 goto parse_bracket_exp_espace;
3217 parse_bracket_exp_espace:
3219 parse_bracket_exp_free_return:
3221 #ifdef RE_ENABLE_I18N
3222 free_charset (mbcset);
3223 #endif /* RE_ENABLE_I18N */
3227 /* Parse an element in the bracket expression. */
3229 static reg_errcode_t
3230 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3231 re_token_t *token, int token_len, re_dfa_t *dfa,
3232 reg_syntax_t syntax, int accept_hyphen)
3234 #ifdef RE_ENABLE_I18N
3236 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3237 if (cur_char_size > 1)
3239 elem->type = MB_CHAR;
3240 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3241 re_string_skip_bytes (regexp, cur_char_size);
3244 #endif /* RE_ENABLE_I18N */
3245 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3246 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3247 || token->type == OP_OPEN_EQUIV_CLASS)
3248 return parse_bracket_symbol (elem, regexp, token);
3249 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3251 /* A '-' must only appear as anything but a range indicator before
3252 the closing bracket. Everything else is an error. */
3254 (void) peek_token_bracket (&token2, regexp, syntax);
3255 if (token2.type != OP_CLOSE_BRACKET)
3256 /* The actual error value is not standardized since this whole
3257 case is undefined. But ERANGE makes good sense. */
3260 elem->type = SB_CHAR;
3261 elem->opr.ch = token->opr.c;
3265 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3266 such as [:<character_class>:], [.<collating_element>.], and
3267 [=<equivalent_class>=]. */
3269 static reg_errcode_t
3270 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3273 unsigned char ch, delim = token->opr.c;
3275 if (re_string_eoi(regexp))
3279 if (i >= BRACKET_NAME_BUF_SIZE)
3281 if (token->type == OP_OPEN_CHAR_CLASS)
3282 ch = re_string_fetch_byte_case (regexp);
3284 ch = re_string_fetch_byte (regexp);
3285 if (re_string_eoi(regexp))
3287 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3289 elem->opr.name[i] = ch;
3291 re_string_skip_bytes (regexp, 1);
3292 elem->opr.name[i] = '\0';
3293 switch (token->type)
3295 case OP_OPEN_COLL_ELEM:
3296 elem->type = COLL_SYM;
3298 case OP_OPEN_EQUIV_CLASS:
3299 elem->type = EQUIV_CLASS;
3301 case OP_OPEN_CHAR_CLASS:
3302 elem->type = CHAR_CLASS;
3310 /* Helper function for parse_bracket_exp.
3311 Build the equivalence class which is represented by NAME.
3312 The result are written to MBCSET and SBCSET.
3313 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3314 is a pointer argument sinse we may update it. */
3316 static reg_errcode_t
3317 build_equiv_class (re_bitset_ptr_t sbcset,
3318 #ifdef RE_ENABLE_I18N
3319 re_charset_t *mbcset, int *equiv_class_alloc,
3321 const unsigned char *name)
3324 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3327 const int32_t *table, *indirect;
3328 const unsigned char *weights, *extra, *cp;
3329 unsigned char char_buf[2];
3333 /* This #include defines a local function! */
3334 # include <locale/weight.h>
3335 /* Calculate the index for equivalence class. */
3337 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3338 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339 _NL_COLLATE_WEIGHTMB);
3340 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_EXTRAMB);
3342 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3343 _NL_COLLATE_INDIRECTMB);
3344 idx1 = findidx (&cp);
3345 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3346 /* This isn't a valid character. */
3347 return REG_ECOLLATE;
3349 /* Build single byte matcing table for this equivalence class. */
3350 char_buf[1] = (unsigned char) '\0';
3351 len = weights[idx1];
3352 for (ch = 0; ch < SBC_MAX; ++ch)
3356 idx2 = findidx (&cp);
3361 /* This isn't a valid character. */
3363 if (len == weights[idx2])
3366 while (cnt <= len &&
3367 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3371 bitset_set (sbcset, ch);
3374 /* Check whether the array has enough space. */
3375 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3377 /* Not enough, realloc it. */
3378 /* +1 in case of mbcset->nequiv_classes is 0. */
3379 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3380 /* Use realloc since the array is NULL if *alloc == 0. */
3381 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3383 new_equiv_class_alloc);
3384 if (BE (new_equiv_classes == NULL, 0))
3386 mbcset->equiv_classes = new_equiv_classes;
3387 *equiv_class_alloc = new_equiv_class_alloc;
3389 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3394 if (BE (strlen ((const char *) name) != 1, 0))
3395 return REG_ECOLLATE;
3396 bitset_set (sbcset, *name);
3401 /* Helper function for parse_bracket_exp.
3402 Build the character class which is represented by NAME.
3403 The result are written to MBCSET and SBCSET.
3404 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3405 is a pointer argument sinse we may update it. */
3407 static reg_errcode_t
3408 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3409 #ifdef RE_ENABLE_I18N
3410 re_charset_t *mbcset, int *char_class_alloc,
3412 const unsigned char *class_name, reg_syntax_t syntax)
3415 const char *name = (const char *) class_name;
3417 /* In case of REG_ICASE "upper" and "lower" match the both of
3418 upper and lower cases. */
3419 if ((syntax & REG_IGNORE_CASE)
3420 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3423 #ifdef RE_ENABLE_I18N
3424 /* Check the space of the arrays. */
3425 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3427 /* Not enough, realloc it. */
3428 /* +1 in case of mbcset->nchar_classes is 0. */
3429 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3430 /* Use realloc since array is NULL if *alloc == 0. */
3431 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3432 new_char_class_alloc);
3433 if (BE (new_char_classes == NULL, 0))
3435 mbcset->char_classes = new_char_classes;
3436 *char_class_alloc = new_char_class_alloc;
3438 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3444 if (ctype_func (i)) \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3451 if (strcmp (name, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum)
3453 else if (strcmp (name, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl)
3455 else if (strcmp (name, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower)
3457 else if (strcmp (name, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace)
3459 else if (strcmp (name, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha)
3461 else if (strcmp (name, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit)
3463 else if (strcmp (name, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint)
3465 else if (strcmp (name, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper)
3467 else if (strcmp (name, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank)
3469 else if (strcmp (name, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph)
3471 else if (strcmp (name, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct)
3473 else if (strcmp (name, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit)
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483 const unsigned char *class_name,
3484 const unsigned char *extra,
3485 int non_match, reg_errcode_t *err)
3487 re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489 re_charset_t *mbcset;
3491 #endif /* not RE_ENABLE_I18N */
3493 re_token_t br_token;
3496 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498 mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3501 #ifdef RE_ENABLE_I18N
3502 if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504 if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3513 #ifdef RE_ENABLE_I18N
3515 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516 bitset_set(cset->sbcset, '\0');
3518 mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3522 /* We don't care the syntax in this case. */
3523 ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3526 #endif /* RE_ENABLE_I18N */
3529 if (BE (ret != REG_NOERROR, 0))
3532 #ifdef RE_ENABLE_I18N
3533 free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3538 /* \w match '_' also. */
3539 for (; *extra; extra++)
3540 bitset_set (sbcset, *extra);
3542 /* If it is non-matching list. */
3544 bitset_not (sbcset);
3546 #ifdef RE_ENABLE_I18N
3547 /* Ensure only single byte characters are set. */
3548 if (dfa->mb_cur_max > 1)
3549 bitset_mask (sbcset, dfa->sb_char);
3552 /* Build a tree for simple bracket. */
3553 br_token.type = SIMPLE_BRACKET;
3554 br_token.opr.sbcset = sbcset;
3555 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556 if (BE (tree == NULL, 0))
3557 goto build_word_op_espace;
3559 #ifdef RE_ENABLE_I18N
3560 if (dfa->mb_cur_max > 1)
3562 bin_tree_t *mbc_tree;
3563 /* Build a tree for complex bracket. */
3564 br_token.type = COMPLEX_BRACKET;
3565 br_token.opr.mbcset = mbcset;
3566 dfa->has_mb_node = 1;
3567 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568 if (BE (mbc_tree == NULL, 0))
3569 goto build_word_op_espace;
3570 /* Then join them by ALT node. */
3571 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572 if (BE (mbc_tree != NULL, 1))
3577 free_charset (mbcset);
3580 #else /* not RE_ENABLE_I18N */
3582 #endif /* not RE_ENABLE_I18N */
3584 build_word_op_espace:
3586 #ifdef RE_ENABLE_I18N
3587 free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3593 /* This is intended for the expressions like "a{1,3}".
3594 Fetch a number from `input', and return the number.
3595 Return -1, if the number field is empty like "{,1}".
3596 Return -2, If an error is occured. */
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3605 fetch_token (token, input, syntax);
3607 if (BE (token->type == END_OF_RE, 0))
3609 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3611 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3612 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3613 num = (num > REG_DUP_MAX) ? -2 : num;
3618 #ifdef RE_ENABLE_I18N
3620 free_charset (re_charset_t *cset)
3622 re_free (cset->mbchars);
3624 re_free (cset->coll_syms);
3625 re_free (cset->equiv_classes);
3626 re_free (cset->range_starts);
3627 re_free (cset->range_ends);
3629 re_free (cset->char_classes);
3632 #endif /* RE_ENABLE_I18N */
3634 /* Functions for binary tree operation. */
3636 /* Create a tree node. */
3639 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3640 re_token_type_t type)
3644 return create_token_tree (dfa, left, right, &t);
3648 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3649 const re_token_t *token)
3652 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3654 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3656 if (storage == NULL)
3658 storage->next = dfa->str_tree_storage;
3659 dfa->str_tree_storage = storage;
3660 dfa->str_tree_storage_idx = 0;
3662 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3664 tree->parent = NULL;
3666 tree->right = right;
3667 tree->token = *token;
3668 tree->token.duplicated = 0;
3669 tree->token.opt_subexp = 0;
3672 tree->node_idx = -1;
3675 left->parent = tree;
3677 right->parent = tree;
3681 /* Mark the tree SRC as an optional subexpression.
3682 To be called from preorder or postorder. */
3684 static reg_errcode_t
3685 mark_opt_subexp (void *extra, bin_tree_t *node)
3687 int idx = (int) (long) extra;
3688 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3689 node->token.opt_subexp = 1;
3694 /* Free the allocated memory inside NODE. */
3697 free_token (re_token_t *node)
3699 #ifdef RE_ENABLE_I18N
3700 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3701 free_charset (node->opr.mbcset);
3703 #endif /* RE_ENABLE_I18N */
3704 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3705 re_free (node->opr.sbcset);
3708 /* Worker function for tree walking. Free the allocated memory inside NODE
3709 and its children. */
3711 static reg_errcode_t
3712 free_tree (void *extra, bin_tree_t *node)
3714 free_token (&node->token);
3719 /* Duplicate the node SRC, and return new node. This is a preorder
3720 visit similar to the one implemented by the generic visitor, but
3721 we need more infrastructure to maintain two parallel trees --- so,
3722 it's easier to duplicate. */
3725 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3727 const bin_tree_t *node;
3728 bin_tree_t *dup_root;
3729 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3731 for (node = root; ; )
3733 /* Create a new tree and link it back to the current parent. */
3734 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3737 (*p_new)->parent = dup_node;
3738 (*p_new)->token.duplicated = 1;
3741 /* Go to the left node, or up and to the right. */
3745 p_new = &dup_node->left;
3749 const bin_tree_t *prev = NULL;
3750 while (node->right == prev || node->right == NULL)
3753 node = node->parent;
3754 dup_node = dup_node->parent;
3759 p_new = &dup_node->right;