1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 int node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319 *p++ = dfa->nodes[node].opr.c;
320 while (++node < dfa->nodes_len
321 && dfa->nodes[node].type == CHARACTER
322 && dfa->nodes[node].mb_partial)
323 *p++ = dfa->nodes[node].opr.c;
324 memset (&state, 0, sizeof (state));
325 if (mbrtowc (&wc, (const char *) buf, p - buf,
327 && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 re_set_fastmap (fastmap, 0, buf[0]);
333 else if (type == SIMPLE_BRACKET)
336 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
337 for (j = 0; j < UINT_BITS; ++j, ++ch)
338 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
339 re_set_fastmap (fastmap, icase, ch);
341 #ifdef RE_ENABLE_I18N
342 else if (type == COMPLEX_BRACKET)
345 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
346 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
347 || cset->nranges || cset->nchar_classes)
350 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352 /* In this case we want to catch the bytes which are
353 the first byte of any collation elements.
354 e.g. In da_DK, we want to catch 'a' since "aa"
355 is a valid collation element, and don't catch
356 'b' since 'b' is the only collation element
357 which starts from 'b'. */
359 const int32_t *table = (const int32_t *)
360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
362 for (j = 0; j < UINT_BITS; ++j, ++ch)
364 re_set_fastmap (fastmap, icase, ch);
367 if (dfa->mb_cur_max > 1)
368 for (i = 0; i < SBC_MAX; ++i)
369 if (__btowc (i) == WEOF)
370 re_set_fastmap (fastmap, icase, i);
371 # endif /* not _LIBC */
373 for (i = 0; i < cset->nmbchars; ++i)
377 memset (&state, '\0', sizeof (state));
378 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
379 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
380 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
382 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
384 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
388 #endif /* RE_ENABLE_I18N */
389 else if (type == OP_PERIOD
390 #ifdef RE_ENABLE_I18N
391 || type == OP_UTF8_PERIOD
392 #endif /* RE_ENABLE_I18N */
393 || type == END_OF_RE)
395 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
396 if (type == END_OF_RE)
397 bufp->re_can_be_null = 1;
403 /* Entry point for POSIX code. */
404 /* regcomp takes a regular expression as a string and compiles it.
406 PREG is a regex_t *. We do not expect any fields to be initialized,
407 since POSIX says we shouldn't. Thus, we set
409 `re_buffer' to the compiled pattern;
410 `re_used' to the length of the compiled pattern;
411 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
412 REG_EXTENDED bit in CFLAGS is set; otherwise, to
413 REG_SYNTAX_POSIX_BASIC;
414 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
415 `re_fastmap' to an allocated space for the fastmap;
416 `re_fastmap_accurate' to zero;
417 `re_nsub' to the number of subexpressions in PATTERN.
419 PATTERN is the address of the pattern string.
421 CFLAGS is a series of bits which affect compilation.
423 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
424 use POSIX basic syntax.
426 If REG_NEWLINE is set, then . and [^...] don't match newline.
427 Also, regexec will try a match beginning after every newline.
429 If REG_ICASE is set, then we considers upper- and lowercase
430 versions of letters to be equivalent when matching.
432 If REG_NOSUB is set, then when PREG is passed to regexec, that
433 routine will report only success or failure, and nothing about the
436 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
437 the return codes and their meanings.) */
440 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
443 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
444 : REG_SYNTAX_POSIX_BASIC);
446 preg->re_buffer = NULL;
447 preg->re_allocated = 0;
450 /* Try to allocate space for the fastmap. */
451 preg->re_fastmap = re_malloc (char, SBC_MAX);
452 if (BE (preg->re_fastmap == NULL, 0))
455 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
457 /* If REG_NEWLINE is set, newlines are treated differently. */
458 if (cflags & REG_NEWLINE)
459 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
460 syntax &= ~REG_DOT_NEWLINE;
461 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
462 /* It also changes the matching behavior. */
463 preg->re_newline_anchor = 1;
466 preg->re_newline_anchor = 0;
467 preg->re_no_sub = !!(cflags & REG_NOSUB);
468 preg->re_translate = NULL;
470 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
472 /* POSIX doesn't distinguish between an unmatched open-group and an
473 unmatched close-group: both are REG_EPAREN. */
474 if (ret == REG_ERPAREN)
477 /* We have already checked preg->re_fastmap != NULL. */
478 if (BE (ret == REG_NOERROR, 1))
479 /* Compute the fastmap now, since regexec cannot modify the pattern
480 buffer. This function never fails in this implementation. */
481 (void) re_compile_fastmap (preg);
484 /* Some error occurred while compiling the expression. */
485 re_free (preg->re_fastmap);
486 preg->re_fastmap = NULL;
492 weak_alias (__regcomp, regcomp)
495 /* Returns a message corresponding to an error code, ERRCODE, returned
496 from either regcomp or regexec. We don't use PREG here. */
499 regerror (int errcode, const regex_t *__restrict preg,
500 char *__restrict errbuf, size_t errbuf_size)
506 || errcode >= (int) (sizeof (__re_error_msgid_idx)
507 / sizeof (__re_error_msgid_idx[0])), 0))
508 /* Only error codes returned by the rest of the code should be passed
509 to this routine. If we are given anything else, or if other regex
510 code generates an invalid error code, then the program has a bug.
511 Dump core so we can fix it. */
514 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
516 msg_size = strlen (msg) + 1; /* Includes the null. */
518 if (BE (errbuf_size != 0, 1))
520 if (BE (msg_size > errbuf_size, 0))
522 #if defined HAVE_MEMPCPY || defined _LIBC
523 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
525 memcpy (errbuf, msg, errbuf_size - 1);
526 errbuf[errbuf_size - 1] = 0;
530 memcpy (errbuf, msg, msg_size);
536 weak_alias (__regerror, regerror)
540 #ifdef RE_ENABLE_I18N
541 /* This static array is used for the map to single-byte characters when
542 UTF-8 is used. Otherwise we would allocate memory just to initialize
543 it the same all the time. UTF-8 is the preferred encoding so this is
544 a worthwhile optimization. */
545 static const bitset utf8_sb_map =
547 /* Set the first 128 bits. */
548 # if UINT_MAX == 0xffffffff
549 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
551 # error "Add case for new unsigned int size"
558 free_dfa_content (re_dfa_t *dfa)
563 for (i = 0; i < dfa->nodes_len; ++i)
564 free_token (dfa->nodes + i);
565 re_free (dfa->nexts);
566 for (i = 0; i < dfa->nodes_len; ++i)
568 if (dfa->eclosures != NULL)
569 re_node_set_free (dfa->eclosures + i);
570 if (dfa->inveclosures != NULL)
571 re_node_set_free (dfa->inveclosures + i);
572 if (dfa->edests != NULL)
573 re_node_set_free (dfa->edests + i);
575 re_free (dfa->edests);
576 re_free (dfa->eclosures);
577 re_free (dfa->inveclosures);
578 re_free (dfa->nodes);
580 if (dfa->state_table)
581 for (i = 0; i <= dfa->state_hash_mask; ++i)
583 struct re_state_table_entry *entry = dfa->state_table + i;
584 for (j = 0; j < entry->num; ++j)
586 re_dfastate_t *state = entry->array[j];
589 re_free (entry->array);
591 re_free (dfa->state_table);
592 #ifdef RE_ENABLE_I18N
593 if (dfa->sb_char != utf8_sb_map)
594 re_free (dfa->sb_char);
596 re_free (dfa->subexp_map);
598 re_free (dfa->re_str);
605 /* Free dynamically allocated space used by PREG. */
608 regfree (regex_t *preg)
610 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
611 if (BE (dfa != NULL, 1))
612 free_dfa_content (dfa);
613 preg->re_buffer = NULL;
614 preg->re_allocated = 0;
616 re_free (preg->re_fastmap);
617 preg->re_fastmap = NULL;
619 re_free (preg->re_translate);
620 preg->re_translate = NULL;
623 weak_alias (__regfree, regfree)
626 /* Entry points compatible with 4.2 BSD regex library. We don't define
627 them unless specifically requested. */
629 #if defined _REGEX_RE_COMP || defined _LIBC
631 /* BSD has one and only one pattern buffer. */
632 static struct re_pattern_buffer re_comp_buf;
636 /* Make these definitions weak in libc, so POSIX programs can redefine
637 these names if they don't use our functions, and still use
638 regcomp/regexec above without link errors. */
649 if (!re_comp_buf.re_buffer)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf.re_buffer)
656 fastmap = re_comp_buf.re_fastmap;
657 re_comp_buf.re_fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.re_fastmap = fastmap;
663 if (re_comp_buf.re_fastmap == NULL)
665 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
666 if (re_comp_buf.re_fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since `re_exec' always passes NULL for the `regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.re_newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
682 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t *preg, const char * pattern, int length,
703 reg_errcode_t err = REG_NOERROR;
707 /* Initialize the pattern buffer. */
708 preg->re_fastmap_accurate = 0;
709 preg->re_syntax = syntax;
710 preg->re_not_bol = preg->re_not_eol = 0;
713 preg->re_can_be_null = 0;
714 preg->re_regs_allocated = REG_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->re_buffer;
718 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If buffer is null this
723 is a simple allocation. */
724 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
727 preg->re_allocated = sizeof (re_dfa_t);
728 preg->re_buffer = (unsigned char *) dfa;
730 preg->re_used = sizeof (re_dfa_t);
732 __libc_lock_init (dfa->lock);
734 err = init_dfa (dfa, length);
735 if (BE (err != REG_NOERROR, 0))
737 free_dfa_content (dfa);
738 preg->re_buffer = NULL;
739 preg->re_allocated = 0;
743 dfa->re_str = re_malloc (char, length + 1);
744 strncpy (dfa->re_str, pattern, length + 1);
747 err = re_string_construct (®exp, pattern, length, preg->re_translate,
748 syntax & REG_IGNORE_CASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->re_buffer = NULL;
756 preg->re_allocated = 0;
760 /* Parse the regular expression, and build a structure tree. */
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->re_buffer = NULL;
788 preg->re_allocated = 0;
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
798 init_dfa (re_dfa_t *dfa, int pat_len)
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 dfa->states_alloc = pat_len + 1;
815 /* table_size = 2 ^ ceil(log pat_len) */
816 for (table_size = 1; table_size > 0; table_size <<= 1)
817 if (table_size > pat_len)
820 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
821 dfa->state_hash_mask = table_size - 1;
823 dfa->mb_cur_max = MB_CUR_MAX;
825 if (dfa->mb_cur_max == 6
826 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
828 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
831 # ifdef HAVE_LANGINFO_CODESET
832 codeset_name = nl_langinfo (CODESET);
834 codeset_name = getenv ("LC_ALL");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LC_CTYPE");
837 if (codeset_name == NULL || codeset_name[0] == '\0')
838 codeset_name = getenv ("LANG");
839 if (codeset_name == NULL)
841 else if (strchr (codeset_name, '.') != NULL)
842 codeset_name = strchr (codeset_name, '.') + 1;
845 if (strcasecmp (codeset_name, "UTF-8") == 0
846 || strcasecmp (codeset_name, "UTF8") == 0)
849 /* We check exhaustively in the loop below if this charset is a
850 superset of ASCII. */
851 dfa->map_notascii = 0;
854 #ifdef RE_ENABLE_I18N
855 if (dfa->mb_cur_max > 1)
858 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
863 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
864 if (BE (dfa->sb_char == NULL, 0))
867 /* Clear all bits by, then set those corresponding to single
869 bitset_empty (dfa->sb_char);
871 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
872 for (j = 0; j < UINT_BITS; ++j, ++ch)
874 wint_t wch = __btowc (ch);
876 dfa->sb_char[i] |= 1 << j;
878 if (isascii (ch) && wch != ch)
879 dfa->map_notascii = 1;
886 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
891 /* Initialize WORD_CHAR table, which indicate which character is
892 "word". In this case "word" means that it is the word construction
893 character used by some operators like "\<", "\>", etc. */
896 init_word_char (re_dfa_t *dfa)
899 dfa->word_ops_used = 1;
900 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
901 for (j = 0; j < UINT_BITS; ++j, ++ch)
902 if (isalnum (ch) || ch == '_')
903 dfa->word_char[i] |= 1 << j;
906 /* Free the work area which are only used while compiling. */
909 free_workarea_compile (regex_t *preg)
911 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
912 bin_tree_storage_t *storage, *next;
913 for (storage = dfa->str_tree_storage; storage; storage = next)
915 next = storage->next;
918 dfa->str_tree_storage = NULL;
919 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
920 dfa->str_tree = NULL;
921 re_free (dfa->org_indices);
922 dfa->org_indices = NULL;
925 /* Create initial states for all contexts. */
928 create_initial_state (re_dfa_t *dfa)
932 re_node_set init_nodes;
934 /* Initial states have the epsilon closure of the node which is
935 the first node of the regular expression. */
936 first = dfa->str_tree->first->node_idx;
937 dfa->init_node = first;
938 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
939 if (BE (err != REG_NOERROR, 0))
942 /* The back-references which are in initial states can epsilon transit,
943 since in this case all of the subexpressions can be null.
944 Then we add epsilon closures of the nodes which are the next nodes of
945 the back-references. */
946 if (dfa->nbackref > 0)
947 for (i = 0; i < init_nodes.nelem; ++i)
949 int node_idx = init_nodes.elems[i];
950 re_token_type_t type = dfa->nodes[node_idx].type;
953 if (type != OP_BACK_REF)
955 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
957 re_token_t *clexp_node;
958 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
959 if (clexp_node->type == OP_CLOSE_SUBEXP
960 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
963 if (clexp_idx == init_nodes.nelem)
966 if (type == OP_BACK_REF)
968 int dest_idx = dfa->edests[node_idx].elems[0];
969 if (!re_node_set_contains (&init_nodes, dest_idx))
971 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
977 /* It must be the first time to invoke acquire_state. */
978 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
979 /* We don't check ERR here, since the initial state must not be NULL. */
980 if (BE (dfa->init_state == NULL, 0))
982 if (dfa->init_state->has_constraint)
984 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
986 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
988 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
992 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
993 || dfa->init_state_begbuf == NULL, 0))
997 dfa->init_state_word = dfa->init_state_nl
998 = dfa->init_state_begbuf = dfa->init_state;
1000 re_node_set_free (&init_nodes);
1004 #ifdef RE_ENABLE_I18N
1005 /* If it is possible to do searching in single byte encoding instead of UTF-8
1006 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1007 DFA nodes where needed. */
1010 optimize_utf8 (re_dfa_t *dfa)
1012 int node, i, mb_chars = 0, has_period = 0;
1014 for (node = 0; node < dfa->nodes_len; ++node)
1015 switch (dfa->nodes[node].type)
1018 if (dfa->nodes[node].opr.c >= 0x80)
1022 switch (dfa->nodes[node].opr.idx)
1030 /* Word anchors etc. cannot be handled. */
1040 case OP_DUP_ASTERISK:
1041 case OP_OPEN_SUBEXP:
1042 case OP_CLOSE_SUBEXP:
1044 case COMPLEX_BRACKET:
1046 case SIMPLE_BRACKET:
1047 /* Just double check. */
1048 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1049 if (dfa->nodes[node].opr.sbcset[i])
1056 if (mb_chars || has_period)
1057 for (node = 0; node < dfa->nodes_len; ++node)
1059 if (dfa->nodes[node].type == CHARACTER
1060 && dfa->nodes[node].opr.c >= 0x80)
1061 dfa->nodes[node].mb_partial = 0;
1062 else if (dfa->nodes[node].type == OP_PERIOD)
1063 dfa->nodes[node].type = OP_UTF8_PERIOD;
1066 /* The search can be in single byte locale. */
1067 dfa->mb_cur_max = 1;
1069 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1073 /* Analyze the structure tree, and calculate "first", "next", "edest",
1074 "eclosure", and "inveclosure". */
1076 static reg_errcode_t
1077 analyze (regex_t *preg)
1079 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1082 /* Allocate arrays. */
1083 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1084 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1085 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1086 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1087 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1088 || dfa->eclosures == NULL, 0))
1091 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1092 if (dfa->subexp_map != NULL)
1095 for (i = 0; i < preg->re_nsub; i++)
1096 dfa->subexp_map[i] = i;
1097 preorder (dfa->str_tree, optimize_subexps, dfa);
1098 for (i = 0; i < preg->re_nsub; i++)
1099 if (dfa->subexp_map[i] != i)
1101 if (i == preg->re_nsub)
1103 free (dfa->subexp_map);
1104 dfa->subexp_map = NULL;
1108 ret = postorder (dfa->str_tree, lower_subexps, preg);
1109 if (BE (ret != REG_NOERROR, 0))
1111 ret = postorder (dfa->str_tree, calc_first, dfa);
1112 if (BE (ret != REG_NOERROR, 0))
1114 preorder (dfa->str_tree, calc_next, dfa);
1115 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1116 if (BE (ret != REG_NOERROR, 0))
1118 ret = calc_eclosure (dfa);
1119 if (BE (ret != REG_NOERROR, 0))
1122 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1123 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1124 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1127 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1128 if (BE (dfa->inveclosures == NULL, 0))
1130 ret = calc_inveclosure (dfa);
1136 /* Our parse trees are very unbalanced, so we cannot use a stack to
1137 implement parse tree visits. Instead, we use parent pointers and
1138 some hairy code in these two functions. */
1139 static reg_errcode_t
1140 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1143 bin_tree_t *node, *prev;
1145 for (node = root; ; )
1147 /* Descend down the tree, preferably to the left (or to the right
1148 if that's the only child). */
1149 while (node->left || node->right)
1157 reg_errcode_t err = fn (extra, node);
1158 if (BE (err != REG_NOERROR, 0))
1160 if (node->parent == NULL)
1163 node = node->parent;
1165 /* Go up while we have a node that is reached from the right. */
1166 while (node->right == prev || node->right == NULL);
1171 static reg_errcode_t
1172 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1177 for (node = root; ; )
1179 reg_errcode_t err = fn (extra, node);
1180 if (BE (err != REG_NOERROR, 0))
1183 /* Go to the left node, or up and to the right. */
1188 bin_tree_t *prev = NULL;
1189 while (node->right == prev || node->right == NULL)
1192 node = node->parent;
1201 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1202 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1203 backreferences as well. Requires a preorder visit. */
1204 static reg_errcode_t
1205 optimize_subexps (void *extra, bin_tree_t *node)
1207 re_dfa_t *dfa = (re_dfa_t *) extra;
1209 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1211 int idx = node->token.opr.idx;
1212 node->token.opr.idx = dfa->subexp_map[idx];
1213 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1216 else if (node->token.type == SUBEXP
1217 && node->left && node->left->token.type == SUBEXP)
1219 int other_idx = node->left->token.opr.idx;
1221 node->left = node->left->left;
1223 node->left->parent = node;
1225 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1226 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1227 dfa->used_bkref_map &= ~(1 << other_idx);
1233 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1234 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1235 static reg_errcode_t
1236 lower_subexps (void *extra, bin_tree_t *node)
1238 regex_t *preg = (regex_t *) extra;
1239 reg_errcode_t err = REG_NOERROR;
1241 if (node->left && node->left->token.type == SUBEXP)
1243 node->left = lower_subexp (&err, preg, node->left);
1245 node->left->parent = node;
1247 if (node->right && node->right->token.type == SUBEXP)
1249 node->right = lower_subexp (&err, preg, node->right);
1251 node->right->parent = node;
1258 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1260 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1261 bin_tree_t *body = node->left;
1262 bin_tree_t *op, *cls, *tree1, *tree;
1265 /* We do not optimize empty subexpressions, because otherwise we may
1266 have bad CONCAT nodes with NULL children. This is obviously not
1267 very common, so we do not lose much. An example that triggers
1268 this case is the sed "script" /\(\)/x. */
1269 && node->left != NULL
1270 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1271 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1274 /* Convert the SUBEXP node to the concatenation of an
1275 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1276 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1277 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1278 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1279 tree = create_tree (dfa, op, tree1, CONCAT);
1280 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1286 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1287 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1291 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1292 nodes. Requires a postorder visit. */
1293 static reg_errcode_t
1294 calc_first (void *extra, bin_tree_t *node)
1296 re_dfa_t *dfa = (re_dfa_t *) extra;
1297 if (node->token.type == CONCAT)
1299 node->first = node->left->first;
1300 node->node_idx = node->left->node_idx;
1305 node->node_idx = re_dfa_add_node (dfa, node->token);
1306 if (BE (node->node_idx == -1, 0))
1312 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1313 static reg_errcode_t
1314 calc_next (void *extra, bin_tree_t *node)
1316 switch (node->token.type)
1318 case OP_DUP_ASTERISK:
1319 node->left->next = node;
1322 node->left->next = node->right->first;
1323 node->right->next = node->next;
1327 node->left->next = node->next;
1329 node->right->next = node->next;
1335 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1336 static reg_errcode_t
1337 link_nfa_nodes (void *extra, bin_tree_t *node)
1339 re_dfa_t *dfa = (re_dfa_t *) extra;
1340 int idx = node->node_idx;
1341 reg_errcode_t err = REG_NOERROR;
1343 switch (node->token.type)
1349 assert (node->next == NULL);
1352 case OP_DUP_ASTERISK:
1356 dfa->has_plural_match = 1;
1357 if (node->left != NULL)
1358 left = node->left->first->node_idx;
1360 left = node->next->node_idx;
1361 if (node->right != NULL)
1362 right = node->right->first->node_idx;
1364 right = node->next->node_idx;
1366 assert (right > -1);
1367 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1372 case OP_OPEN_SUBEXP:
1373 case OP_CLOSE_SUBEXP:
1374 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1378 dfa->nexts[idx] = node->next->node_idx;
1379 if (node->token.type == OP_BACK_REF)
1380 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1384 assert (!IS_EPSILON_NODE (node->token.type));
1385 dfa->nexts[idx] = node->next->node_idx;
1392 /* Duplicate the epsilon closure of the node ROOT_NODE.
1393 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1394 to their own constraint. */
1396 static reg_errcode_t
1397 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1398 int root_node, unsigned int init_constraint)
1400 int org_node, clone_node, ret;
1401 unsigned int constraint = init_constraint;
1402 for (org_node = top_org_node, clone_node = top_clone_node;;)
1404 int org_dest, clone_dest;
1405 if (dfa->nodes[org_node].type == OP_BACK_REF)
1407 /* If the back reference epsilon-transit, its destination must
1408 also have the constraint. Then duplicate the epsilon closure
1409 of the destination of the back reference, and store it in
1410 edests of the back reference. */
1411 org_dest = dfa->nexts[org_node];
1412 re_node_set_empty (dfa->edests + clone_node);
1413 clone_dest = duplicate_node (dfa, org_dest, constraint);
1414 if (BE (clone_dest == -1, 0))
1416 dfa->nexts[clone_node] = dfa->nexts[org_node];
1417 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1418 if (BE (ret < 0, 0))
1421 else if (dfa->edests[org_node].nelem == 0)
1423 /* In case of the node can't epsilon-transit, don't duplicate the
1424 destination and store the original destination as the
1425 destination of the node. */
1426 dfa->nexts[clone_node] = dfa->nexts[org_node];
1429 else if (dfa->edests[org_node].nelem == 1)
1431 /* In case of the node can epsilon-transit, and it has only one
1433 org_dest = dfa->edests[org_node].elems[0];
1434 re_node_set_empty (dfa->edests + clone_node);
1435 if (dfa->nodes[org_node].type == ANCHOR)
1437 /* In case of the node has another constraint, append it. */
1438 if (org_node == root_node && clone_node != org_node)
1440 /* ...but if the node is root_node itself, it means the
1441 epsilon closure have a loop, then tie it to the
1442 destination of the root_node. */
1443 ret = re_node_set_insert (dfa->edests + clone_node,
1445 if (BE (ret < 0, 0))
1449 constraint |= dfa->nodes[org_node].opr.ctx_type;
1451 clone_dest = duplicate_node (dfa, org_dest, constraint);
1452 if (BE (clone_dest == -1, 0))
1454 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1455 if (BE (ret < 0, 0))
1458 else /* dfa->edests[org_node].nelem == 2 */
1460 /* In case of the node can epsilon-transit, and it has two
1461 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1462 org_dest = dfa->edests[org_node].elems[0];
1463 re_node_set_empty (dfa->edests + clone_node);
1464 /* Search for a duplicated node which satisfies the constraint. */
1465 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1466 if (clone_dest == -1)
1468 /* There are no such a duplicated node, create a new one. */
1470 clone_dest = duplicate_node (dfa, org_dest, constraint);
1471 if (BE (clone_dest == -1, 0))
1473 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1474 if (BE (ret < 0, 0))
1476 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1477 root_node, constraint);
1478 if (BE (err != REG_NOERROR, 0))
1483 /* There are a duplicated node which satisfy the constraint,
1484 use it to avoid infinite loop. */
1485 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1486 if (BE (ret < 0, 0))
1490 org_dest = dfa->edests[org_node].elems[1];
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1494 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (ret < 0, 0))
1498 org_node = org_dest;
1499 clone_node = clone_dest;
1504 /* Search for a node which is duplicated from the node ORG_NODE, and
1505 satisfies the constraint CONSTRAINT. */
1508 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1511 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1513 if (org_node == dfa->org_indices[idx]
1514 && constraint == dfa->nodes[idx].constraint)
1515 return idx; /* Found. */
1517 return -1; /* Not found. */
1520 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1521 Return the index of the new node, or -1 if insufficient storage is
1525 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1527 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1528 if (BE (dup_idx != -1, 1))
1530 dfa->nodes[dup_idx].constraint = constraint;
1531 if (dfa->nodes[org_idx].type == ANCHOR)
1532 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1533 dfa->nodes[dup_idx].duplicated = 1;
1535 /* Store the index of the original node. */
1536 dfa->org_indices[dup_idx] = org_idx;
1541 static reg_errcode_t
1542 calc_inveclosure (re_dfa_t *dfa)
1545 for (idx = 0; idx < dfa->nodes_len; ++idx)
1546 re_node_set_init_empty (dfa->inveclosures + idx);
1548 for (src = 0; src < dfa->nodes_len; ++src)
1550 int *elems = dfa->eclosures[src].elems;
1551 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1553 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1554 if (BE (ret == -1, 0))
1562 /* Calculate "eclosure" for all the node in DFA. */
1564 static reg_errcode_t
1565 calc_eclosure (re_dfa_t *dfa)
1567 int node_idx, incomplete;
1569 assert (dfa->nodes_len > 0);
1572 /* For each nodes, calculate epsilon closure. */
1573 for (node_idx = 0; ; ++node_idx)
1576 re_node_set eclosure_elem;
1577 if (node_idx == dfa->nodes_len)
1586 assert (dfa->eclosures[node_idx].nelem != -1);
1589 /* If we have already calculated, skip it. */
1590 if (dfa->eclosures[node_idx].nelem != 0)
1592 /* Calculate epsilon closure of `node_idx'. */
1593 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1594 if (BE (err != REG_NOERROR, 0))
1597 if (dfa->eclosures[node_idx].nelem == 0)
1600 re_node_set_free (&eclosure_elem);
1606 /* Calculate epsilon closure of NODE. */
1608 static reg_errcode_t
1609 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1612 unsigned int constraint;
1614 re_node_set eclosure;
1616 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1617 if (BE (err != REG_NOERROR, 0))
1620 /* This indicates that we are calculating this node now.
1621 We reference this value to avoid infinite loop. */
1622 dfa->eclosures[node].nelem = -1;
1624 constraint = ((dfa->nodes[node].type == ANCHOR)
1625 ? dfa->nodes[node].opr.ctx_type : 0);
1626 /* If the current node has constraints, duplicate all nodes.
1627 Since they must inherit the constraints. */
1629 && dfa->edests[node].nelem
1630 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1632 int org_node, cur_node;
1633 org_node = cur_node = node;
1634 err = duplicate_node_closure (dfa, node, node, node, constraint);
1635 if (BE (err != REG_NOERROR, 0))
1639 /* Expand each epsilon destination nodes. */
1640 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1641 for (i = 0; i < dfa->edests[node].nelem; ++i)
1643 re_node_set eclosure_elem;
1644 int edest = dfa->edests[node].elems[i];
1645 /* If calculating the epsilon closure of `edest' is in progress,
1646 return intermediate result. */
1647 if (dfa->eclosures[edest].nelem == -1)
1652 /* If we haven't calculated the epsilon closure of `edest' yet,
1653 calculate now. Otherwise use calculated epsilon closure. */
1654 if (dfa->eclosures[edest].nelem == 0)
1656 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1657 if (BE (err != REG_NOERROR, 0))
1661 eclosure_elem = dfa->eclosures[edest];
1662 /* Merge the epsilon closure of `edest'. */
1663 re_node_set_merge (&eclosure, &eclosure_elem);
1664 /* If the epsilon closure of `edest' is incomplete,
1665 the epsilon closure of this node is also incomplete. */
1666 if (dfa->eclosures[edest].nelem == 0)
1669 re_node_set_free (&eclosure_elem);
1673 /* Epsilon closures include itself. */
1674 re_node_set_insert (&eclosure, node);
1675 if (incomplete && !root)
1676 dfa->eclosures[node].nelem = 0;
1678 dfa->eclosures[node] = eclosure;
1679 *new_set = eclosure;
1683 /* Functions for token which are used in the parser. */
1685 /* Fetch a token from INPUT.
1686 We must not use this function inside bracket expressions. */
1689 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1691 re_string_skip_bytes (input, peek_token (result, input, syntax));
1694 /* Peek a token from INPUT, and return the length of the token.
1695 We must not use this function inside bracket expressions. */
1698 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1702 if (re_string_eoi (input))
1704 token->type = END_OF_RE;
1708 c = re_string_peek_byte (input, 0);
1711 token->word_char = 0;
1712 #ifdef RE_ENABLE_I18N
1713 token->mb_partial = 0;
1714 if (input->mb_cur_max > 1 &&
1715 !re_string_first_byte (input, re_string_cur_idx (input)))
1717 token->type = CHARACTER;
1718 token->mb_partial = 1;
1725 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1727 token->type = BACK_SLASH;
1731 c2 = re_string_peek_byte_case (input, 1);
1733 token->type = CHARACTER;
1734 #ifdef RE_ENABLE_I18N
1735 if (input->mb_cur_max > 1)
1737 wint_t wc = re_string_wchar_at (input,
1738 re_string_cur_idx (input) + 1);
1739 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1743 token->word_char = IS_WORD_CHAR (c2) != 0;
1748 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1749 token->type = OP_ALT;
1751 case '1': case '2': case '3': case '4': case '5':
1752 case '6': case '7': case '8': case '9':
1753 if (!(syntax & REG_NO_BK_REFS))
1755 token->type = OP_BACK_REF;
1756 token->opr.idx = c2 - '1';
1760 if (!(syntax & REG_NO_GNU_OPS))
1762 token->type = ANCHOR;
1763 token->opr.ctx_type = WORD_FIRST;
1767 if (!(syntax & REG_NO_GNU_OPS))
1769 token->type = ANCHOR;
1770 token->opr.ctx_type = WORD_LAST;
1774 if (!(syntax & REG_NO_GNU_OPS))
1776 token->type = ANCHOR;
1777 token->opr.ctx_type = WORD_DELIM;
1781 if (!(syntax & REG_NO_GNU_OPS))
1783 token->type = ANCHOR;
1784 token->opr.ctx_type = NOT_WORD_DELIM;
1788 if (!(syntax & REG_NO_GNU_OPS))
1789 token->type = OP_WORD;
1792 if (!(syntax & REG_NO_GNU_OPS))
1793 token->type = OP_NOTWORD;
1796 if (!(syntax & REG_NO_GNU_OPS))
1797 token->type = OP_SPACE;
1800 if (!(syntax & REG_NO_GNU_OPS))
1801 token->type = OP_NOTSPACE;
1804 if (!(syntax & REG_NO_GNU_OPS))
1806 token->type = ANCHOR;
1807 token->opr.ctx_type = BUF_FIRST;
1811 if (!(syntax & REG_NO_GNU_OPS))
1813 token->type = ANCHOR;
1814 token->opr.ctx_type = BUF_LAST;
1818 if (!(syntax & REG_NO_BK_PARENS))
1819 token->type = OP_OPEN_SUBEXP;
1822 if (!(syntax & REG_NO_BK_PARENS))
1823 token->type = OP_CLOSE_SUBEXP;
1826 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1827 token->type = OP_DUP_PLUS;
1830 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1831 token->type = OP_DUP_QUESTION;
1834 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1835 token->type = OP_OPEN_DUP_NUM;
1838 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1839 token->type = OP_CLOSE_DUP_NUM;
1847 token->type = CHARACTER;
1848 #ifdef RE_ENABLE_I18N
1849 if (input->mb_cur_max > 1)
1851 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1852 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1856 token->word_char = IS_WORD_CHAR (token->opr.c);
1861 if (syntax & REG_NEWLINE_ALT)
1862 token->type = OP_ALT;
1865 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1866 token->type = OP_ALT;
1869 token->type = OP_DUP_ASTERISK;
1872 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1873 token->type = OP_DUP_PLUS;
1876 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1877 token->type = OP_DUP_QUESTION;
1880 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1881 token->type = OP_OPEN_DUP_NUM;
1884 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1885 token->type = OP_CLOSE_DUP_NUM;
1888 if (syntax & REG_NO_BK_PARENS)
1889 token->type = OP_OPEN_SUBEXP;
1892 if (syntax & REG_NO_BK_PARENS)
1893 token->type = OP_CLOSE_SUBEXP;
1896 token->type = OP_OPEN_BRACKET;
1899 token->type = OP_PERIOD;
1902 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1903 re_string_cur_idx (input) != 0)
1905 char prev = re_string_peek_byte (input, -1);
1906 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1909 token->type = ANCHOR;
1910 token->opr.ctx_type = LINE_FIRST;
1913 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1914 re_string_cur_idx (input) + 1 != re_string_length (input))
1917 re_string_skip_bytes (input, 1);
1918 peek_token (&next, input, syntax);
1919 re_string_skip_bytes (input, -1);
1920 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1923 token->type = ANCHOR;
1924 token->opr.ctx_type = LINE_LAST;
1932 /* Peek a token from INPUT, and return the length of the token.
1933 We must not use this function out of bracket expressions. */
1936 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1939 if (re_string_eoi (input))
1941 token->type = END_OF_RE;
1944 c = re_string_peek_byte (input, 0);
1947 #ifdef RE_ENABLE_I18N
1948 if (input->mb_cur_max > 1 &&
1949 !re_string_first_byte (input, re_string_cur_idx (input)))
1951 token->type = CHARACTER;
1954 #endif /* RE_ENABLE_I18N */
1956 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1957 && re_string_cur_idx (input) + 1 < re_string_length (input))
1959 /* In this case, '\' escape a character. */
1961 re_string_skip_bytes (input, 1);
1962 c2 = re_string_peek_byte (input, 0);
1964 token->type = CHARACTER;
1967 if (c == '[') /* '[' is a special char in a bracket exps. */
1971 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1972 c2 = re_string_peek_byte (input, 1);
1980 token->type = OP_OPEN_COLL_ELEM;
1983 token->type = OP_OPEN_EQUIV_CLASS;
1986 if (syntax & REG_CHAR_CLASSES)
1988 token->type = OP_OPEN_CHAR_CLASS;
1991 /* else fall through. */
1993 token->type = CHARACTER;
2003 token->type = OP_CHARSET_RANGE;
2006 token->type = OP_CLOSE_BRACKET;
2009 token->type = OP_NON_MATCH_LIST;
2012 token->type = CHARACTER;
2017 /* Functions for parser. */
2019 /* Entry point of the parser.
2020 Parse the regular expression REGEXP and return the structure tree.
2021 If an error is occured, ERR is set by error code, and return NULL.
2022 This function build the following tree, from regular expression <reg_exp>:
2028 CAT means concatenation.
2029 EOR means end of regular expression. */
2032 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2035 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2036 bin_tree_t *tree, *eor, *root;
2037 re_token_t current_token;
2038 dfa->syntax = syntax;
2039 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2040 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2041 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2043 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2045 root = create_tree (dfa, tree, eor, CONCAT);
2048 if (BE (eor == NULL || root == NULL, 0))
2056 /* This function build the following tree, from regular expression
2057 <branch1>|<branch2>:
2063 ALT means alternative, which represents the operator `|'. */
2066 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2067 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2069 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2070 bin_tree_t *tree, *branch = NULL;
2071 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2072 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075 while (token->type == OP_ALT)
2077 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2078 if (token->type != OP_ALT && token->type != END_OF_RE
2079 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2081 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2082 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2087 tree = create_tree (dfa, tree, branch, OP_ALT);
2088 if (BE (tree == NULL, 0))
2097 /* This function build the following tree, from regular expression
2104 CAT means concatenation. */
2107 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2108 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2110 bin_tree_t *tree, *exp;
2111 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2112 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2113 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116 while (token->type != OP_ALT && token->type != END_OF_RE
2117 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2119 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2120 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2124 if (tree != NULL && exp != NULL)
2126 tree = create_tree (dfa, tree, exp, CONCAT);
2133 else if (tree == NULL)
2135 /* Otherwise exp == NULL, we don't need to create new tree. */
2140 /* This function build the following tree, from regular expression a*:
2147 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2148 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2152 switch (token->type)
2155 tree = create_token_tree (dfa, NULL, NULL, token);
2156 if (BE (tree == NULL, 0))
2161 #ifdef RE_ENABLE_I18N
2162 if (dfa->mb_cur_max > 1)
2164 while (!re_string_eoi (regexp)
2165 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2167 bin_tree_t *mbc_remain;
2168 fetch_token (token, regexp, syntax);
2169 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2170 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2171 if (BE (mbc_remain == NULL || tree == NULL, 0))
2180 case OP_OPEN_SUBEXP:
2181 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2182 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2185 case OP_OPEN_BRACKET:
2186 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2196 dfa->used_bkref_map |= 1 << token->opr.idx;
2197 tree = create_token_tree (dfa, NULL, NULL, token);
2198 if (BE (tree == NULL, 0))
2204 dfa->has_mb_node = 1;
2206 case OP_OPEN_DUP_NUM:
2207 if (syntax & REG_CONTEXT_INVALID_DUP)
2213 case OP_DUP_ASTERISK:
2215 case OP_DUP_QUESTION:
2216 if (syntax & REG_CONTEXT_INVALID_OPS)
2221 else if (syntax & REG_CONTEXT_INDEP_OPS)
2223 fetch_token (token, regexp, syntax);
2224 return parse_expression (regexp, preg, token, syntax, nest, err);
2226 /* else fall through */
2227 case OP_CLOSE_SUBEXP:
2228 if ((token->type == OP_CLOSE_SUBEXP) &&
2229 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2234 /* else fall through */
2235 case OP_CLOSE_DUP_NUM:
2236 /* We treat it as a normal character. */
2238 /* Then we can these characters as normal characters. */
2239 token->type = CHARACTER;
2240 /* mb_partial and word_char bits should be initialized already
2242 tree = create_token_tree (dfa, NULL, NULL, token);
2243 if (BE (tree == NULL, 0))
2250 if ((token->opr.ctx_type
2251 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2252 && dfa->word_ops_used == 0)
2253 init_word_char (dfa);
2254 if (token->opr.ctx_type == WORD_DELIM
2255 || token->opr.ctx_type == NOT_WORD_DELIM)
2257 bin_tree_t *tree_first, *tree_last;
2258 if (token->opr.ctx_type == WORD_DELIM)
2260 token->opr.ctx_type = WORD_FIRST;
2261 tree_first = create_token_tree (dfa, NULL, NULL, token);
2262 token->opr.ctx_type = WORD_LAST;
2266 token->opr.ctx_type = INSIDE_WORD;
2267 tree_first = create_token_tree (dfa, NULL, NULL, token);
2268 token->opr.ctx_type = INSIDE_NOTWORD;
2270 tree_last = create_token_tree (dfa, NULL, NULL, token);
2271 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2272 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2280 tree = create_token_tree (dfa, NULL, NULL, token);
2281 if (BE (tree == NULL, 0))
2287 /* We must return here, since ANCHORs can't be followed
2288 by repetition operators.
2289 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2290 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2291 fetch_token (token, regexp, syntax);
2294 tree = create_token_tree (dfa, NULL, NULL, token);
2295 if (BE (tree == NULL, 0))
2300 if (dfa->mb_cur_max > 1)
2301 dfa->has_mb_node = 1;
2305 tree = build_charclass_op (dfa, regexp->trans,
2306 (const unsigned char *) "alnum",
2307 (const unsigned char *) "_",
2308 token->type == OP_NOTWORD, err);
2309 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2314 tree = build_charclass_op (dfa, regexp->trans,
2315 (const unsigned char *) "space",
2316 (const unsigned char *) "",
2317 token->type == OP_NOTSPACE, err);
2318 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2328 /* Must not happen? */
2334 fetch_token (token, regexp, syntax);
2336 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2337 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2339 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2340 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2342 /* In BRE consecutive duplications are not allowed. */
2343 if ((syntax & REG_CONTEXT_INVALID_DUP)
2344 && (token->type == OP_DUP_ASTERISK
2345 || token->type == OP_OPEN_DUP_NUM))
2355 /* This function build the following tree, from regular expression
2363 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2364 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2366 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2369 cur_nsub = preg->re_nsub++;
2371 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2373 /* The subexpression may be a null string. */
2374 if (token->type == OP_CLOSE_SUBEXP)
2378 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2379 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2381 if (BE (*err != REG_NOERROR, 0))
2384 dfa->completed_bkref_map |= 1 << cur_nsub;
2386 tree = create_tree (dfa, tree, NULL, SUBEXP);
2387 if (BE (tree == NULL, 0))
2392 tree->token.opr.idx = cur_nsub;
2396 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2399 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2400 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2402 bin_tree_t *tree = NULL, *old_tree = NULL;
2403 int i, start, end, start_idx = re_string_cur_idx (regexp);
2404 re_token_t start_token = *token;
2406 if (token->type == OP_OPEN_DUP_NUM)
2409 start = fetch_number (regexp, token, syntax);
2412 if (token->type == CHARACTER && token->opr.c == ',')
2413 start = 0; /* We treat "{,m}" as "{0,m}". */
2416 *err = REG_BADBR; /* <re>{} is invalid. */
2420 if (BE (start != -2, 1))
2422 /* We treat "{n}" as "{n,n}". */
2423 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2424 : ((token->type == CHARACTER && token->opr.c == ',')
2425 ? fetch_number (regexp, token, syntax) : -2));
2427 if (BE (start == -2 || end == -2, 0))
2429 /* Invalid sequence. */
2430 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2432 if (token->type == END_OF_RE)
2440 /* If the syntax bit is set, rollback. */
2441 re_string_set_index (regexp, start_idx);
2442 *token = start_token;
2443 token->type = CHARACTER;
2444 /* mb_partial and word_char bits should be already initialized by
2449 if (BE (end != -1 && start > end, 0))
2451 /* First number greater than second. */
2458 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2459 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2462 fetch_token (token, regexp, syntax);
2464 if (BE (elem == NULL, 0))
2466 if (BE (start == 0 && end == 0, 0))
2468 postorder (elem, free_tree, NULL);
2472 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2473 if (BE (start > 0, 0))
2476 for (i = 2; i <= start; ++i)
2478 elem = duplicate_tree (elem, dfa);
2479 tree = create_tree (dfa, tree, elem, CONCAT);
2480 if (BE (elem == NULL || tree == NULL, 0))
2481 goto parse_dup_op_espace;
2487 /* Duplicate ELEM before it is marked optional. */
2488 elem = duplicate_tree (elem, dfa);
2494 if (elem->token.type == SUBEXP)
2495 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2497 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2498 if (BE (tree == NULL, 0))
2499 goto parse_dup_op_espace;
2501 /* This loop is actually executed only when end != -1,
2502 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2503 already created the start+1-th copy. */
2504 for (i = start + 2; i <= end; ++i)
2506 elem = duplicate_tree (elem, dfa);
2507 tree = create_tree (dfa, tree, elem, CONCAT);
2508 if (BE (elem == NULL || tree == NULL, 0))
2509 goto parse_dup_op_espace;
2511 tree = create_tree (dfa, tree, NULL, OP_ALT);
2512 if (BE (tree == NULL, 0))
2513 goto parse_dup_op_espace;
2517 tree = create_tree (dfa, old_tree, tree, CONCAT);
2521 parse_dup_op_espace:
2526 /* Size of the names for collating symbol/equivalence_class/character_class.
2527 I'm not sure, but maybe enough. */
2528 #define BRACKET_NAME_BUF_SIZE 32
2531 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2532 Build the range expression which starts from START_ELEM, and ends
2533 at END_ELEM. The result are written to MBCSET and SBCSET.
2534 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2535 mbcset->range_ends, is a pointer argument sinse we may
2538 static reg_errcode_t
2539 build_range_exp (re_bitset_ptr_t sbcset,
2540 # ifdef RE_ENABLE_I18N
2541 re_charset_t *mbcset, int *range_alloc,
2543 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2545 unsigned int start_ch, end_ch;
2546 /* Equivalence Classes and Character Classes can't be a range start/end. */
2547 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2548 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2552 /* We can handle no multi character collating elements without libc
2554 if (BE ((start_elem->type == COLL_SYM
2555 && strlen ((char *) start_elem->opr.name) > 1)
2556 || (end_elem->type == COLL_SYM
2557 && strlen ((char *) end_elem->opr.name) > 1), 0))
2558 return REG_ECOLLATE;
2560 # ifdef RE_ENABLE_I18N
2563 wint_t start_wc, end_wc;
2564 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2566 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2567 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2569 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2570 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2572 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2573 ? __btowc (start_ch) : start_elem->opr.wch);
2574 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2575 ? __btowc (end_ch) : end_elem->opr.wch);
2576 if (start_wc == WEOF || end_wc == WEOF)
2577 return REG_ECOLLATE;
2578 cmp_buf[0] = start_wc;
2579 cmp_buf[4] = end_wc;
2580 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2583 /* Got valid collation sequence values, add them as a new entry.
2584 However, for !_LIBC we have no collation elements: if the
2585 character set is single byte, the single byte character set
2586 that we build below suffices. parse_bracket_exp passes
2587 no MBCSET if dfa->mb_cur_max == 1. */
2590 /* Check the space of the arrays. */
2591 if (BE (*range_alloc == mbcset->nranges, 0))
2593 /* There is not enough space, need realloc. */
2594 wchar_t *new_array_start, *new_array_end;
2597 /* +1 in case of mbcset->nranges is 0. */
2598 new_nranges = 2 * mbcset->nranges + 1;
2599 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2600 are NULL if *range_alloc == 0. */
2601 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2603 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2606 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2609 mbcset->range_starts = new_array_start;
2610 mbcset->range_ends = new_array_end;
2611 *range_alloc = new_nranges;
2614 mbcset->range_starts[mbcset->nranges] = start_wc;
2615 mbcset->range_ends[mbcset->nranges++] = end_wc;
2618 /* Build the table for single byte characters. */
2619 for (wc = 0; wc < SBC_MAX; ++wc)
2622 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2623 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2624 bitset_set (sbcset, wc);
2627 # else /* not RE_ENABLE_I18N */
2630 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2631 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2633 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2634 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2636 if (start_ch > end_ch)
2638 /* Build the table for single byte characters. */
2639 for (ch = 0; ch < SBC_MAX; ++ch)
2640 if (start_ch <= ch && ch <= end_ch)
2641 bitset_set (sbcset, ch);
2643 # endif /* not RE_ENABLE_I18N */
2646 #endif /* not _LIBC */
2649 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2650 Build the collating element which is represented by NAME.
2651 The result are written to MBCSET and SBCSET.
2652 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2653 pointer argument since we may update it. */
2655 static reg_errcode_t
2656 build_collating_symbol (re_bitset_ptr_t sbcset,
2657 # ifdef RE_ENABLE_I18N
2658 re_charset_t *mbcset, int *coll_sym_alloc,
2660 const unsigned char *name)
2662 size_t name_len = strlen ((const char *) name);
2663 if (BE (name_len != 1, 0))
2664 return REG_ECOLLATE;
2667 bitset_set (sbcset, name[0]);
2671 #endif /* not _LIBC */
2673 /* This function parse bracket expression like "[abc]", "[a-c]",
2677 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2678 reg_syntax_t syntax, reg_errcode_t *err)
2681 const unsigned char *collseqmb;
2682 const char *collseqwc;
2685 const int32_t *symb_table;
2686 const unsigned char *extra;
2688 /* Local function for parse_bracket_exp used in _LIBC environement.
2689 Seek the collating symbol entry correspondings to NAME.
2690 Return the index of the symbol in the SYMB_TABLE. */
2693 __attribute ((always_inline))
2694 seek_collating_symbol_entry (name, name_len)
2695 const unsigned char *name;
2698 int32_t hash = elem_hash ((const char *) name, name_len);
2699 int32_t elem = hash % table_size;
2700 int32_t second = hash % (table_size - 2);
2701 while (symb_table[2 * elem] != 0)
2703 /* First compare the hashing value. */
2704 if (symb_table[2 * elem] == hash
2705 /* Compare the length of the name. */
2706 && name_len == extra[symb_table[2 * elem + 1]]
2707 /* Compare the name. */
2708 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2711 /* Yep, this is the entry. */
2721 /* Local function for parse_bracket_exp used in _LIBC environement.
2722 Look up the collation sequence value of BR_ELEM.
2723 Return the value if succeeded, UINT_MAX otherwise. */
2725 auto inline unsigned int
2726 __attribute ((always_inline))
2727 lookup_collation_sequence_value (br_elem)
2728 bracket_elem_t *br_elem;
2730 if (br_elem->type == SB_CHAR)
2733 if (MB_CUR_MAX == 1)
2736 return collseqmb[br_elem->opr.ch];
2739 wint_t wc = __btowc (br_elem->opr.ch);
2740 return __collseq_table_lookup (collseqwc, wc);
2743 else if (br_elem->type == MB_CHAR)
2745 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2747 else if (br_elem->type == COLL_SYM)
2749 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2753 elem = seek_collating_symbol_entry (br_elem->opr.name,
2755 if (symb_table[2 * elem] != 0)
2757 /* We found the entry. */
2758 idx = symb_table[2 * elem + 1];
2759 /* Skip the name of collating element name. */
2760 idx += 1 + extra[idx];
2761 /* Skip the byte sequence of the collating element. */
2762 idx += 1 + extra[idx];
2763 /* Adjust for the alignment. */
2764 idx = (idx + 3) & ~3;
2765 /* Skip the multibyte collation sequence value. */
2766 idx += sizeof (unsigned int);
2767 /* Skip the wide char sequence of the collating element. */
2768 idx += sizeof (unsigned int) *
2769 (1 + *(unsigned int *) (extra + idx));
2770 /* Return the collation sequence value. */
2771 return *(unsigned int *) (extra + idx);
2773 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2775 /* No valid character. Match it as a single byte
2777 return collseqmb[br_elem->opr.name[0]];
2780 else if (sym_name_len == 1)
2781 return collseqmb[br_elem->opr.name[0]];
2786 /* Local function for parse_bracket_exp used in _LIBC environement.
2787 Build the range expression which starts from START_ELEM, and ends
2788 at END_ELEM. The result are written to MBCSET and SBCSET.
2789 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2790 mbcset->range_ends, is a pointer argument sinse we may
2793 auto inline reg_errcode_t
2794 __attribute ((always_inline))
2795 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2796 re_charset_t *mbcset;
2798 re_bitset_ptr_t sbcset;
2799 bracket_elem_t *start_elem, *end_elem;
2802 uint32_t start_collseq;
2803 uint32_t end_collseq;
2805 /* Equivalence Classes and Character Classes can't be a range
2807 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2808 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2812 start_collseq = lookup_collation_sequence_value (start_elem);
2813 end_collseq = lookup_collation_sequence_value (end_elem);
2814 /* Check start/end collation sequence values. */
2815 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2816 return REG_ECOLLATE;
2817 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2820 /* Got valid collation sequence values, add them as a new entry.
2821 However, if we have no collation elements, and the character set
2822 is single byte, the single byte character set that we
2823 build below suffices. */
2824 if (nrules > 0 || dfa->mb_cur_max > 1)
2826 /* Check the space of the arrays. */
2827 if (BE (*range_alloc == mbcset->nranges, 0))
2829 /* There is not enough space, need realloc. */
2830 uint32_t *new_array_start;
2831 uint32_t *new_array_end;
2834 /* +1 in case of mbcset->nranges is 0. */
2835 new_nranges = 2 * mbcset->nranges + 1;
2836 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2838 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2841 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2844 mbcset->range_starts = new_array_start;
2845 mbcset->range_ends = new_array_end;
2846 *range_alloc = new_nranges;
2849 mbcset->range_starts[mbcset->nranges] = start_collseq;
2850 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2853 /* Build the table for single byte characters. */
2854 for (ch = 0; ch < SBC_MAX; ch++)
2856 uint32_t ch_collseq;
2858 if (MB_CUR_MAX == 1)
2861 ch_collseq = collseqmb[ch];
2863 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2864 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2865 bitset_set (sbcset, ch);
2870 /* Local function for parse_bracket_exp used in _LIBC environement.
2871 Build the collating element which is represented by NAME.
2872 The result are written to MBCSET and SBCSET.
2873 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2874 pointer argument sinse we may update it. */
2876 auto inline reg_errcode_t
2877 __attribute ((always_inline))
2878 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2879 re_charset_t *mbcset;
2880 int *coll_sym_alloc;
2881 re_bitset_ptr_t sbcset;
2882 const unsigned char *name;
2885 size_t name_len = strlen ((const char *) name);
2888 elem = seek_collating_symbol_entry (name, name_len);
2889 if (symb_table[2 * elem] != 0)
2891 /* We found the entry. */
2892 idx = symb_table[2 * elem + 1];
2893 /* Skip the name of collating element name. */
2894 idx += 1 + extra[idx];
2896 else if (symb_table[2 * elem] == 0 && name_len == 1)
2898 /* No valid character, treat it as a normal
2900 bitset_set (sbcset, name[0]);
2904 return REG_ECOLLATE;
2906 /* Got valid collation sequence, add it as a new entry. */
2907 /* Check the space of the arrays. */
2908 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2910 /* Not enough, realloc it. */
2911 /* +1 in case of mbcset->ncoll_syms is 0. */
2912 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2913 /* Use realloc since mbcset->coll_syms is NULL
2915 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2916 new_coll_sym_alloc);
2917 if (BE (new_coll_syms == NULL, 0))
2919 mbcset->coll_syms = new_coll_syms;
2920 *coll_sym_alloc = new_coll_sym_alloc;
2922 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2927 if (BE (name_len != 1, 0))
2928 return REG_ECOLLATE;
2931 bitset_set (sbcset, name[0]);
2938 re_token_t br_token;
2939 re_bitset_ptr_t sbcset;
2940 #ifdef RE_ENABLE_I18N
2941 re_charset_t *mbcset;
2942 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2943 int equiv_class_alloc = 0, char_class_alloc = 0;
2944 #endif /* not RE_ENABLE_I18N */
2946 bin_tree_t *work_tree;
2948 int first_round = 1;
2950 collseqmb = (const unsigned char *)
2951 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2952 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2958 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2959 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2960 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2961 _NL_COLLATE_SYMB_TABLEMB);
2962 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2963 _NL_COLLATE_SYMB_EXTRAMB);
2966 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2967 #ifdef RE_ENABLE_I18N
2968 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2969 #endif /* RE_ENABLE_I18N */
2970 #ifdef RE_ENABLE_I18N
2971 if (BE (sbcset == NULL || mbcset == NULL, 0))
2973 if (BE (sbcset == NULL, 0))
2974 #endif /* RE_ENABLE_I18N */
2980 token_len = peek_token_bracket (token, regexp, syntax);
2981 if (BE (token->type == END_OF_RE, 0))
2984 goto parse_bracket_exp_free_return;
2986 if (token->type == OP_NON_MATCH_LIST)
2988 #ifdef RE_ENABLE_I18N
2989 mbcset->non_match = 1;
2990 #endif /* not RE_ENABLE_I18N */
2992 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2993 bitset_set (sbcset, '\0');
2994 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2995 token_len = peek_token_bracket (token, regexp, syntax);
2996 if (BE (token->type == END_OF_RE, 0))
2999 goto parse_bracket_exp_free_return;
3003 /* We treat the first ']' as a normal character. */
3004 if (token->type == OP_CLOSE_BRACKET)
3005 token->type = CHARACTER;
3009 bracket_elem_t start_elem, end_elem;
3010 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3011 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3013 int token_len2 = 0, is_range_exp = 0;
3016 start_elem.opr.name = start_name_buf;
3017 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3018 syntax, first_round);
3019 if (BE (ret != REG_NOERROR, 0))
3022 goto parse_bracket_exp_free_return;
3026 /* Get information about the next token. We need it in any case. */
3027 token_len = peek_token_bracket (token, regexp, syntax);
3029 /* Do not check for ranges if we know they are not allowed. */
3030 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3032 if (BE (token->type == END_OF_RE, 0))
3035 goto parse_bracket_exp_free_return;
3037 if (token->type == OP_CHARSET_RANGE)
3039 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3040 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3041 if (BE (token2.type == END_OF_RE, 0))
3044 goto parse_bracket_exp_free_return;
3046 if (token2.type == OP_CLOSE_BRACKET)
3048 /* We treat the last '-' as a normal character. */
3049 re_string_skip_bytes (regexp, -token_len);
3050 token->type = CHARACTER;
3057 if (is_range_exp == 1)
3059 end_elem.opr.name = end_name_buf;
3060 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3062 if (BE (ret != REG_NOERROR, 0))
3065 goto parse_bracket_exp_free_return;
3068 token_len = peek_token_bracket (token, regexp, syntax);
3071 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3072 &start_elem, &end_elem);
3074 # ifdef RE_ENABLE_I18N
3075 *err = build_range_exp (sbcset,
3076 dfa->mb_cur_max > 1 ? mbcset : NULL,
3077 &range_alloc, &start_elem, &end_elem);
3079 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3081 #endif /* RE_ENABLE_I18N */
3082 if (BE (*err != REG_NOERROR, 0))
3083 goto parse_bracket_exp_free_return;
3087 switch (start_elem.type)
3090 bitset_set (sbcset, start_elem.opr.ch);
3092 #ifdef RE_ENABLE_I18N
3094 /* Check whether the array has enough space. */
3095 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3097 wchar_t *new_mbchars;
3098 /* Not enough, realloc it. */
3099 /* +1 in case of mbcset->nmbchars is 0. */
3100 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3101 /* Use realloc since array is NULL if *alloc == 0. */
3102 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3104 if (BE (new_mbchars == NULL, 0))
3105 goto parse_bracket_exp_espace;
3106 mbcset->mbchars = new_mbchars;
3108 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3110 #endif /* RE_ENABLE_I18N */
3112 *err = build_equiv_class (sbcset,
3113 #ifdef RE_ENABLE_I18N
3114 mbcset, &equiv_class_alloc,
3115 #endif /* RE_ENABLE_I18N */
3116 start_elem.opr.name);
3117 if (BE (*err != REG_NOERROR, 0))
3118 goto parse_bracket_exp_free_return;
3121 *err = build_collating_symbol (sbcset,
3122 #ifdef RE_ENABLE_I18N
3123 mbcset, &coll_sym_alloc,
3124 #endif /* RE_ENABLE_I18N */
3125 start_elem.opr.name);
3126 if (BE (*err != REG_NOERROR, 0))
3127 goto parse_bracket_exp_free_return;
3130 *err = build_charclass (regexp->trans, sbcset,
3131 #ifdef RE_ENABLE_I18N
3132 mbcset, &char_class_alloc,
3133 #endif /* RE_ENABLE_I18N */
3134 start_elem.opr.name, syntax);
3135 if (BE (*err != REG_NOERROR, 0))
3136 goto parse_bracket_exp_free_return;
3143 if (BE (token->type == END_OF_RE, 0))
3146 goto parse_bracket_exp_free_return;
3148 if (token->type == OP_CLOSE_BRACKET)
3152 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3154 /* If it is non-matching list. */
3156 bitset_not (sbcset);
3158 #ifdef RE_ENABLE_I18N
3159 /* Ensure only single byte characters are set. */
3160 if (dfa->mb_cur_max > 1)
3161 bitset_mask (sbcset, dfa->sb_char);
3163 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3164 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3165 || mbcset->non_match)))
3167 bin_tree_t *mbc_tree;
3169 /* Build a tree for complex bracket. */
3170 dfa->has_mb_node = 1;
3171 br_token.type = COMPLEX_BRACKET;
3172 br_token.opr.mbcset = mbcset;
3173 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3174 if (BE (mbc_tree == NULL, 0))
3175 goto parse_bracket_exp_espace;
3176 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3177 if (sbcset[sbc_idx])
3179 /* If there are no bits set in sbcset, there is no point
3180 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3181 if (sbc_idx < BITSET_UINTS)
3183 /* Build a tree for simple bracket. */
3184 br_token.type = SIMPLE_BRACKET;
3185 br_token.opr.sbcset = sbcset;
3186 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3187 if (BE (work_tree == NULL, 0))
3188 goto parse_bracket_exp_espace;
3190 /* Then join them by ALT node. */
3191 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3192 if (BE (work_tree == NULL, 0))
3193 goto parse_bracket_exp_espace;
3198 work_tree = mbc_tree;
3202 #endif /* not RE_ENABLE_I18N */
3204 #ifdef RE_ENABLE_I18N
3205 free_charset (mbcset);
3207 /* Build a tree for simple bracket. */
3208 br_token.type = SIMPLE_BRACKET;
3209 br_token.opr.sbcset = sbcset;
3210 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3211 if (BE (work_tree == NULL, 0))
3212 goto parse_bracket_exp_espace;
3216 parse_bracket_exp_espace:
3218 parse_bracket_exp_free_return:
3220 #ifdef RE_ENABLE_I18N
3221 free_charset (mbcset);
3222 #endif /* RE_ENABLE_I18N */
3226 /* Parse an element in the bracket expression. */
3228 static reg_errcode_t
3229 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3230 re_token_t *token, int token_len, re_dfa_t *dfa,
3231 reg_syntax_t syntax, int accept_hyphen)
3233 #ifdef RE_ENABLE_I18N
3235 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3236 if (cur_char_size > 1)
3238 elem->type = MB_CHAR;
3239 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3240 re_string_skip_bytes (regexp, cur_char_size);
3243 #endif /* RE_ENABLE_I18N */
3244 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3245 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3246 || token->type == OP_OPEN_EQUIV_CLASS)
3247 return parse_bracket_symbol (elem, regexp, token);
3248 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3250 /* A '-' must only appear as anything but a range indicator before
3251 the closing bracket. Everything else is an error. */
3253 (void) peek_token_bracket (&token2, regexp, syntax);
3254 if (token2.type != OP_CLOSE_BRACKET)
3255 /* The actual error value is not standardized since this whole
3256 case is undefined. But ERANGE makes good sense. */
3259 elem->type = SB_CHAR;
3260 elem->opr.ch = token->opr.c;
3264 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3265 such as [:<character_class>:], [.<collating_element>.], and
3266 [=<equivalent_class>=]. */
3268 static reg_errcode_t
3269 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3272 unsigned char ch, delim = token->opr.c;
3274 if (re_string_eoi(regexp))
3278 if (i >= BRACKET_NAME_BUF_SIZE)
3280 if (token->type == OP_OPEN_CHAR_CLASS)
3281 ch = re_string_fetch_byte_case (regexp);
3283 ch = re_string_fetch_byte (regexp);
3284 if (re_string_eoi(regexp))
3286 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3288 elem->opr.name[i] = ch;
3290 re_string_skip_bytes (regexp, 1);
3291 elem->opr.name[i] = '\0';
3292 switch (token->type)
3294 case OP_OPEN_COLL_ELEM:
3295 elem->type = COLL_SYM;
3297 case OP_OPEN_EQUIV_CLASS:
3298 elem->type = EQUIV_CLASS;
3300 case OP_OPEN_CHAR_CLASS:
3301 elem->type = CHAR_CLASS;
3309 /* Helper function for parse_bracket_exp.
3310 Build the equivalence class which is represented by NAME.
3311 The result are written to MBCSET and SBCSET.
3312 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3313 is a pointer argument sinse we may update it. */
3315 static reg_errcode_t
3316 build_equiv_class (re_bitset_ptr_t sbcset,
3317 #ifdef RE_ENABLE_I18N
3318 re_charset_t *mbcset, int *equiv_class_alloc,
3320 const unsigned char *name)
3323 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3326 const int32_t *table, *indirect;
3327 const unsigned char *weights, *extra, *cp;
3328 unsigned char char_buf[2];
3332 /* This #include defines a local function! */
3333 # include <locale/weight.h>
3334 /* Calculate the index for equivalence class. */
3336 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3337 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3338 _NL_COLLATE_WEIGHTMB);
3339 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3340 _NL_COLLATE_EXTRAMB);
3341 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3342 _NL_COLLATE_INDIRECTMB);
3343 idx1 = findidx (&cp);
3344 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3345 /* This isn't a valid character. */
3346 return REG_ECOLLATE;
3348 /* Build single byte matcing table for this equivalence class. */
3349 char_buf[1] = (unsigned char) '\0';
3350 len = weights[idx1];
3351 for (ch = 0; ch < SBC_MAX; ++ch)
3355 idx2 = findidx (&cp);
3360 /* This isn't a valid character. */
3362 if (len == weights[idx2])
3365 while (cnt <= len &&
3366 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3370 bitset_set (sbcset, ch);
3373 /* Check whether the array has enough space. */
3374 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3376 /* Not enough, realloc it. */
3377 /* +1 in case of mbcset->nequiv_classes is 0. */
3378 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3379 /* Use realloc since the array is NULL if *alloc == 0. */
3380 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3382 new_equiv_class_alloc);
3383 if (BE (new_equiv_classes == NULL, 0))
3385 mbcset->equiv_classes = new_equiv_classes;
3386 *equiv_class_alloc = new_equiv_class_alloc;
3388 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3393 if (BE (strlen ((const char *) name) != 1, 0))
3394 return REG_ECOLLATE;
3395 bitset_set (sbcset, *name);
3400 /* Helper function for parse_bracket_exp.
3401 Build the character class which is represented by NAME.
3402 The result are written to MBCSET and SBCSET.
3403 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3404 is a pointer argument sinse we may update it. */
3406 static reg_errcode_t
3407 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3408 #ifdef RE_ENABLE_I18N
3409 re_charset_t *mbcset, int *char_class_alloc,
3411 const unsigned char *class_name, reg_syntax_t syntax)
3414 const char *name = (const char *) class_name;
3416 /* In case of REG_ICASE "upper" and "lower" match the both of
3417 upper and lower cases. */
3418 if ((syntax & REG_IGNORE_CASE)
3419 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3422 #ifdef RE_ENABLE_I18N
3423 /* Check the space of the arrays. */
3424 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3426 /* Not enough, realloc it. */
3427 /* +1 in case of mbcset->nchar_classes is 0. */
3428 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3429 /* Use realloc since array is NULL if *alloc == 0. */
3430 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3431 new_char_class_alloc);
3432 if (BE (new_char_classes == NULL, 0))
3434 mbcset->char_classes = new_char_classes;
3435 *char_class_alloc = new_char_class_alloc;
3437 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3438 #endif /* RE_ENABLE_I18N */
3440 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3441 for (i = 0; i < SBC_MAX; ++i) \
3443 if (ctype_func (i)) \
3445 int ch = trans ? trans[i] : i; \
3446 bitset_set (sbcset, ch); \
3450 if (strcmp (name, "alnum") == 0)
3451 BUILD_CHARCLASS_LOOP (isalnum)
3452 else if (strcmp (name, "cntrl") == 0)
3453 BUILD_CHARCLASS_LOOP (iscntrl)
3454 else if (strcmp (name, "lower") == 0)
3455 BUILD_CHARCLASS_LOOP (islower)
3456 else if (strcmp (name, "space") == 0)
3457 BUILD_CHARCLASS_LOOP (isspace)
3458 else if (strcmp (name, "alpha") == 0)
3459 BUILD_CHARCLASS_LOOP (isalpha)
3460 else if (strcmp (name, "digit") == 0)
3461 BUILD_CHARCLASS_LOOP (isdigit)
3462 else if (strcmp (name, "print") == 0)
3463 BUILD_CHARCLASS_LOOP (isprint)
3464 else if (strcmp (name, "upper") == 0)
3465 BUILD_CHARCLASS_LOOP (isupper)
3466 else if (strcmp (name, "blank") == 0)
3467 BUILD_CHARCLASS_LOOP (isblank)
3468 else if (strcmp (name, "graph") == 0)
3469 BUILD_CHARCLASS_LOOP (isgraph)
3470 else if (strcmp (name, "punct") == 0)
3471 BUILD_CHARCLASS_LOOP (ispunct)
3472 else if (strcmp (name, "xdigit") == 0)
3473 BUILD_CHARCLASS_LOOP (isxdigit)
3481 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3482 const unsigned char *class_name,
3483 const unsigned char *extra,
3484 int non_match, reg_errcode_t *err)
3486 re_bitset_ptr_t sbcset;
3487 #ifdef RE_ENABLE_I18N
3488 re_charset_t *mbcset;
3490 #endif /* not RE_ENABLE_I18N */
3492 re_token_t br_token;
3495 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3496 #ifdef RE_ENABLE_I18N
3497 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3498 #endif /* RE_ENABLE_I18N */
3500 #ifdef RE_ENABLE_I18N
3501 if (BE (sbcset == NULL || mbcset == NULL, 0))
3502 #else /* not RE_ENABLE_I18N */
3503 if (BE (sbcset == NULL, 0))
3504 #endif /* not RE_ENABLE_I18N */
3512 #ifdef RE_ENABLE_I18N
3514 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3515 bitset_set(cset->sbcset, '\0');
3517 mbcset->non_match = 1;
3518 #endif /* not RE_ENABLE_I18N */
3521 /* We don't care the syntax in this case. */
3522 ret = build_charclass (trans, sbcset,
3523 #ifdef RE_ENABLE_I18N
3525 #endif /* RE_ENABLE_I18N */
3528 if (BE (ret != REG_NOERROR, 0))
3531 #ifdef RE_ENABLE_I18N
3532 free_charset (mbcset);
3533 #endif /* RE_ENABLE_I18N */
3537 /* \w match '_' also. */
3538 for (; *extra; extra++)
3539 bitset_set (sbcset, *extra);
3541 /* If it is non-matching list. */
3543 bitset_not (sbcset);
3545 #ifdef RE_ENABLE_I18N
3546 /* Ensure only single byte characters are set. */
3547 if (dfa->mb_cur_max > 1)
3548 bitset_mask (sbcset, dfa->sb_char);
3551 /* Build a tree for simple bracket. */
3552 br_token.type = SIMPLE_BRACKET;
3553 br_token.opr.sbcset = sbcset;
3554 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3555 if (BE (tree == NULL, 0))
3556 goto build_word_op_espace;
3558 #ifdef RE_ENABLE_I18N
3559 if (dfa->mb_cur_max > 1)
3561 bin_tree_t *mbc_tree;
3562 /* Build a tree for complex bracket. */
3563 br_token.type = COMPLEX_BRACKET;
3564 br_token.opr.mbcset = mbcset;
3565 dfa->has_mb_node = 1;
3566 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3567 if (BE (mbc_tree == NULL, 0))
3568 goto build_word_op_espace;
3569 /* Then join them by ALT node. */
3570 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3571 if (BE (mbc_tree != NULL, 1))
3576 free_charset (mbcset);
3579 #else /* not RE_ENABLE_I18N */
3581 #endif /* not RE_ENABLE_I18N */
3583 build_word_op_espace:
3585 #ifdef RE_ENABLE_I18N
3586 free_charset (mbcset);
3587 #endif /* RE_ENABLE_I18N */
3592 /* This is intended for the expressions like "a{1,3}".
3593 Fetch a number from `input', and return the number.
3594 Return -1, if the number field is empty like "{,1}".
3595 Return -2, If an error is occured. */
3598 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3604 fetch_token (token, input, syntax);
3606 if (BE (token->type == END_OF_RE, 0))
3608 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3610 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3611 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3612 num = (num > REG_DUP_MAX) ? -2 : num;
3617 #ifdef RE_ENABLE_I18N
3619 free_charset (re_charset_t *cset)
3621 re_free (cset->mbchars);
3623 re_free (cset->coll_syms);
3624 re_free (cset->equiv_classes);
3625 re_free (cset->range_starts);
3626 re_free (cset->range_ends);
3628 re_free (cset->char_classes);
3631 #endif /* RE_ENABLE_I18N */
3633 /* Functions for binary tree operation. */
3635 /* Create a tree node. */
3638 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3639 re_token_type_t type)
3643 return create_token_tree (dfa, left, right, &t);
3647 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3648 const re_token_t *token)
3651 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3653 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3655 if (storage == NULL)
3657 storage->next = dfa->str_tree_storage;
3658 dfa->str_tree_storage = storage;
3659 dfa->str_tree_storage_idx = 0;
3661 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3663 tree->parent = NULL;
3665 tree->right = right;
3666 tree->token = *token;
3667 tree->token.duplicated = 0;
3668 tree->token.opt_subexp = 0;
3671 tree->node_idx = -1;
3674 left->parent = tree;
3676 right->parent = tree;
3680 /* Mark the tree SRC as an optional subexpression.
3681 To be called from preorder or postorder. */
3683 static reg_errcode_t
3684 mark_opt_subexp (void *extra, bin_tree_t *node)
3686 int idx = (int) (long) extra;
3687 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3688 node->token.opt_subexp = 1;
3693 /* Free the allocated memory inside NODE. */
3696 free_token (re_token_t *node)
3698 #ifdef RE_ENABLE_I18N
3699 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3700 free_charset (node->opr.mbcset);
3702 #endif /* RE_ENABLE_I18N */
3703 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3704 re_free (node->opr.sbcset);
3707 /* Worker function for tree walking. Free the allocated memory inside NODE
3708 and its children. */
3710 static reg_errcode_t
3711 free_tree (void *extra, bin_tree_t *node)
3713 free_token (&node->token);
3718 /* Duplicate the node SRC, and return new node. This is a preorder
3719 visit similar to the one implemented by the generic visitor, but
3720 we need more infrastructure to maintain two parallel trees --- so,
3721 it's easier to duplicate. */
3724 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3726 const bin_tree_t *node;
3727 bin_tree_t *dup_root;
3728 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3730 for (node = root; ; )
3732 /* Create a new tree and link it back to the current parent. */
3733 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3736 (*p_new)->parent = dup_node;
3737 (*p_new)->token.duplicated = 1;
3740 /* Go to the left node, or up and to the right. */
3744 p_new = &dup_node->left;
3748 const bin_tree_t *prev = NULL;
3749 while (node->right == prev || node->right == NULL)
3752 node = node->parent;
3753 dup_node = dup_node->parent;
3758 p_new = &dup_node->right;