1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 int node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, 0, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
642 re_comp (const char *s)
649 if (!re_comp_buf.re_buffer)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf.re_buffer)
656 fastmap = re_comp_buf.re_fastmap;
657 re_comp_buf.re_fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.re_fastmap = fastmap;
663 if (re_comp_buf.re_fastmap == NULL)
665 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
666 if (re_comp_buf.re_fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since `re_exec' always passes NULL for the `regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.re_newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
682 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t *preg, const char * pattern, int length,
703 reg_errcode_t err = REG_NOERROR;
707 /* Initialize the pattern buffer. */
708 preg->re_fastmap_accurate = 0;
709 preg->re_syntax = syntax;
710 preg->re_not_bol = preg->re_not_eol = 0;
713 preg->re_can_be_null = 0;
714 preg->re_regs_allocated = REG_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->re_buffer;
718 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If buffer is null this
723 is a simple allocation. */
724 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
727 preg->re_allocated = sizeof (re_dfa_t);
728 preg->re_buffer = (unsigned char *) dfa;
730 preg->re_used = sizeof (re_dfa_t);
732 __libc_lock_init (dfa->lock);
734 err = init_dfa (dfa, length);
735 if (BE (err != REG_NOERROR, 0))
737 free_dfa_content (dfa);
738 preg->re_buffer = NULL;
739 preg->re_allocated = 0;
743 dfa->re_str = re_malloc (char, length + 1);
744 strncpy (dfa->re_str, pattern, length + 1);
747 err = re_string_construct (®exp, pattern, length, preg->re_translate,
748 syntax & REG_IGNORE_CASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->re_buffer = NULL;
756 preg->re_allocated = 0;
760 /* Parse the regular expression, and build a structure tree. */
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->re_buffer = NULL;
788 preg->re_allocated = 0;
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
798 init_dfa (re_dfa_t *dfa, int pat_len)
800 unsigned int table_size;
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; ; table_size <<= 1)
815 if (table_size > pat_len)
818 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->mb_cur_max = MB_CUR_MAX;
823 if (dfa->mb_cur_max == 6
824 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
826 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name = nl_langinfo (CODESET);
832 codeset_name = getenv ("LC_ALL");
833 if (codeset_name == NULL || codeset_name[0] == '\0')
834 codeset_name = getenv ("LC_CTYPE");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LANG");
837 if (codeset_name == NULL)
839 else if (strchr (codeset_name, '.') != NULL)
840 codeset_name = strchr (codeset_name, '.') + 1;
843 if (strcasecmp (codeset_name, "UTF-8") == 0
844 || strcasecmp (codeset_name, "UTF8") == 0)
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa->map_notascii = 0;
852 #ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
856 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
862 if (BE (dfa->sb_char == NULL, 0))
865 /* Clear all bits by, then set those corresponding to single
867 bitset_empty (dfa->sb_char);
869 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
870 for (j = 0; j < UINT_BITS; ++j, ++ch)
872 wint_t wch = __btowc (ch);
874 dfa->sb_char[i] |= 1u << j;
876 if (isascii (ch) && wch != ch)
877 dfa->map_notascii = 1;
884 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
894 init_word_char (re_dfa_t *dfa)
897 dfa->word_ops_used = 1;
898 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
899 for (j = 0; j < UINT_BITS; ++j, ++ch)
900 if (isalnum (ch) || ch == '_')
901 dfa->word_char[i] |= 1u << j;
904 /* Free the work area which are only used while compiling. */
907 free_workarea_compile (regex_t *preg)
909 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
910 bin_tree_storage_t *storage, *next;
911 for (storage = dfa->str_tree_storage; storage; storage = next)
913 next = storage->next;
916 dfa->str_tree_storage = NULL;
917 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
918 dfa->str_tree = NULL;
919 re_free (dfa->org_indices);
920 dfa->org_indices = NULL;
923 /* Create initial states for all contexts. */
926 create_initial_state (re_dfa_t *dfa)
930 re_node_set init_nodes;
932 /* Initial states have the epsilon closure of the node which is
933 the first node of the regular expression. */
934 first = dfa->str_tree->first->node_idx;
935 dfa->init_node = first;
936 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
937 if (BE (err != REG_NOERROR, 0))
940 /* The back-references which are in initial states can epsilon transit,
941 since in this case all of the subexpressions can be null.
942 Then we add epsilon closures of the nodes which are the next nodes of
943 the back-references. */
944 if (dfa->nbackref > 0)
945 for (i = 0; i < init_nodes.nelem; ++i)
947 int node_idx = init_nodes.elems[i];
948 re_token_type_t type = dfa->nodes[node_idx].type;
951 if (type != OP_BACK_REF)
953 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
955 re_token_t *clexp_node;
956 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
957 if (clexp_node->type == OP_CLOSE_SUBEXP
958 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 if (clexp_idx == init_nodes.nelem)
964 if (type == OP_BACK_REF)
966 int dest_idx = dfa->edests[node_idx].elems[0];
967 if (!re_node_set_contains (&init_nodes, dest_idx))
969 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
975 /* It must be the first time to invoke acquire_state. */
976 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
977 /* We don't check ERR here, since the initial state must not be NULL. */
978 if (BE (dfa->init_state == NULL, 0))
980 if (dfa->init_state->has_constraint)
982 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
984 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
986 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
990 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
991 || dfa->init_state_begbuf == NULL, 0))
995 dfa->init_state_word = dfa->init_state_nl
996 = dfa->init_state_begbuf = dfa->init_state;
998 re_node_set_free (&init_nodes);
1002 #ifdef RE_ENABLE_I18N
1003 /* If it is possible to do searching in single byte encoding instead of UTF-8
1004 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1005 DFA nodes where needed. */
1008 optimize_utf8 (re_dfa_t *dfa)
1010 int node, i, mb_chars = 0, has_period = 0;
1012 for (node = 0; node < dfa->nodes_len; ++node)
1013 switch (dfa->nodes[node].type)
1016 if (dfa->nodes[node].opr.c >= 0x80)
1020 switch (dfa->nodes[node].opr.idx)
1028 /* Word anchors etc. cannot be handled. */
1038 case OP_DUP_ASTERISK:
1039 case OP_OPEN_SUBEXP:
1040 case OP_CLOSE_SUBEXP:
1042 case COMPLEX_BRACKET:
1044 case SIMPLE_BRACKET:
1045 /* Just double check. */
1046 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1047 if (dfa->nodes[node].opr.sbcset[i])
1054 if (mb_chars || has_period)
1055 for (node = 0; node < dfa->nodes_len; ++node)
1057 if (dfa->nodes[node].type == CHARACTER
1058 && dfa->nodes[node].opr.c >= 0x80)
1059 dfa->nodes[node].mb_partial = 0;
1060 else if (dfa->nodes[node].type == OP_PERIOD)
1061 dfa->nodes[node].type = OP_UTF8_PERIOD;
1064 /* The search can be in single byte locale. */
1065 dfa->mb_cur_max = 1;
1067 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1071 /* Analyze the structure tree, and calculate "first", "next", "edest",
1072 "eclosure", and "inveclosure". */
1074 static reg_errcode_t
1075 analyze (regex_t *preg)
1077 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1080 /* Allocate arrays. */
1081 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1082 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1083 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1084 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1085 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1086 || dfa->eclosures == NULL, 0))
1089 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1090 if (dfa->subexp_map != NULL)
1093 for (i = 0; i < preg->re_nsub; i++)
1094 dfa->subexp_map[i] = i;
1095 preorder (dfa->str_tree, optimize_subexps, dfa);
1096 for (i = 0; i < preg->re_nsub; i++)
1097 if (dfa->subexp_map[i] != i)
1099 if (i == preg->re_nsub)
1101 free (dfa->subexp_map);
1102 dfa->subexp_map = NULL;
1106 ret = postorder (dfa->str_tree, lower_subexps, preg);
1107 if (BE (ret != REG_NOERROR, 0))
1109 ret = postorder (dfa->str_tree, calc_first, dfa);
1110 if (BE (ret != REG_NOERROR, 0))
1112 preorder (dfa->str_tree, calc_next, dfa);
1113 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1114 if (BE (ret != REG_NOERROR, 0))
1116 ret = calc_eclosure (dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1120 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1121 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1122 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1125 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1126 if (BE (dfa->inveclosures == NULL, 0))
1128 ret = calc_inveclosure (dfa);
1134 /* Our parse trees are very unbalanced, so we cannot use a stack to
1135 implement parse tree visits. Instead, we use parent pointers and
1136 some hairy code in these two functions. */
1137 static reg_errcode_t
1138 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1141 bin_tree_t *node, *prev;
1143 for (node = root; ; )
1145 /* Descend down the tree, preferably to the left (or to the right
1146 if that's the only child). */
1147 while (node->left || node->right)
1155 reg_errcode_t err = fn (extra, node);
1156 if (BE (err != REG_NOERROR, 0))
1158 if (node->parent == NULL)
1161 node = node->parent;
1163 /* Go up while we have a node that is reached from the right. */
1164 while (node->right == prev || node->right == NULL);
1169 static reg_errcode_t
1170 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1175 for (node = root; ; )
1177 reg_errcode_t err = fn (extra, node);
1178 if (BE (err != REG_NOERROR, 0))
1181 /* Go to the left node, or up and to the right. */
1186 bin_tree_t *prev = NULL;
1187 while (node->right == prev || node->right == NULL)
1190 node = node->parent;
1199 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1200 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1201 backreferences as well. Requires a preorder visit. */
1202 static reg_errcode_t
1203 optimize_subexps (void *extra, bin_tree_t *node)
1205 re_dfa_t *dfa = (re_dfa_t *) extra;
1207 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1209 int idx = node->token.opr.idx;
1210 node->token.opr.idx = dfa->subexp_map[idx];
1211 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1214 else if (node->token.type == SUBEXP
1215 && node->left && node->left->token.type == SUBEXP)
1217 int other_idx = node->left->token.opr.idx;
1219 node->left = node->left->left;
1221 node->left->parent = node;
1223 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1224 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1225 dfa->used_bkref_map &= ~(1u << other_idx);
1231 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1232 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1233 static reg_errcode_t
1234 lower_subexps (void *extra, bin_tree_t *node)
1236 regex_t *preg = (regex_t *) extra;
1237 reg_errcode_t err = REG_NOERROR;
1239 if (node->left && node->left->token.type == SUBEXP)
1241 node->left = lower_subexp (&err, preg, node->left);
1243 node->left->parent = node;
1245 if (node->right && node->right->token.type == SUBEXP)
1247 node->right = lower_subexp (&err, preg, node->right);
1249 node->right->parent = node;
1256 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1258 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1259 bin_tree_t *body = node->left;
1260 bin_tree_t *op, *cls, *tree1, *tree;
1263 /* We do not optimize empty subexpressions, because otherwise we may
1264 have bad CONCAT nodes with NULL children. This is obviously not
1265 very common, so we do not lose much. An example that triggers
1266 this case is the sed "script" /\(\)/x. */
1267 && node->left != NULL
1268 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1269 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1272 /* Convert the SUBEXP node to the concatenation of an
1273 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1274 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1275 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1276 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1277 tree = create_tree (dfa, op, tree1, CONCAT);
1278 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1284 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1285 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1289 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1290 nodes. Requires a postorder visit. */
1291 static reg_errcode_t
1292 calc_first (void *extra, bin_tree_t *node)
1294 re_dfa_t *dfa = (re_dfa_t *) extra;
1295 if (node->token.type == CONCAT)
1297 node->first = node->left->first;
1298 node->node_idx = node->left->node_idx;
1303 node->node_idx = re_dfa_add_node (dfa, node->token);
1304 if (BE (node->node_idx == -1, 0))
1310 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1311 static reg_errcode_t
1312 calc_next (void *extra, bin_tree_t *node)
1314 switch (node->token.type)
1316 case OP_DUP_ASTERISK:
1317 node->left->next = node;
1320 node->left->next = node->right->first;
1321 node->right->next = node->next;
1325 node->left->next = node->next;
1327 node->right->next = node->next;
1333 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1334 static reg_errcode_t
1335 link_nfa_nodes (void *extra, bin_tree_t *node)
1337 re_dfa_t *dfa = (re_dfa_t *) extra;
1338 int idx = node->node_idx;
1339 reg_errcode_t err = REG_NOERROR;
1341 switch (node->token.type)
1347 assert (node->next == NULL);
1350 case OP_DUP_ASTERISK:
1354 dfa->has_plural_match = 1;
1355 if (node->left != NULL)
1356 left = node->left->first->node_idx;
1358 left = node->next->node_idx;
1359 if (node->right != NULL)
1360 right = node->right->first->node_idx;
1362 right = node->next->node_idx;
1364 assert (right > -1);
1365 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1370 case OP_OPEN_SUBEXP:
1371 case OP_CLOSE_SUBEXP:
1372 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1376 dfa->nexts[idx] = node->next->node_idx;
1377 if (node->token.type == OP_BACK_REF)
1378 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1382 assert (!IS_EPSILON_NODE (node->token.type));
1383 dfa->nexts[idx] = node->next->node_idx;
1390 /* Duplicate the epsilon closure of the node ROOT_NODE.
1391 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1392 to their own constraint. */
1394 static reg_errcode_t
1395 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1396 int root_node, unsigned int init_constraint)
1398 int org_node, clone_node, ret;
1399 unsigned int constraint = init_constraint;
1400 for (org_node = top_org_node, clone_node = top_clone_node;;)
1402 int org_dest, clone_dest;
1403 if (dfa->nodes[org_node].type == OP_BACK_REF)
1405 /* If the back reference epsilon-transit, its destination must
1406 also have the constraint. Then duplicate the epsilon closure
1407 of the destination of the back reference, and store it in
1408 edests of the back reference. */
1409 org_dest = dfa->nexts[org_node];
1410 re_node_set_empty (dfa->edests + clone_node);
1411 clone_dest = duplicate_node (dfa, org_dest, constraint);
1412 if (BE (clone_dest == -1, 0))
1414 dfa->nexts[clone_node] = dfa->nexts[org_node];
1415 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1416 if (BE (ret < 0, 0))
1419 else if (dfa->edests[org_node].nelem == 0)
1421 /* In case of the node can't epsilon-transit, don't duplicate the
1422 destination and store the original destination as the
1423 destination of the node. */
1424 dfa->nexts[clone_node] = dfa->nexts[org_node];
1427 else if (dfa->edests[org_node].nelem == 1)
1429 /* In case of the node can epsilon-transit, and it has only one
1431 org_dest = dfa->edests[org_node].elems[0];
1432 re_node_set_empty (dfa->edests + clone_node);
1433 if (dfa->nodes[org_node].type == ANCHOR)
1435 /* In case of the node has another constraint, append it. */
1436 if (org_node == root_node && clone_node != org_node)
1438 /* ...but if the node is root_node itself, it means the
1439 epsilon closure have a loop, then tie it to the
1440 destination of the root_node. */
1441 ret = re_node_set_insert (dfa->edests + clone_node,
1443 if (BE (ret < 0, 0))
1447 constraint |= dfa->nodes[org_node].opr.ctx_type;
1449 clone_dest = duplicate_node (dfa, org_dest, constraint);
1450 if (BE (clone_dest == -1, 0))
1452 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1453 if (BE (ret < 0, 0))
1456 else /* dfa->edests[org_node].nelem == 2 */
1458 /* In case of the node can epsilon-transit, and it has two
1459 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1460 org_dest = dfa->edests[org_node].elems[0];
1461 re_node_set_empty (dfa->edests + clone_node);
1462 /* Search for a duplicated node which satisfies the constraint. */
1463 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1464 if (clone_dest == -1)
1466 /* There are no such a duplicated node, create a new one. */
1468 clone_dest = duplicate_node (dfa, org_dest, constraint);
1469 if (BE (clone_dest == -1, 0))
1471 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1472 if (BE (ret < 0, 0))
1474 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1475 root_node, constraint);
1476 if (BE (err != REG_NOERROR, 0))
1481 /* There are a duplicated node which satisfy the constraint,
1482 use it to avoid infinite loop. */
1483 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1484 if (BE (ret < 0, 0))
1488 org_dest = dfa->edests[org_node].elems[1];
1489 clone_dest = duplicate_node (dfa, org_dest, constraint);
1490 if (BE (clone_dest == -1, 0))
1492 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493 if (BE (ret < 0, 0))
1496 org_node = org_dest;
1497 clone_node = clone_dest;
1502 /* Search for a node which is duplicated from the node ORG_NODE, and
1503 satisfies the constraint CONSTRAINT. */
1506 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1509 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1511 if (org_node == dfa->org_indices[idx]
1512 && constraint == dfa->nodes[idx].constraint)
1513 return idx; /* Found. */
1515 return -1; /* Not found. */
1518 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1519 Return the index of the new node, or -1 if insufficient storage is
1523 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1525 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1526 if (BE (dup_idx != -1, 1))
1528 dfa->nodes[dup_idx].constraint = constraint;
1529 if (dfa->nodes[org_idx].type == ANCHOR)
1530 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1531 dfa->nodes[dup_idx].duplicated = 1;
1533 /* Store the index of the original node. */
1534 dfa->org_indices[dup_idx] = org_idx;
1539 static reg_errcode_t
1540 calc_inveclosure (re_dfa_t *dfa)
1543 for (idx = 0; idx < dfa->nodes_len; ++idx)
1544 re_node_set_init_empty (dfa->inveclosures + idx);
1546 for (src = 0; src < dfa->nodes_len; ++src)
1548 int *elems = dfa->eclosures[src].elems;
1549 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1551 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1552 if (BE (ret == -1, 0))
1560 /* Calculate "eclosure" for all the node in DFA. */
1562 static reg_errcode_t
1563 calc_eclosure (re_dfa_t *dfa)
1565 int node_idx, incomplete;
1567 assert (dfa->nodes_len > 0);
1570 /* For each nodes, calculate epsilon closure. */
1571 for (node_idx = 0; ; ++node_idx)
1574 re_node_set eclosure_elem;
1575 if (node_idx == dfa->nodes_len)
1584 assert (dfa->eclosures[node_idx].nelem != -1);
1587 /* If we have already calculated, skip it. */
1588 if (dfa->eclosures[node_idx].nelem != 0)
1590 /* Calculate epsilon closure of `node_idx'. */
1591 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1592 if (BE (err != REG_NOERROR, 0))
1595 if (dfa->eclosures[node_idx].nelem == 0)
1598 re_node_set_free (&eclosure_elem);
1604 /* Calculate epsilon closure of NODE. */
1606 static reg_errcode_t
1607 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1610 unsigned int constraint;
1612 re_node_set eclosure;
1614 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1615 if (BE (err != REG_NOERROR, 0))
1618 /* This indicates that we are calculating this node now.
1619 We reference this value to avoid infinite loop. */
1620 dfa->eclosures[node].nelem = -1;
1622 constraint = ((dfa->nodes[node].type == ANCHOR)
1623 ? dfa->nodes[node].opr.ctx_type : 0);
1624 /* If the current node has constraints, duplicate all nodes.
1625 Since they must inherit the constraints. */
1627 && dfa->edests[node].nelem
1628 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1630 int org_node, cur_node;
1631 org_node = cur_node = node;
1632 err = duplicate_node_closure (dfa, node, node, node, constraint);
1633 if (BE (err != REG_NOERROR, 0))
1637 /* Expand each epsilon destination nodes. */
1638 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1639 for (i = 0; i < dfa->edests[node].nelem; ++i)
1641 re_node_set eclosure_elem;
1642 int edest = dfa->edests[node].elems[i];
1643 /* If calculating the epsilon closure of `edest' is in progress,
1644 return intermediate result. */
1645 if (dfa->eclosures[edest].nelem == -1)
1650 /* If we haven't calculated the epsilon closure of `edest' yet,
1651 calculate now. Otherwise use calculated epsilon closure. */
1652 if (dfa->eclosures[edest].nelem == 0)
1654 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1655 if (BE (err != REG_NOERROR, 0))
1659 eclosure_elem = dfa->eclosures[edest];
1660 /* Merge the epsilon closure of `edest'. */
1661 re_node_set_merge (&eclosure, &eclosure_elem);
1662 /* If the epsilon closure of `edest' is incomplete,
1663 the epsilon closure of this node is also incomplete. */
1664 if (dfa->eclosures[edest].nelem == 0)
1667 re_node_set_free (&eclosure_elem);
1671 /* Epsilon closures include itself. */
1672 re_node_set_insert (&eclosure, node);
1673 if (incomplete && !root)
1674 dfa->eclosures[node].nelem = 0;
1676 dfa->eclosures[node] = eclosure;
1677 *new_set = eclosure;
1681 /* Functions for token which are used in the parser. */
1683 /* Fetch a token from INPUT.
1684 We must not use this function inside bracket expressions. */
1687 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1689 re_string_skip_bytes (input, peek_token (result, input, syntax));
1692 /* Peek a token from INPUT, and return the length of the token.
1693 We must not use this function inside bracket expressions. */
1696 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1700 if (re_string_eoi (input))
1702 token->type = END_OF_RE;
1706 c = re_string_peek_byte (input, 0);
1709 token->word_char = 0;
1710 #ifdef RE_ENABLE_I18N
1711 token->mb_partial = 0;
1712 if (input->mb_cur_max > 1 &&
1713 !re_string_first_byte (input, re_string_cur_idx (input)))
1715 token->type = CHARACTER;
1716 token->mb_partial = 1;
1723 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1725 token->type = BACK_SLASH;
1729 c2 = re_string_peek_byte_case (input, 1);
1731 token->type = CHARACTER;
1732 #ifdef RE_ENABLE_I18N
1733 if (input->mb_cur_max > 1)
1735 wint_t wc = re_string_wchar_at (input,
1736 re_string_cur_idx (input) + 1);
1737 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1741 token->word_char = IS_WORD_CHAR (c2) != 0;
1746 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1747 token->type = OP_ALT;
1749 case '1': case '2': case '3': case '4': case '5':
1750 case '6': case '7': case '8': case '9':
1751 if (!(syntax & REG_NO_BK_REFS))
1753 token->type = OP_BACK_REF;
1754 token->opr.idx = c2 - '1';
1758 if (!(syntax & REG_NO_GNU_OPS))
1760 token->type = ANCHOR;
1761 token->opr.ctx_type = WORD_FIRST;
1765 if (!(syntax & REG_NO_GNU_OPS))
1767 token->type = ANCHOR;
1768 token->opr.ctx_type = WORD_LAST;
1772 if (!(syntax & REG_NO_GNU_OPS))
1774 token->type = ANCHOR;
1775 token->opr.ctx_type = WORD_DELIM;
1779 if (!(syntax & REG_NO_GNU_OPS))
1781 token->type = ANCHOR;
1782 token->opr.ctx_type = NOT_WORD_DELIM;
1786 if (!(syntax & REG_NO_GNU_OPS))
1787 token->type = OP_WORD;
1790 if (!(syntax & REG_NO_GNU_OPS))
1791 token->type = OP_NOTWORD;
1794 if (!(syntax & REG_NO_GNU_OPS))
1795 token->type = OP_SPACE;
1798 if (!(syntax & REG_NO_GNU_OPS))
1799 token->type = OP_NOTSPACE;
1802 if (!(syntax & REG_NO_GNU_OPS))
1804 token->type = ANCHOR;
1805 token->opr.ctx_type = BUF_FIRST;
1809 if (!(syntax & REG_NO_GNU_OPS))
1811 token->type = ANCHOR;
1812 token->opr.ctx_type = BUF_LAST;
1816 if (!(syntax & REG_NO_BK_PARENS))
1817 token->type = OP_OPEN_SUBEXP;
1820 if (!(syntax & REG_NO_BK_PARENS))
1821 token->type = OP_CLOSE_SUBEXP;
1824 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1825 token->type = OP_DUP_PLUS;
1828 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1829 token->type = OP_DUP_QUESTION;
1832 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1833 token->type = OP_OPEN_DUP_NUM;
1836 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1837 token->type = OP_CLOSE_DUP_NUM;
1845 token->type = CHARACTER;
1846 #ifdef RE_ENABLE_I18N
1847 if (input->mb_cur_max > 1)
1849 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1850 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854 token->word_char = IS_WORD_CHAR (token->opr.c);
1859 if (syntax & REG_NEWLINE_ALT)
1860 token->type = OP_ALT;
1863 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1864 token->type = OP_ALT;
1867 token->type = OP_DUP_ASTERISK;
1870 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1871 token->type = OP_DUP_PLUS;
1874 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1875 token->type = OP_DUP_QUESTION;
1878 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1879 token->type = OP_OPEN_DUP_NUM;
1882 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1883 token->type = OP_CLOSE_DUP_NUM;
1886 if (syntax & REG_NO_BK_PARENS)
1887 token->type = OP_OPEN_SUBEXP;
1890 if (syntax & REG_NO_BK_PARENS)
1891 token->type = OP_CLOSE_SUBEXP;
1894 token->type = OP_OPEN_BRACKET;
1897 token->type = OP_PERIOD;
1900 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1901 re_string_cur_idx (input) != 0)
1903 char prev = re_string_peek_byte (input, -1);
1904 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1907 token->type = ANCHOR;
1908 token->opr.ctx_type = LINE_FIRST;
1911 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1912 re_string_cur_idx (input) + 1 != re_string_length (input))
1915 re_string_skip_bytes (input, 1);
1916 peek_token (&next, input, syntax);
1917 re_string_skip_bytes (input, -1);
1918 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1921 token->type = ANCHOR;
1922 token->opr.ctx_type = LINE_LAST;
1930 /* Peek a token from INPUT, and return the length of the token.
1931 We must not use this function out of bracket expressions. */
1934 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1937 if (re_string_eoi (input))
1939 token->type = END_OF_RE;
1942 c = re_string_peek_byte (input, 0);
1945 #ifdef RE_ENABLE_I18N
1946 if (input->mb_cur_max > 1 &&
1947 !re_string_first_byte (input, re_string_cur_idx (input)))
1949 token->type = CHARACTER;
1952 #endif /* RE_ENABLE_I18N */
1954 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1955 && re_string_cur_idx (input) + 1 < re_string_length (input))
1957 /* In this case, '\' escape a character. */
1959 re_string_skip_bytes (input, 1);
1960 c2 = re_string_peek_byte (input, 0);
1962 token->type = CHARACTER;
1965 if (c == '[') /* '[' is a special char in a bracket exps. */
1969 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1970 c2 = re_string_peek_byte (input, 1);
1978 token->type = OP_OPEN_COLL_ELEM;
1981 token->type = OP_OPEN_EQUIV_CLASS;
1984 if (syntax & REG_CHAR_CLASSES)
1986 token->type = OP_OPEN_CHAR_CLASS;
1989 /* else fall through. */
1991 token->type = CHARACTER;
2001 token->type = OP_CHARSET_RANGE;
2004 token->type = OP_CLOSE_BRACKET;
2007 token->type = OP_NON_MATCH_LIST;
2010 token->type = CHARACTER;
2015 /* Functions for parser. */
2017 /* Entry point of the parser.
2018 Parse the regular expression REGEXP and return the structure tree.
2019 If an error is occured, ERR is set by error code, and return NULL.
2020 This function build the following tree, from regular expression <reg_exp>:
2026 CAT means concatenation.
2027 EOR means end of regular expression. */
2030 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2033 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2034 bin_tree_t *tree, *eor, *root;
2035 re_token_t current_token;
2036 dfa->syntax = syntax;
2037 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2038 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2039 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2041 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2043 root = create_tree (dfa, tree, eor, CONCAT);
2046 if (BE (eor == NULL || root == NULL, 0))
2054 /* This function build the following tree, from regular expression
2055 <branch1>|<branch2>:
2061 ALT means alternative, which represents the operator `|'. */
2064 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2065 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2067 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2068 bin_tree_t *tree, *branch = NULL;
2069 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2070 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2073 while (token->type == OP_ALT)
2075 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2076 if (token->type != OP_ALT && token->type != END_OF_RE
2077 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2079 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2080 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2085 tree = create_tree (dfa, tree, branch, OP_ALT);
2086 if (BE (tree == NULL, 0))
2095 /* This function build the following tree, from regular expression
2102 CAT means concatenation. */
2105 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2106 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2108 bin_tree_t *tree, *exp;
2109 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2110 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2111 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114 while (token->type != OP_ALT && token->type != END_OF_RE
2115 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2117 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2118 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2122 if (tree != NULL && exp != NULL)
2124 tree = create_tree (dfa, tree, exp, CONCAT);
2131 else if (tree == NULL)
2133 /* Otherwise exp == NULL, we don't need to create new tree. */
2138 /* This function build the following tree, from regular expression a*:
2145 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2148 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2150 switch (token->type)
2153 tree = create_token_tree (dfa, NULL, NULL, token);
2154 if (BE (tree == NULL, 0))
2159 #ifdef RE_ENABLE_I18N
2160 if (dfa->mb_cur_max > 1)
2162 while (!re_string_eoi (regexp)
2163 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2165 bin_tree_t *mbc_remain;
2166 fetch_token (token, regexp, syntax);
2167 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2168 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2169 if (BE (mbc_remain == NULL || tree == NULL, 0))
2178 case OP_OPEN_SUBEXP:
2179 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2180 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183 case OP_OPEN_BRACKET:
2184 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2185 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2194 dfa->used_bkref_map |= 1 << token->opr.idx;
2195 tree = create_token_tree (dfa, NULL, NULL, token);
2196 if (BE (tree == NULL, 0))
2202 dfa->has_mb_node = 1;
2204 case OP_OPEN_DUP_NUM:
2205 if (syntax & REG_CONTEXT_INVALID_DUP)
2211 case OP_DUP_ASTERISK:
2213 case OP_DUP_QUESTION:
2214 if (syntax & REG_CONTEXT_INVALID_OPS)
2219 else if (syntax & REG_CONTEXT_INDEP_OPS)
2221 fetch_token (token, regexp, syntax);
2222 return parse_expression (regexp, preg, token, syntax, nest, err);
2224 /* else fall through */
2225 case OP_CLOSE_SUBEXP:
2226 if ((token->type == OP_CLOSE_SUBEXP) &&
2227 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2232 /* else fall through */
2233 case OP_CLOSE_DUP_NUM:
2234 /* We treat it as a normal character. */
2236 /* Then we can these characters as normal characters. */
2237 token->type = CHARACTER;
2238 /* mb_partial and word_char bits should be initialized already
2240 tree = create_token_tree (dfa, NULL, NULL, token);
2241 if (BE (tree == NULL, 0))
2248 if ((token->opr.ctx_type
2249 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2250 && dfa->word_ops_used == 0)
2251 init_word_char (dfa);
2252 if (token->opr.ctx_type == WORD_DELIM
2253 || token->opr.ctx_type == NOT_WORD_DELIM)
2255 bin_tree_t *tree_first, *tree_last;
2256 if (token->opr.ctx_type == WORD_DELIM)
2258 token->opr.ctx_type = WORD_FIRST;
2259 tree_first = create_token_tree (dfa, NULL, NULL, token);
2260 token->opr.ctx_type = WORD_LAST;
2264 token->opr.ctx_type = INSIDE_WORD;
2265 tree_first = create_token_tree (dfa, NULL, NULL, token);
2266 token->opr.ctx_type = INSIDE_NOTWORD;
2268 tree_last = create_token_tree (dfa, NULL, NULL, token);
2269 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2270 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2278 tree = create_token_tree (dfa, NULL, NULL, token);
2279 if (BE (tree == NULL, 0))
2285 /* We must return here, since ANCHORs can't be followed
2286 by repetition operators.
2287 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2288 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2289 fetch_token (token, regexp, syntax);
2292 tree = create_token_tree (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2298 if (dfa->mb_cur_max > 1)
2299 dfa->has_mb_node = 1;
2303 tree = build_charclass_op (dfa, regexp->trans,
2304 (const unsigned char *) "alnum",
2305 (const unsigned char *) "_",
2306 token->type == OP_NOTWORD, err);
2307 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312 tree = build_charclass_op (dfa, regexp->trans,
2313 (const unsigned char *) "space",
2314 (const unsigned char *) "",
2315 token->type == OP_NOTSPACE, err);
2316 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2326 /* Must not happen? */
2332 fetch_token (token, regexp, syntax);
2334 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2335 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2337 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2338 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2340 /* In BRE consecutive duplications are not allowed. */
2341 if ((syntax & REG_CONTEXT_INVALID_DUP)
2342 && (token->type == OP_DUP_ASTERISK
2343 || token->type == OP_OPEN_DUP_NUM))
2353 /* This function build the following tree, from regular expression
2361 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2362 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2364 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2367 cur_nsub = preg->re_nsub++;
2369 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2371 /* The subexpression may be a null string. */
2372 if (token->type == OP_CLOSE_SUBEXP)
2376 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2377 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2379 if (BE (*err != REG_NOERROR, 0))
2383 if (cur_nsub <= '9' - '1')
2384 dfa->completed_bkref_map |= 1 << cur_nsub;
2386 tree = create_tree (dfa, tree, NULL, SUBEXP);
2387 if (BE (tree == NULL, 0))
2392 tree->token.opr.idx = cur_nsub;
2396 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2399 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2400 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2402 bin_tree_t *tree = NULL, *old_tree = NULL;
2403 int i, start, end, start_idx = re_string_cur_idx (regexp);
2404 re_token_t start_token = *token;
2406 if (token->type == OP_OPEN_DUP_NUM)
2409 start = fetch_number (regexp, token, syntax);
2412 if (token->type == CHARACTER && token->opr.c == ',')
2413 start = 0; /* We treat "{,m}" as "{0,m}". */
2416 *err = REG_BADBR; /* <re>{} is invalid. */
2420 if (BE (start != -2, 1))
2422 /* We treat "{n}" as "{n,n}". */
2423 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2424 : ((token->type == CHARACTER && token->opr.c == ',')
2425 ? fetch_number (regexp, token, syntax) : -2));
2427 if (BE (start == -2 || end == -2, 0))
2429 /* Invalid sequence. */
2430 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2432 if (token->type == END_OF_RE)
2440 /* If the syntax bit is set, rollback. */
2441 re_string_set_index (regexp, start_idx);
2442 *token = start_token;
2443 token->type = CHARACTER;
2444 /* mb_partial and word_char bits should be already initialized by
2449 if (BE (end != -1 && start > end, 0))
2451 /* First number greater than second. */
2458 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2459 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2462 fetch_token (token, regexp, syntax);
2464 if (BE (elem == NULL, 0))
2466 if (BE (start == 0 && end == 0, 0))
2468 postorder (elem, free_tree, NULL);
2472 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2473 if (BE (start > 0, 0))
2476 for (i = 2; i <= start; ++i)
2478 elem = duplicate_tree (elem, dfa);
2479 tree = create_tree (dfa, tree, elem, CONCAT);
2480 if (BE (elem == NULL || tree == NULL, 0))
2481 goto parse_dup_op_espace;
2487 /* Duplicate ELEM before it is marked optional. */
2488 elem = duplicate_tree (elem, dfa);
2494 if (elem->token.type == SUBEXP)
2495 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2497 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2498 if (BE (tree == NULL, 0))
2499 goto parse_dup_op_espace;
2501 /* This loop is actually executed only when end != -1,
2502 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2503 already created the start+1-th copy. */
2504 for (i = start + 2; i <= end; ++i)
2506 elem = duplicate_tree (elem, dfa);
2507 tree = create_tree (dfa, tree, elem, CONCAT);
2508 if (BE (elem == NULL || tree == NULL, 0))
2509 goto parse_dup_op_espace;
2511 tree = create_tree (dfa, tree, NULL, OP_ALT);
2512 if (BE (tree == NULL, 0))
2513 goto parse_dup_op_espace;
2517 tree = create_tree (dfa, old_tree, tree, CONCAT);
2521 parse_dup_op_espace:
2526 /* Size of the names for collating symbol/equivalence_class/character_class.
2527 I'm not sure, but maybe enough. */
2528 #define BRACKET_NAME_BUF_SIZE 32
2531 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2532 Build the range expression which starts from START_ELEM, and ends
2533 at END_ELEM. The result are written to MBCSET and SBCSET.
2534 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2535 mbcset->range_ends, is a pointer argument sinse we may
2538 static reg_errcode_t
2539 build_range_exp (re_bitset_ptr_t sbcset,
2540 # ifdef RE_ENABLE_I18N
2541 re_charset_t *mbcset, int *range_alloc,
2543 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2545 unsigned int start_ch, end_ch;
2546 /* Equivalence Classes and Character Classes can't be a range start/end. */
2547 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2548 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2552 /* We can handle no multi character collating elements without libc
2554 if (BE ((start_elem->type == COLL_SYM
2555 && strlen ((char *) start_elem->opr.name) > 1)
2556 || (end_elem->type == COLL_SYM
2557 && strlen ((char *) end_elem->opr.name) > 1), 0))
2558 return REG_ECOLLATE;
2560 # ifdef RE_ENABLE_I18N
2563 wint_t start_wc, end_wc;
2564 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2566 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2567 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2569 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2570 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2572 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2573 ? __btowc (start_ch) : start_elem->opr.wch);
2574 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2575 ? __btowc (end_ch) : end_elem->opr.wch);
2576 if (start_wc == WEOF || end_wc == WEOF)
2577 return REG_ECOLLATE;
2578 cmp_buf[0] = start_wc;
2579 cmp_buf[4] = end_wc;
2580 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2583 /* Got valid collation sequence values, add them as a new entry.
2584 However, for !_LIBC we have no collation elements: if the
2585 character set is single byte, the single byte character set
2586 that we build below suffices. parse_bracket_exp passes
2587 no MBCSET if dfa->mb_cur_max == 1. */
2590 /* Check the space of the arrays. */
2591 if (BE (*range_alloc == mbcset->nranges, 0))
2593 /* There is not enough space, need realloc. */
2594 wchar_t *new_array_start, *new_array_end;
2597 /* +1 in case of mbcset->nranges is 0. */
2598 new_nranges = 2 * mbcset->nranges + 1;
2599 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2600 are NULL if *range_alloc == 0. */
2601 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2603 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2606 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2609 mbcset->range_starts = new_array_start;
2610 mbcset->range_ends = new_array_end;
2611 *range_alloc = new_nranges;
2614 mbcset->range_starts[mbcset->nranges] = start_wc;
2615 mbcset->range_ends[mbcset->nranges++] = end_wc;
2618 /* Build the table for single byte characters. */
2619 for (wc = 0; wc < SBC_MAX; ++wc)
2622 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2623 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2624 bitset_set (sbcset, wc);
2627 # else /* not RE_ENABLE_I18N */
2630 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2631 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2633 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2634 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2636 if (start_ch > end_ch)
2638 /* Build the table for single byte characters. */
2639 for (ch = 0; ch < SBC_MAX; ++ch)
2640 if (start_ch <= ch && ch <= end_ch)
2641 bitset_set (sbcset, ch);
2643 # endif /* not RE_ENABLE_I18N */
2646 #endif /* not _LIBC */
2649 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2650 Build the collating element which is represented by NAME.
2651 The result are written to MBCSET and SBCSET.
2652 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2653 pointer argument since we may update it. */
2655 static reg_errcode_t
2656 build_collating_symbol (re_bitset_ptr_t sbcset,
2657 # ifdef RE_ENABLE_I18N
2658 re_charset_t *mbcset, int *coll_sym_alloc,
2660 const unsigned char *name)
2662 size_t name_len = strlen ((const char *) name);
2663 if (BE (name_len != 1, 0))
2664 return REG_ECOLLATE;
2667 bitset_set (sbcset, name[0]);
2671 #endif /* not _LIBC */
2673 /* This function parse bracket expression like "[abc]", "[a-c]",
2677 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2678 reg_syntax_t syntax, reg_errcode_t *err)
2681 const unsigned char *collseqmb;
2682 const char *collseqwc;
2685 const int32_t *symb_table;
2686 const unsigned char *extra;
2688 /* Local function for parse_bracket_exp used in _LIBC environement.
2689 Seek the collating symbol entry correspondings to NAME.
2690 Return the index of the symbol in the SYMB_TABLE. */
2693 __attribute ((always_inline))
2694 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2696 int32_t hash = elem_hash ((const char *) name, name_len);
2697 int32_t elem = hash % table_size;
2698 int32_t second = hash % (table_size - 2);
2699 while (symb_table[2 * elem] != 0)
2701 /* First compare the hashing value. */
2702 if (symb_table[2 * elem] == hash
2703 /* Compare the length of the name. */
2704 && name_len == extra[symb_table[2 * elem + 1]]
2705 /* Compare the name. */
2706 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2709 /* Yep, this is the entry. */
2719 /* Local function for parse_bracket_exp used in _LIBC environement.
2720 Look up the collation sequence value of BR_ELEM.
2721 Return the value if succeeded, UINT_MAX otherwise. */
2723 auto inline unsigned int
2724 __attribute ((always_inline))
2725 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2727 if (br_elem->type == SB_CHAR)
2730 if (MB_CUR_MAX == 1)
2733 return collseqmb[br_elem->opr.ch];
2736 wint_t wc = __btowc (br_elem->opr.ch);
2737 return __collseq_table_lookup (collseqwc, wc);
2740 else if (br_elem->type == MB_CHAR)
2742 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2744 else if (br_elem->type == COLL_SYM)
2746 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2750 elem = seek_collating_symbol_entry (br_elem->opr.name,
2752 if (symb_table[2 * elem] != 0)
2754 /* We found the entry. */
2755 idx = symb_table[2 * elem + 1];
2756 /* Skip the name of collating element name. */
2757 idx += 1 + extra[idx];
2758 /* Skip the byte sequence of the collating element. */
2759 idx += 1 + extra[idx];
2760 /* Adjust for the alignment. */
2761 idx = (idx + 3) & ~3;
2762 /* Skip the multibyte collation sequence value. */
2763 idx += sizeof (unsigned int);
2764 /* Skip the wide char sequence of the collating element. */
2765 idx += sizeof (unsigned int) *
2766 (1 + *(unsigned int *) (extra + idx));
2767 /* Return the collation sequence value. */
2768 return *(unsigned int *) (extra + idx);
2770 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2772 /* No valid character. Match it as a single byte
2774 return collseqmb[br_elem->opr.name[0]];
2777 else if (sym_name_len == 1)
2778 return collseqmb[br_elem->opr.name[0]];
2783 /* Local function for parse_bracket_exp used in _LIBC environement.
2784 Build the range expression which starts from START_ELEM, and ends
2785 at END_ELEM. The result are written to MBCSET and SBCSET.
2786 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2787 mbcset->range_ends, is a pointer argument sinse we may
2790 auto inline reg_errcode_t
2791 __attribute ((always_inline))
2792 build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2794 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2797 uint32_t start_collseq;
2798 uint32_t end_collseq;
2800 /* Equivalence Classes and Character Classes can't be a range
2802 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2803 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2807 start_collseq = lookup_collation_sequence_value (start_elem);
2808 end_collseq = lookup_collation_sequence_value (end_elem);
2809 /* Check start/end collation sequence values. */
2810 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2811 return REG_ECOLLATE;
2812 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2815 /* Got valid collation sequence values, add them as a new entry.
2816 However, if we have no collation elements, and the character set
2817 is single byte, the single byte character set that we
2818 build below suffices. */
2819 if (nrules > 0 || dfa->mb_cur_max > 1)
2821 /* Check the space of the arrays. */
2822 if (BE (*range_alloc == mbcset->nranges, 0))
2824 /* There is not enough space, need realloc. */
2825 uint32_t *new_array_start;
2826 uint32_t *new_array_end;
2829 /* +1 in case of mbcset->nranges is 0. */
2830 new_nranges = 2 * mbcset->nranges + 1;
2831 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2833 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2836 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2839 mbcset->range_starts = new_array_start;
2840 mbcset->range_ends = new_array_end;
2841 *range_alloc = new_nranges;
2844 mbcset->range_starts[mbcset->nranges] = start_collseq;
2845 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2848 /* Build the table for single byte characters. */
2849 for (ch = 0; ch < SBC_MAX; ch++)
2851 uint32_t ch_collseq;
2853 if (MB_CUR_MAX == 1)
2856 ch_collseq = collseqmb[ch];
2858 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2859 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2860 bitset_set (sbcset, ch);
2865 /* Local function for parse_bracket_exp used in _LIBC environement.
2866 Build the collating element which is represented by NAME.
2867 The result are written to MBCSET and SBCSET.
2868 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2869 pointer argument sinse we may update it. */
2871 auto inline reg_errcode_t
2872 __attribute ((always_inline))
2873 build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2874 int *coll_sym_alloc, const unsigned char *name)
2877 size_t name_len = strlen ((const char *) name);
2880 elem = seek_collating_symbol_entry (name, name_len);
2881 if (symb_table[2 * elem] != 0)
2883 /* We found the entry. */
2884 idx = symb_table[2 * elem + 1];
2885 /* Skip the name of collating element name. */
2886 idx += 1 + extra[idx];
2888 else if (symb_table[2 * elem] == 0 && name_len == 1)
2890 /* No valid character, treat it as a normal
2892 bitset_set (sbcset, name[0]);
2896 return REG_ECOLLATE;
2898 /* Got valid collation sequence, add it as a new entry. */
2899 /* Check the space of the arrays. */
2900 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2902 /* Not enough, realloc it. */
2903 /* +1 in case of mbcset->ncoll_syms is 0. */
2904 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2905 /* Use realloc since mbcset->coll_syms is NULL
2907 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2908 new_coll_sym_alloc);
2909 if (BE (new_coll_syms == NULL, 0))
2911 mbcset->coll_syms = new_coll_syms;
2912 *coll_sym_alloc = new_coll_sym_alloc;
2914 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2919 if (BE (name_len != 1, 0))
2920 return REG_ECOLLATE;
2923 bitset_set (sbcset, name[0]);
2930 re_token_t br_token;
2931 re_bitset_ptr_t sbcset;
2932 #ifdef RE_ENABLE_I18N
2933 re_charset_t *mbcset;
2934 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2935 int equiv_class_alloc = 0, char_class_alloc = 0;
2936 #endif /* not RE_ENABLE_I18N */
2938 bin_tree_t *work_tree;
2940 int first_round = 1;
2942 collseqmb = (const unsigned char *)
2943 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2944 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2950 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2951 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2952 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2953 _NL_COLLATE_SYMB_TABLEMB);
2954 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2955 _NL_COLLATE_SYMB_EXTRAMB);
2958 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2959 #ifdef RE_ENABLE_I18N
2960 mbcset = re_calloc (re_charset_t, 1);
2961 #endif /* RE_ENABLE_I18N */
2962 #ifdef RE_ENABLE_I18N
2963 if (BE (sbcset == NULL || mbcset == NULL, 0))
2965 if (BE (sbcset == NULL, 0))
2966 #endif /* RE_ENABLE_I18N */
2972 token_len = peek_token_bracket (token, regexp, syntax);
2973 if (BE (token->type == END_OF_RE, 0))
2976 goto parse_bracket_exp_free_return;
2978 if (token->type == OP_NON_MATCH_LIST)
2980 #ifdef RE_ENABLE_I18N
2981 mbcset->non_match = 1;
2982 #endif /* not RE_ENABLE_I18N */
2984 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2985 bitset_set (sbcset, '\0');
2986 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2987 token_len = peek_token_bracket (token, regexp, syntax);
2988 if (BE (token->type == END_OF_RE, 0))
2991 goto parse_bracket_exp_free_return;
2995 /* We treat the first ']' as a normal character. */
2996 if (token->type == OP_CLOSE_BRACKET)
2997 token->type = CHARACTER;
3001 bracket_elem_t start_elem, end_elem;
3002 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3003 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3005 int token_len2 = 0, is_range_exp = 0;
3008 start_elem.opr.name = start_name_buf;
3009 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3010 syntax, first_round);
3011 if (BE (ret != REG_NOERROR, 0))
3014 goto parse_bracket_exp_free_return;
3018 /* Get information about the next token. We need it in any case. */
3019 token_len = peek_token_bracket (token, regexp, syntax);
3021 /* Do not check for ranges if we know they are not allowed. */
3022 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3024 if (BE (token->type == END_OF_RE, 0))
3027 goto parse_bracket_exp_free_return;
3029 if (token->type == OP_CHARSET_RANGE)
3031 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3032 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3033 if (BE (token2.type == END_OF_RE, 0))
3036 goto parse_bracket_exp_free_return;
3038 if (token2.type == OP_CLOSE_BRACKET)
3040 /* We treat the last '-' as a normal character. */
3041 re_string_skip_bytes (regexp, -token_len);
3042 token->type = CHARACTER;
3049 if (is_range_exp == 1)
3051 end_elem.opr.name = end_name_buf;
3052 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3054 if (BE (ret != REG_NOERROR, 0))
3057 goto parse_bracket_exp_free_return;
3060 token_len = peek_token_bracket (token, regexp, syntax);
3063 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3064 &start_elem, &end_elem);
3066 # ifdef RE_ENABLE_I18N
3067 *err = build_range_exp (sbcset,
3068 dfa->mb_cur_max > 1 ? mbcset : NULL,
3069 &range_alloc, &start_elem, &end_elem);
3071 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3073 #endif /* RE_ENABLE_I18N */
3074 if (BE (*err != REG_NOERROR, 0))
3075 goto parse_bracket_exp_free_return;
3079 switch (start_elem.type)
3082 bitset_set (sbcset, start_elem.opr.ch);
3084 #ifdef RE_ENABLE_I18N
3086 /* Check whether the array has enough space. */
3087 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3089 wchar_t *new_mbchars;
3090 /* Not enough, realloc it. */
3091 /* +1 in case of mbcset->nmbchars is 0. */
3092 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3093 /* Use realloc since array is NULL if *alloc == 0. */
3094 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3096 if (BE (new_mbchars == NULL, 0))
3097 goto parse_bracket_exp_espace;
3098 mbcset->mbchars = new_mbchars;
3100 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3102 #endif /* RE_ENABLE_I18N */
3104 *err = build_equiv_class (sbcset,
3105 #ifdef RE_ENABLE_I18N
3106 mbcset, &equiv_class_alloc,
3107 #endif /* RE_ENABLE_I18N */
3108 start_elem.opr.name);
3109 if (BE (*err != REG_NOERROR, 0))
3110 goto parse_bracket_exp_free_return;
3113 *err = build_collating_symbol (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115 mbcset, &coll_sym_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117 start_elem.opr.name);
3118 if (BE (*err != REG_NOERROR, 0))
3119 goto parse_bracket_exp_free_return;
3122 *err = build_charclass (regexp->trans, sbcset,
3123 #ifdef RE_ENABLE_I18N
3124 mbcset, &char_class_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126 start_elem.opr.name, syntax);
3127 if (BE (*err != REG_NOERROR, 0))
3128 goto parse_bracket_exp_free_return;
3135 if (BE (token->type == END_OF_RE, 0))
3138 goto parse_bracket_exp_free_return;
3140 if (token->type == OP_CLOSE_BRACKET)
3144 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3146 /* If it is non-matching list. */
3148 bitset_not (sbcset);
3150 #ifdef RE_ENABLE_I18N
3151 /* Ensure only single byte characters are set. */
3152 if (dfa->mb_cur_max > 1)
3153 bitset_mask (sbcset, dfa->sb_char);
3155 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3156 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3157 || mbcset->non_match)))
3159 bin_tree_t *mbc_tree;
3161 /* Build a tree for complex bracket. */
3162 dfa->has_mb_node = 1;
3163 br_token.type = COMPLEX_BRACKET;
3164 br_token.opr.mbcset = mbcset;
3165 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3166 if (BE (mbc_tree == NULL, 0))
3167 goto parse_bracket_exp_espace;
3168 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3169 if (sbcset[sbc_idx])
3171 /* If there are no bits set in sbcset, there is no point
3172 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3173 if (sbc_idx < BITSET_UINTS)
3175 /* Build a tree for simple bracket. */
3176 br_token.type = SIMPLE_BRACKET;
3177 br_token.opr.sbcset = sbcset;
3178 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3179 if (BE (work_tree == NULL, 0))
3180 goto parse_bracket_exp_espace;
3182 /* Then join them by ALT node. */
3183 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3184 if (BE (work_tree == NULL, 0))
3185 goto parse_bracket_exp_espace;
3190 work_tree = mbc_tree;
3194 #endif /* not RE_ENABLE_I18N */
3196 #ifdef RE_ENABLE_I18N
3197 free_charset (mbcset);
3199 /* Build a tree for simple bracket. */
3200 br_token.type = SIMPLE_BRACKET;
3201 br_token.opr.sbcset = sbcset;
3202 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3203 if (BE (work_tree == NULL, 0))
3204 goto parse_bracket_exp_espace;
3208 parse_bracket_exp_espace:
3210 parse_bracket_exp_free_return:
3212 #ifdef RE_ENABLE_I18N
3213 free_charset (mbcset);
3214 #endif /* RE_ENABLE_I18N */
3218 /* Parse an element in the bracket expression. */
3220 static reg_errcode_t
3221 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3222 re_token_t *token, int token_len, re_dfa_t *dfa,
3223 reg_syntax_t syntax, int accept_hyphen)
3225 #ifdef RE_ENABLE_I18N
3227 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3228 if (cur_char_size > 1)
3230 elem->type = MB_CHAR;
3231 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3232 re_string_skip_bytes (regexp, cur_char_size);
3235 #endif /* RE_ENABLE_I18N */
3236 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3237 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3238 || token->type == OP_OPEN_EQUIV_CLASS)
3239 return parse_bracket_symbol (elem, regexp, token);
3240 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3242 /* A '-' must only appear as anything but a range indicator before
3243 the closing bracket. Everything else is an error. */
3245 (void) peek_token_bracket (&token2, regexp, syntax);
3246 if (token2.type != OP_CLOSE_BRACKET)
3247 /* The actual error value is not standardized since this whole
3248 case is undefined. But ERANGE makes good sense. */
3251 elem->type = SB_CHAR;
3252 elem->opr.ch = token->opr.c;
3256 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3257 such as [:<character_class>:], [.<collating_element>.], and
3258 [=<equivalent_class>=]. */
3260 static reg_errcode_t
3261 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3264 unsigned char ch, delim = token->opr.c;
3266 if (re_string_eoi(regexp))
3270 if (i >= BRACKET_NAME_BUF_SIZE)
3272 if (token->type == OP_OPEN_CHAR_CLASS)
3273 ch = re_string_fetch_byte_case (regexp);
3275 ch = re_string_fetch_byte (regexp);
3276 if (re_string_eoi(regexp))
3278 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3280 elem->opr.name[i] = ch;
3282 re_string_skip_bytes (regexp, 1);
3283 elem->opr.name[i] = '\0';
3284 switch (token->type)
3286 case OP_OPEN_COLL_ELEM:
3287 elem->type = COLL_SYM;
3289 case OP_OPEN_EQUIV_CLASS:
3290 elem->type = EQUIV_CLASS;
3292 case OP_OPEN_CHAR_CLASS:
3293 elem->type = CHAR_CLASS;
3301 /* Helper function for parse_bracket_exp.
3302 Build the equivalence class which is represented by NAME.
3303 The result are written to MBCSET and SBCSET.
3304 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3305 is a pointer argument sinse we may update it. */
3307 static reg_errcode_t
3308 build_equiv_class (re_bitset_ptr_t sbcset,
3309 #ifdef RE_ENABLE_I18N
3310 re_charset_t *mbcset, int *equiv_class_alloc,
3312 const unsigned char *name)
3315 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3318 const int32_t *table, *indirect;
3319 const unsigned char *weights, *extra, *cp;
3320 unsigned char char_buf[2];
3324 /* This #include defines a local function! */
3325 # include <locale/weight.h>
3326 /* Calculate the index for equivalence class. */
3328 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3329 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3330 _NL_COLLATE_WEIGHTMB);
3331 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3332 _NL_COLLATE_EXTRAMB);
3333 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3334 _NL_COLLATE_INDIRECTMB);
3335 idx1 = findidx (&cp);
3336 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3337 /* This isn't a valid character. */
3338 return REG_ECOLLATE;
3340 /* Build single byte matcing table for this equivalence class. */
3341 char_buf[1] = (unsigned char) '\0';
3342 len = weights[idx1];
3343 for (ch = 0; ch < SBC_MAX; ++ch)
3347 idx2 = findidx (&cp);
3352 /* This isn't a valid character. */
3354 if (len == weights[idx2])
3357 while (cnt <= len &&
3358 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3362 bitset_set (sbcset, ch);
3365 /* Check whether the array has enough space. */
3366 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3368 /* Not enough, realloc it. */
3369 /* +1 in case of mbcset->nequiv_classes is 0. */
3370 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3371 /* Use realloc since the array is NULL if *alloc == 0. */
3372 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3374 new_equiv_class_alloc);
3375 if (BE (new_equiv_classes == NULL, 0))
3377 mbcset->equiv_classes = new_equiv_classes;
3378 *equiv_class_alloc = new_equiv_class_alloc;
3380 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3385 if (BE (strlen ((const char *) name) != 1, 0))
3386 return REG_ECOLLATE;
3387 bitset_set (sbcset, *name);
3392 /* Helper function for parse_bracket_exp.
3393 Build the character class which is represented by NAME.
3394 The result are written to MBCSET and SBCSET.
3395 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3396 is a pointer argument sinse we may update it. */
3398 static reg_errcode_t
3399 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3400 #ifdef RE_ENABLE_I18N
3401 re_charset_t *mbcset, int *char_class_alloc,
3403 const unsigned char *class_name, reg_syntax_t syntax)
3406 const char *name = (const char *) class_name;
3408 /* In case of REG_ICASE "upper" and "lower" match the both of
3409 upper and lower cases. */
3410 if ((syntax & REG_IGNORE_CASE)
3411 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3414 #ifdef RE_ENABLE_I18N
3415 /* Check the space of the arrays. */
3416 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3418 /* Not enough, realloc it. */
3419 /* +1 in case of mbcset->nchar_classes is 0. */
3420 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3421 /* Use realloc since array is NULL if *alloc == 0. */
3422 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3423 new_char_class_alloc);
3424 if (BE (new_char_classes == NULL, 0))
3426 mbcset->char_classes = new_char_classes;
3427 *char_class_alloc = new_char_class_alloc;
3429 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3430 #endif /* RE_ENABLE_I18N */
3432 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3433 for (i = 0; i < SBC_MAX; ++i) \
3435 if (ctype_func (i)) \
3437 int ch = trans ? trans[i] : i; \
3438 bitset_set (sbcset, ch); \
3442 if (strcmp (name, "alnum") == 0)
3443 BUILD_CHARCLASS_LOOP (isalnum)
3444 else if (strcmp (name, "cntrl") == 0)
3445 BUILD_CHARCLASS_LOOP (iscntrl)
3446 else if (strcmp (name, "lower") == 0)
3447 BUILD_CHARCLASS_LOOP (islower)
3448 else if (strcmp (name, "space") == 0)
3449 BUILD_CHARCLASS_LOOP (isspace)
3450 else if (strcmp (name, "alpha") == 0)
3451 BUILD_CHARCLASS_LOOP (isalpha)
3452 else if (strcmp (name, "digit") == 0)
3453 BUILD_CHARCLASS_LOOP (isdigit)
3454 else if (strcmp (name, "print") == 0)
3455 BUILD_CHARCLASS_LOOP (isprint)
3456 else if (strcmp (name, "upper") == 0)
3457 BUILD_CHARCLASS_LOOP (isupper)
3458 else if (strcmp (name, "blank") == 0)
3459 BUILD_CHARCLASS_LOOP (isblank)
3460 else if (strcmp (name, "graph") == 0)
3461 BUILD_CHARCLASS_LOOP (isgraph)
3462 else if (strcmp (name, "punct") == 0)
3463 BUILD_CHARCLASS_LOOP (ispunct)
3464 else if (strcmp (name, "xdigit") == 0)
3465 BUILD_CHARCLASS_LOOP (isxdigit)
3473 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3474 const unsigned char *class_name,
3475 const unsigned char *extra,
3476 int non_match, reg_errcode_t *err)
3478 re_bitset_ptr_t sbcset;
3479 #ifdef RE_ENABLE_I18N
3480 re_charset_t *mbcset;
3482 #endif /* not RE_ENABLE_I18N */
3484 re_token_t br_token;
3487 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3488 #ifdef RE_ENABLE_I18N
3489 mbcset = re_calloc (re_charset_t, 1);
3490 #endif /* RE_ENABLE_I18N */
3492 #ifdef RE_ENABLE_I18N
3493 if (BE (sbcset == NULL || mbcset == NULL, 0))
3494 #else /* not RE_ENABLE_I18N */
3495 if (BE (sbcset == NULL, 0))
3496 #endif /* not RE_ENABLE_I18N */
3504 #ifdef RE_ENABLE_I18N
3506 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3507 bitset_set(cset->sbcset, '\0');
3509 mbcset->non_match = 1;
3510 #endif /* not RE_ENABLE_I18N */
3513 /* We don't care the syntax in this case. */
3514 ret = build_charclass (trans, sbcset,
3515 #ifdef RE_ENABLE_I18N
3517 #endif /* RE_ENABLE_I18N */
3520 if (BE (ret != REG_NOERROR, 0))
3523 #ifdef RE_ENABLE_I18N
3524 free_charset (mbcset);
3525 #endif /* RE_ENABLE_I18N */
3529 /* \w match '_' also. */
3530 for (; *extra; extra++)
3531 bitset_set (sbcset, *extra);
3533 /* If it is non-matching list. */
3535 bitset_not (sbcset);
3537 #ifdef RE_ENABLE_I18N
3538 /* Ensure only single byte characters are set. */
3539 if (dfa->mb_cur_max > 1)
3540 bitset_mask (sbcset, dfa->sb_char);
3543 /* Build a tree for simple bracket. */
3544 br_token.type = SIMPLE_BRACKET;
3545 br_token.opr.sbcset = sbcset;
3546 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3547 if (BE (tree == NULL, 0))
3548 goto build_word_op_espace;
3550 #ifdef RE_ENABLE_I18N
3551 if (dfa->mb_cur_max > 1)
3553 bin_tree_t *mbc_tree;
3554 /* Build a tree for complex bracket. */
3555 br_token.type = COMPLEX_BRACKET;
3556 br_token.opr.mbcset = mbcset;
3557 dfa->has_mb_node = 1;
3558 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3559 if (BE (mbc_tree == NULL, 0))
3560 goto build_word_op_espace;
3561 /* Then join them by ALT node. */
3562 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3563 if (BE (mbc_tree != NULL, 1))
3568 free_charset (mbcset);
3571 #else /* not RE_ENABLE_I18N */
3573 #endif /* not RE_ENABLE_I18N */
3575 build_word_op_espace:
3577 #ifdef RE_ENABLE_I18N
3578 free_charset (mbcset);
3579 #endif /* RE_ENABLE_I18N */
3584 /* This is intended for the expressions like "a{1,3}".
3585 Fetch a number from `input', and return the number.
3586 Return -1, if the number field is empty like "{,1}".
3587 Return -2, If an error is occured. */
3590 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3596 fetch_token (token, input, syntax);
3598 if (BE (token->type == END_OF_RE, 0))
3600 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3602 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3603 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3604 num = (num > REG_DUP_MAX) ? -2 : num;
3609 #ifdef RE_ENABLE_I18N
3611 free_charset (re_charset_t *cset)
3613 re_free (cset->mbchars);
3615 re_free (cset->coll_syms);
3616 re_free (cset->equiv_classes);
3617 re_free (cset->range_starts);
3618 re_free (cset->range_ends);
3620 re_free (cset->char_classes);
3623 #endif /* RE_ENABLE_I18N */
3625 /* Functions for binary tree operation. */
3627 /* Create a tree node. */
3630 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3631 re_token_type_t type)
3635 return create_token_tree (dfa, left, right, &t);
3639 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3640 const re_token_t *token)
3643 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3645 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3647 if (storage == NULL)
3649 storage->next = dfa->str_tree_storage;
3650 dfa->str_tree_storage = storage;
3651 dfa->str_tree_storage_idx = 0;
3653 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3655 tree->parent = NULL;
3657 tree->right = right;
3658 tree->token = *token;
3659 tree->token.duplicated = 0;
3660 tree->token.opt_subexp = 0;
3663 tree->node_idx = -1;
3666 left->parent = tree;
3668 right->parent = tree;
3672 /* Mark the tree SRC as an optional subexpression.
3673 To be called from preorder or postorder. */
3675 static reg_errcode_t
3676 mark_opt_subexp (void *extra, bin_tree_t *node)
3678 int idx = (int) (long) extra;
3679 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3680 node->token.opt_subexp = 1;
3685 /* Free the allocated memory inside NODE. */
3688 free_token (re_token_t *node)
3690 #ifdef RE_ENABLE_I18N
3691 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3692 free_charset (node->opr.mbcset);
3694 #endif /* RE_ENABLE_I18N */
3695 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3696 re_free (node->opr.sbcset);
3699 /* Worker function for tree walking. Free the allocated memory inside NODE
3700 and its children. */
3702 static reg_errcode_t
3703 free_tree (void *extra, bin_tree_t *node)
3705 free_token (&node->token);
3710 /* Duplicate the node SRC, and return new node. This is a preorder
3711 visit similar to the one implemented by the generic visitor, but
3712 we need more infrastructure to maintain two parallel trees --- so,
3713 it's easier to duplicate. */
3716 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3718 const bin_tree_t *node;
3719 bin_tree_t *dup_root;
3720 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3722 for (node = root; ; )
3724 /* Create a new tree and link it back to the current parent. */
3725 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3728 (*p_new)->parent = dup_node;
3729 (*p_new)->token.duplicated = 1;
3732 /* Go to the left node, or up and to the right. */
3736 p_new = &dup_node->left;
3740 const bin_tree_t *prev = NULL;
3741 while (node->right == prev || node->right == NULL)
3744 node = node->parent;
3745 dup_node = dup_node->parent;
3750 p_new = &dup_node->right;