1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
26 static void init_word_char (re_dfa_t *dfa);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
51 int top_clone_node, int root_node,
52 unsigned int constraint);
53 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
54 unsigned int constraint);
55 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
63 static void fetch_token (re_token_t *result, re_string_t *input,
65 static int peek_token (re_token_t *token, re_string_t *input,
67 static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
70 reg_syntax_t syntax, reg_errcode_t *err);
71 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 int nest, reg_errcode_t *err);
77 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
78 re_token_t *token, reg_syntax_t syntax,
79 int nest, reg_errcode_t *err);
80 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
81 re_token_t *token, reg_syntax_t syntax,
82 int nest, reg_errcode_t *err);
83 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
84 re_dfa_t *dfa, re_token_t *token,
85 reg_syntax_t syntax, reg_errcode_t *err);
86 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
87 re_token_t *token, reg_syntax_t syntax,
89 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_token_t *token, int token_len,
95 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
99 # ifdef RE_ENABLE_I18N
100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
101 re_charset_t *mbcset, int *range_alloc,
102 bracket_elem_t *start_elem,
103 bracket_elem_t *end_elem);
104 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
105 re_charset_t *mbcset,
107 const unsigned char *name);
108 # else /* not RE_ENABLE_I18N */
109 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
110 bracket_elem_t *start_elem,
111 bracket_elem_t *end_elem);
112 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
113 const unsigned char *name);
114 # endif /* not RE_ENABLE_I18N */
115 #endif /* not _LIBC */
116 #ifdef RE_ENABLE_I18N
117 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
118 re_charset_t *mbcset,
119 int *equiv_class_alloc,
120 const unsigned char *name);
121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
123 re_charset_t *mbcset,
124 int *char_class_alloc,
125 const unsigned char *class_name,
126 reg_syntax_t syntax);
127 #else /* not RE_ENABLE_I18N */
128 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
129 const unsigned char *name);
130 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
131 re_bitset_ptr_t sbcset,
132 const unsigned char *class_name,
133 reg_syntax_t syntax);
134 #endif /* not RE_ENABLE_I18N */
135 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
136 unsigned RE_TRANSLATE_TYPE trans,
137 const unsigned char *class_name,
138 const unsigned char *extra,
139 int non_match, reg_errcode_t *err);
140 static bin_tree_t *create_tree (re_dfa_t *dfa,
141 bin_tree_t *left, bin_tree_t *right,
142 re_token_type_t type);
143 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
144 bin_tree_t *left, bin_tree_t *right,
145 const re_token_t *token);
146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
147 static void free_token (re_token_t *node);
148 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
149 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151 /* This table gives an error message for each of the error codes listed
152 in regex.h. Obviously the order here has to be same as there.
153 POSIX doesn't require that we do anything for REG_NOERROR,
154 but why not be nice? */
156 const char __re_error_msgid[] attribute_hidden =
158 #define REG_NOERROR_IDX 0
159 gettext_noop ("Success") /* REG_NOERROR */
161 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
162 gettext_noop ("No match") /* REG_NOMATCH */
164 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
165 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
168 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
171 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
174 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
177 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
180 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
182 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
183 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
186 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
189 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
192 gettext_noop ("Invalid range end") /* REG_ERANGE */
194 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
195 gettext_noop ("Memory exhausted") /* REG_ESPACE */
197 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
198 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
201 gettext_noop ("Premature end of regular expression") /* REG_EEND */
203 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
204 gettext_noop ("Regular expression too big") /* REG_ESIZE */
206 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
207 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
210 const size_t __re_error_msgid_idx[] attribute_hidden =
231 /* Entry points for GNU code. */
233 /* re_compile_pattern is the GNU regular expression compiler: it
234 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
235 Returns 0 if the pattern was valid, otherwise an error string.
237 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
238 are set in BUFP on entry. */
241 re_compile_pattern (pattern, length, bufp)
244 struct re_pattern_buffer *bufp;
248 /* And GNU code determines whether or not to get register information
249 by passing null for the REGS argument to re_match, etc., not by
250 setting no_sub, unless RE_NO_SUB is set. */
251 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
253 /* Match anchors at newline. */
254 bufp->newline_anchor = 1;
256 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
260 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
263 weak_alias (__re_compile_pattern, re_compile_pattern)
266 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
267 also be assigned to arbitrarily: each pattern buffer stores its own
268 syntax, so it can be changed between regex compilations. */
269 /* This has no initializer because initialized variables in Emacs
270 become read-only after dumping. */
271 reg_syntax_t re_syntax_options;
274 /* Specify the precise syntax of regexps for compilation. This provides
275 for compatibility for various utilities which historically have
276 different, incompatible syntaxes.
278 The argument SYNTAX is a bit mask comprised of the various bits
279 defined in regex.h. We return the old syntax. */
282 re_set_syntax (syntax)
285 reg_syntax_t ret = re_syntax_options;
287 re_syntax_options = syntax;
291 weak_alias (__re_set_syntax, re_set_syntax)
295 re_compile_fastmap (bufp)
296 struct re_pattern_buffer *bufp;
298 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
299 char *fastmap = bufp->fastmap;
301 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
302 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
303 if (dfa->init_state != dfa->init_state_word)
304 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
305 if (dfa->init_state != dfa->init_state_nl)
306 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
307 if (dfa->init_state != dfa->init_state_begbuf)
308 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
309 bufp->fastmap_accurate = 1;
313 weak_alias (__re_compile_fastmap, re_compile_fastmap)
317 __attribute ((always_inline))
318 re_set_fastmap (char *fastmap, int icase, int ch)
322 fastmap[tolower (ch)] = 1;
325 /* Helper function for re_compile_fastmap.
326 Compile fastmap for the initial_state INIT_STATE. */
329 re_compile_fastmap_iter (bufp, init_state, fastmap)
331 const re_dfastate_t *init_state;
334 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
337 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
339 int node = init_state->nodes.elems[node_cnt];
340 re_token_type_t type = dfa->nodes[node].type;
342 if (type == CHARACTER)
344 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
345 #ifdef RE_ENABLE_I18N
346 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
348 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
353 *p++ = dfa->nodes[node].opr.c;
354 while (++node < dfa->nodes_len
355 && dfa->nodes[node].type == CHARACTER
356 && dfa->nodes[node].mb_partial)
357 *p++ = dfa->nodes[node].opr.c;
358 memset (&state, 0, sizeof (state));
359 if (mbrtowc (&wc, (const char *) buf, p - buf,
361 && (__wcrtomb ((char *) buf, towlower (wc), &state)
363 re_set_fastmap (fastmap, 0, buf[0]);
367 else if (type == SIMPLE_BRACKET)
370 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
371 for (j = 0; j < UINT_BITS; ++j, ++ch)
372 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
373 re_set_fastmap (fastmap, icase, ch);
375 #ifdef RE_ENABLE_I18N
376 else if (type == COMPLEX_BRACKET)
379 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
380 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
381 || cset->nranges || cset->nchar_classes)
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
393 const int32_t *table = (const int32_t *)
394 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
395 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
396 for (j = 0; j < UINT_BITS; ++j, ++ch)
398 re_set_fastmap (fastmap, icase, ch);
401 if (dfa->mb_cur_max > 1)
402 for (i = 0; i < SBC_MAX; ++i)
403 if (__btowc (i) == WEOF)
404 re_set_fastmap (fastmap, icase, i);
405 # endif /* not _LIBC */
407 for (i = 0; i < cset->nmbchars; ++i)
411 memset (&state, '\0', sizeof (state));
412 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *__restrict preg;
476 const char *__restrict pattern;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
529 weak_alias (__regcomp, regcomp)
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
536 regerror (errcode, preg, errbuf, errbuf_size)
546 || errcode >= (int) (sizeof (__re_error_msgid_idx)
547 / sizeof (__re_error_msgid_idx[0])), 0))
548 /* Only error codes returned by the rest of the code should be passed
549 to this routine. If we are given anything else, or if other regex
550 code generates an invalid error code, then the program has a bug.
551 Dump core so we can fix it. */
554 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
556 msg_size = strlen (msg) + 1; /* Includes the null. */
558 if (BE (errbuf_size != 0, 1))
560 if (BE (msg_size > errbuf_size, 0))
562 #if defined HAVE_MEMPCPY || defined _LIBC
563 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
565 memcpy (errbuf, msg, errbuf_size - 1);
566 errbuf[errbuf_size - 1] = 0;
570 memcpy (errbuf, msg, msg_size);
576 weak_alias (__regerror, regerror)
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset utf8_sb_map =
587 /* Set the first 128 bits. */
588 # if UINT_MAX == 0xffffffff
589 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
591 # error "Add case for new unsigned int size"
598 free_dfa_content (re_dfa_t *dfa)
603 for (i = 0; i < dfa->nodes_len; ++i)
604 free_token (dfa->nodes + i);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
626 re_dfastate_t *state = entry->array[j];
629 re_free (entry->array);
631 re_free (dfa->state_table);
632 #ifdef RE_ENABLE_I18N
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
636 re_free (dfa->subexp_map);
638 re_free (dfa->re_str);
645 /* Free dynamically allocated space used by PREG. */
651 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
652 if (BE (dfa != NULL, 1))
653 free_dfa_content (dfa);
657 re_free (preg->fastmap);
658 preg->fastmap = NULL;
660 re_free (preg->translate);
661 preg->translate = NULL;
664 weak_alias (__regfree, regfree)
667 /* Entry points compatible with 4.2 BSD regex library. We don't define
668 them unless specifically requested. */
670 #if defined _REGEX_RE_COMP || defined _LIBC
672 /* BSD has one and only one pattern buffer. */
673 static struct re_pattern_buffer re_comp_buf;
677 /* Make these definitions weak in libc, so POSIX programs can redefine
678 these names if they don't use our functions, and still use
679 regcomp/regexec above without link errors. */
690 if (!re_comp_buf.buffer)
691 return gettext ("No previous regular expression");
695 if (re_comp_buf.buffer)
697 fastmap = re_comp_buf.fastmap;
698 re_comp_buf.fastmap = NULL;
699 __regfree (&re_comp_buf);
700 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
701 re_comp_buf.fastmap = fastmap;
704 if (re_comp_buf.fastmap == NULL)
706 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
707 if (re_comp_buf.fastmap == NULL)
708 return (char *) gettext (__re_error_msgid
709 + __re_error_msgid_idx[(int) REG_ESPACE]);
712 /* Since `re_exec' always passes NULL for the `regs' argument, we
713 don't need to initialize the pattern buffer fields which affect it. */
715 /* Match anchors at newlines. */
716 re_comp_buf.newline_anchor = 1;
718 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
723 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
724 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
728 libc_freeres_fn (free_mem)
730 __regfree (&re_comp_buf);
734 #endif /* _REGEX_RE_COMP */
736 /* Internal entry point.
737 Compile the regular expression PATTERN, whose length is LENGTH.
738 SYNTAX indicate regular expression's syntax. */
741 re_compile_internal (preg, pattern, length, syntax)
743 const char * pattern;
747 reg_errcode_t err = REG_NOERROR;
751 /* Initialize the pattern buffer. */
752 preg->fastmap_accurate = 0;
753 preg->syntax = syntax;
754 preg->not_bol = preg->not_eol = 0;
757 preg->can_be_null = 0;
758 preg->regs_allocated = REGS_UNALLOCATED;
760 /* Initialize the dfa. */
761 dfa = (re_dfa_t *) preg->buffer;
762 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764 /* If zero allocated, but buffer is non-null, try to realloc
765 enough space. This loses if buffer's address is bogus, but
766 that is the user's responsibility. If ->buffer is NULL this
767 is a simple allocation. */
768 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
771 preg->allocated = sizeof (re_dfa_t);
772 preg->buffer = (unsigned char *) dfa;
774 preg->used = sizeof (re_dfa_t);
776 __libc_lock_init (dfa->lock);
778 err = init_dfa (dfa, length);
779 if (BE (err != REG_NOERROR, 0))
781 free_dfa_content (dfa);
787 dfa->re_str = re_malloc (char, length + 1);
788 strncpy (dfa->re_str, pattern, length + 1);
791 err = re_string_construct (®exp, pattern, length, preg->translate,
792 syntax & RE_ICASE, dfa);
793 if (BE (err != REG_NOERROR, 0))
795 re_compile_internal_free_return:
796 free_workarea_compile (preg);
797 re_string_destruct (®exp);
798 free_dfa_content (dfa);
804 /* Parse the regular expression, and build a structure tree. */
806 dfa->str_tree = parse (®exp, preg, syntax, &err);
807 if (BE (dfa->str_tree == NULL, 0))
808 goto re_compile_internal_free_return;
810 /* Analyze the tree and create the nfa. */
811 err = analyze (preg);
812 if (BE (err != REG_NOERROR, 0))
813 goto re_compile_internal_free_return;
815 #ifdef RE_ENABLE_I18N
816 /* If possible, do searching in single byte encoding to speed things up. */
817 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
821 /* Then create the initial state of the dfa. */
822 err = create_initial_state (dfa);
824 /* Release work areas. */
825 free_workarea_compile (preg);
826 re_string_destruct (®exp);
828 if (BE (err != REG_NOERROR, 0))
830 free_dfa_content (dfa);
838 /* Initialize DFA. We use the length of the regular expression PAT_LEN
839 as the initial length of some arrays. */
842 init_dfa (dfa, pat_len)
851 memset (dfa, '\0', sizeof (re_dfa_t));
853 /* Force allocation of str_tree_storage the first time. */
854 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
859 dfa->states_alloc = pat_len + 1;
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; table_size > 0; table_size <<= 1)
863 if (table_size > pat_len)
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
877 # ifdef HAVE_LANGINFO_CODESET
878 codeset_name = nl_langinfo (CODESET);
880 codeset_name = getenv ("LC_ALL");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LC_CTYPE");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LANG");
885 if (codeset_name == NULL)
887 else if (strchr (codeset_name, '.') != NULL)
888 codeset_name = strchr (codeset_name, '.') + 1;
891 if (strcasecmp (codeset_name, "UTF-8") == 0
892 || strcasecmp (codeset_name, "UTF8") == 0)
895 /* We check exhaustively in the loop below if this charset is a
896 superset of ASCII. */
897 dfa->map_notascii = 0;
900 #ifdef RE_ENABLE_I18N
901 if (dfa->mb_cur_max > 1)
904 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
909 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
910 if (BE (dfa->sb_char == NULL, 0))
913 /* Clear all bits by, then set those corresponding to single
915 bitset_empty (dfa->sb_char);
917 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
918 for (j = 0; j < UINT_BITS; ++j, ++ch)
920 wchar_t wch = __btowc (ch);
922 dfa->sb_char[i] |= 1 << j;
924 if (isascii (ch) && wch != (wchar_t) ch)
925 dfa->map_notascii = 1;
932 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
937 /* Initialize WORD_CHAR table, which indicate which character is
938 "word". In this case "word" means that it is the word construction
939 character used by some operators like "\<", "\>", etc. */
946 dfa->word_ops_used = 1;
947 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
948 for (j = 0; j < UINT_BITS; ++j, ++ch)
949 if (isalnum (ch) || ch == '_')
950 dfa->word_char[i] |= 1 << j;
953 /* Free the work area which are only used while compiling. */
956 free_workarea_compile (preg)
959 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
960 bin_tree_storage_t *storage, *next;
961 for (storage = dfa->str_tree_storage; storage; storage = next)
963 next = storage->next;
966 dfa->str_tree_storage = NULL;
967 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
968 dfa->str_tree = NULL;
969 re_free (dfa->org_indices);
970 dfa->org_indices = NULL;
973 /* Create initial states for all contexts. */
976 create_initial_state (dfa)
981 re_node_set init_nodes;
983 /* Initial states have the epsilon closure of the node which is
984 the first node of the regular expression. */
985 first = dfa->str_tree->first->node_idx;
986 dfa->init_node = first;
987 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
988 if (BE (err != REG_NOERROR, 0))
991 /* The back-references which are in initial states can epsilon transit,
992 since in this case all of the subexpressions can be null.
993 Then we add epsilon closures of the nodes which are the next nodes of
994 the back-references. */
995 if (dfa->nbackref > 0)
996 for (i = 0; i < init_nodes.nelem; ++i)
998 int node_idx = init_nodes.elems[i];
999 re_token_type_t type = dfa->nodes[node_idx].type;
1002 if (type != OP_BACK_REF)
1004 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1006 re_token_t *clexp_node;
1007 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1008 if (clexp_node->type == OP_CLOSE_SUBEXP
1009 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1012 if (clexp_idx == init_nodes.nelem)
1015 if (type == OP_BACK_REF)
1017 int dest_idx = dfa->edests[node_idx].elems[0];
1018 if (!re_node_set_contains (&init_nodes, dest_idx))
1020 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026 /* It must be the first time to invoke acquire_state. */
1027 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1028 /* We don't check ERR here, since the initial state must not be NULL. */
1029 if (BE (dfa->init_state == NULL, 0))
1031 if (dfa->init_state->has_constraint)
1033 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1035 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1037 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1041 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1042 || dfa->init_state_begbuf == NULL, 0))
1046 dfa->init_state_word = dfa->init_state_nl
1047 = dfa->init_state_begbuf = dfa->init_state;
1049 re_node_set_free (&init_nodes);
1053 #ifdef RE_ENABLE_I18N
1054 /* If it is possible to do searching in single byte encoding instead of UTF-8
1055 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1056 DFA nodes where needed. */
1062 int node, i, mb_chars = 0, has_period = 0;
1064 for (node = 0; node < dfa->nodes_len; ++node)
1065 switch (dfa->nodes[node].type)
1068 if (dfa->nodes[node].opr.c >= 0x80)
1072 switch (dfa->nodes[node].opr.idx)
1080 /* Word anchors etc. cannot be handled. */
1090 case OP_DUP_ASTERISK:
1091 case OP_OPEN_SUBEXP:
1092 case OP_CLOSE_SUBEXP:
1094 case COMPLEX_BRACKET:
1096 case SIMPLE_BRACKET:
1097 /* Just double check. */
1098 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1099 if (dfa->nodes[node].opr.sbcset[i])
1106 if (mb_chars || has_period)
1107 for (node = 0; node < dfa->nodes_len; ++node)
1109 if (dfa->nodes[node].type == CHARACTER
1110 && dfa->nodes[node].opr.c >= 0x80)
1111 dfa->nodes[node].mb_partial = 0;
1112 else if (dfa->nodes[node].type == OP_PERIOD)
1113 dfa->nodes[node].type = OP_UTF8_PERIOD;
1116 /* The search can be in single byte locale. */
1117 dfa->mb_cur_max = 1;
1119 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1123 /* Analyze the structure tree, and calculate "first", "next", "edest",
1124 "eclosure", and "inveclosure". */
1126 static reg_errcode_t
1130 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1133 /* Allocate arrays. */
1134 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1135 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1136 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1137 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1138 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1139 || dfa->eclosures == NULL, 0))
1142 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1143 if (dfa->subexp_map != NULL)
1146 for (i = 0; i < preg->re_nsub; i++)
1147 dfa->subexp_map[i] = i;
1148 preorder (dfa->str_tree, optimize_subexps, dfa);
1149 for (i = 0; i < preg->re_nsub; i++)
1150 if (dfa->subexp_map[i] != i)
1152 if (i == preg->re_nsub)
1154 free (dfa->subexp_map);
1155 dfa->subexp_map = NULL;
1159 ret = postorder (dfa->str_tree, lower_subexps, preg);
1160 if (BE (ret != REG_NOERROR, 0))
1162 ret = postorder (dfa->str_tree, calc_first, dfa);
1163 if (BE (ret != REG_NOERROR, 0))
1165 preorder (dfa->str_tree, calc_next, dfa);
1166 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1167 if (BE (ret != REG_NOERROR, 0))
1169 ret = calc_eclosure (dfa);
1170 if (BE (ret != REG_NOERROR, 0))
1173 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1174 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1175 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1178 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1179 if (BE (dfa->inveclosures == NULL, 0))
1181 ret = calc_inveclosure (dfa);
1187 /* Our parse trees are very unbalanced, so we cannot use a stack to
1188 implement parse tree visits. Instead, we use parent pointers and
1189 some hairy code in these two functions. */
1190 static reg_errcode_t
1191 postorder (root, fn, extra)
1193 reg_errcode_t (fn (void *, bin_tree_t *));
1196 bin_tree_t *node, *prev;
1198 for (node = root; ; )
1200 /* Descend down the tree, preferably to the left (or to the right
1201 if that's the only child). */
1202 while (node->left || node->right)
1210 reg_errcode_t err = fn (extra, node);
1211 if (BE (err != REG_NOERROR, 0))
1213 if (node->parent == NULL)
1216 node = node->parent;
1218 /* Go up while we have a node that is reached from the right. */
1219 while (node->right == prev || node->right == NULL);
1224 static reg_errcode_t
1225 preorder (root, fn, extra)
1227 reg_errcode_t (fn (void *, bin_tree_t *));
1232 for (node = root; ; )
1234 reg_errcode_t err = fn (extra, node);
1235 if (BE (err != REG_NOERROR, 0))
1238 /* Go to the left node, or up and to the right. */
1243 bin_tree_t *prev = NULL;
1244 while (node->right == prev || node->right == NULL)
1247 node = node->parent;
1256 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1257 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1258 backreferences as well. Requires a preorder visit. */
1259 static reg_errcode_t
1260 optimize_subexps (extra, node)
1264 re_dfa_t *dfa = (re_dfa_t *) extra;
1266 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1268 int idx = node->token.opr.idx;
1269 node->token.opr.idx = dfa->subexp_map[idx];
1270 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1273 else if (node->token.type == SUBEXP
1274 && node->left && node->left->token.type == SUBEXP)
1276 int other_idx = node->left->token.opr.idx;
1278 node->left = node->left->left;
1280 node->left->parent = node;
1282 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1283 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1284 dfa->used_bkref_map &= ~(1 << other_idx);
1290 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1291 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1292 static reg_errcode_t
1293 lower_subexps (extra, node)
1297 regex_t *preg = (regex_t *) extra;
1298 reg_errcode_t err = REG_NOERROR;
1300 if (node->left && node->left->token.type == SUBEXP)
1302 node->left = lower_subexp (&err, preg, node->left);
1304 node->left->parent = node;
1306 if (node->right && node->right->token.type == SUBEXP)
1308 node->right = lower_subexp (&err, preg, node->right);
1310 node->right->parent = node;
1317 lower_subexp (err, preg, node)
1322 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323 bin_tree_t *body = node->left;
1324 bin_tree_t *op, *cls, *tree1, *tree;
1327 /* We do not optimize empty subexpressions, because otherwise we may
1328 have bad CONCAT nodes with NULL children. This is obviously not
1329 very common, so we do not lose much. An example that triggers
1330 this case is the sed "script" /\(\)/x. */
1331 && node->left != NULL
1332 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1333 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1336 /* Convert the SUBEXP node to the concatenation of an
1337 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1338 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341 tree = create_tree (dfa, op, tree1, CONCAT);
1342 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1353 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354 nodes. Requires a postorder visit. */
1355 static reg_errcode_t
1356 calc_first (extra, node)
1360 re_dfa_t *dfa = (re_dfa_t *) extra;
1361 if (node->token.type == CONCAT)
1363 node->first = node->left->first;
1364 node->node_idx = node->left->node_idx;
1369 node->node_idx = re_dfa_add_node (dfa, node->token);
1370 if (BE (node->node_idx == -1, 0))
1376 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1377 static reg_errcode_t
1378 calc_next (extra, node)
1382 switch (node->token.type)
1384 case OP_DUP_ASTERISK:
1385 node->left->next = node;
1388 node->left->next = node->right->first;
1389 node->right->next = node->next;
1393 node->left->next = node->next;
1395 node->right->next = node->next;
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1402 static reg_errcode_t
1403 link_nfa_nodes (extra, node)
1407 re_dfa_t *dfa = (re_dfa_t *) extra;
1408 int idx = node->node_idx;
1409 reg_errcode_t err = REG_NOERROR;
1411 switch (node->token.type)
1417 assert (node->next == NULL);
1420 case OP_DUP_ASTERISK:
1424 dfa->has_plural_match = 1;
1425 if (node->left != NULL)
1426 left = node->left->first->node_idx;
1428 left = node->next->node_idx;
1429 if (node->right != NULL)
1430 right = node->right->first->node_idx;
1432 right = node->next->node_idx;
1434 assert (right > -1);
1435 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440 case OP_OPEN_SUBEXP:
1441 case OP_CLOSE_SUBEXP:
1442 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446 dfa->nexts[idx] = node->next->node_idx;
1447 if (node->token.type == OP_BACK_REF)
1448 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452 assert (!IS_EPSILON_NODE (node->token.type));
1453 dfa->nexts[idx] = node->next->node_idx;
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462 to their own constraint. */
1464 static reg_errcode_t
1465 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1468 int top_org_node, top_clone_node, root_node;
1469 unsigned int init_constraint;
1472 int org_node, clone_node, ret;
1473 unsigned int constraint = init_constraint;
1474 for (org_node = top_org_node, clone_node = top_clone_node;;)
1476 int org_dest, clone_dest;
1477 if (dfa->nodes[org_node].type == OP_BACK_REF)
1479 /* If the back reference epsilon-transit, its destination must
1480 also have the constraint. Then duplicate the epsilon closure
1481 of the destination of the back reference, and store it in
1482 edests of the back reference. */
1483 org_dest = dfa->nexts[org_node];
1484 re_node_set_empty (dfa->edests + clone_node);
1485 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1486 if (BE (err != REG_NOERROR, 0))
1488 dfa->nexts[clone_node] = dfa->nexts[org_node];
1489 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1490 if (BE (ret < 0, 0))
1493 else if (dfa->edests[org_node].nelem == 0)
1495 /* In case of the node can't epsilon-transit, don't duplicate the
1496 destination and store the original destination as the
1497 destination of the node. */
1498 dfa->nexts[clone_node] = dfa->nexts[org_node];
1501 else if (dfa->edests[org_node].nelem == 1)
1503 /* In case of the node can epsilon-transit, and it has only one
1505 org_dest = dfa->edests[org_node].elems[0];
1506 re_node_set_empty (dfa->edests + clone_node);
1507 if (dfa->nodes[org_node].type == ANCHOR)
1509 /* In case of the node has another constraint, append it. */
1510 if (org_node == root_node && clone_node != org_node)
1512 /* ...but if the node is root_node itself, it means the
1513 epsilon closure have a loop, then tie it to the
1514 destination of the root_node. */
1515 ret = re_node_set_insert (dfa->edests + clone_node,
1517 if (BE (ret < 0, 0))
1521 constraint |= dfa->nodes[org_node].opr.ctx_type;
1523 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1524 if (BE (err != REG_NOERROR, 0))
1526 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 if (BE (ret < 0, 0))
1530 else /* dfa->edests[org_node].nelem == 2 */
1532 /* In case of the node can epsilon-transit, and it has two
1533 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
1536 /* Search for a duplicated node which satisfies the constraint. */
1537 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1538 if (clone_dest == -1)
1540 /* There are no such a duplicated node, create a new one. */
1541 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1542 if (BE (err != REG_NOERROR, 0))
1544 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545 if (BE (ret < 0, 0))
1547 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548 root_node, constraint);
1549 if (BE (err != REG_NOERROR, 0))
1554 /* There are a duplicated node which satisfy the constraint,
1555 use it to avoid infinite loop. */
1556 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 if (BE (ret < 0, 0))
1561 org_dest = dfa->edests[org_node].elems[1];
1562 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1563 if (BE (err != REG_NOERROR, 0))
1565 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 if (BE (ret < 0, 0))
1569 org_node = org_dest;
1570 clone_node = clone_dest;
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576 satisfies the constraint CONSTRAINT. */
1579 search_duplicated_node (dfa, org_node, constraint)
1582 unsigned int constraint;
1585 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 if (org_node == dfa->org_indices[idx]
1588 && constraint == dfa->nodes[idx].constraint)
1589 return idx; /* Found. */
1591 return -1; /* Not found. */
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1596 otherwise return the error code. */
1598 static reg_errcode_t
1599 duplicate_node (new_idx, dfa, org_idx, constraint)
1601 int *new_idx, org_idx;
1602 unsigned int constraint;
1604 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1605 if (BE (dup_idx == -1, 0))
1607 dfa->nodes[dup_idx].constraint = constraint;
1608 if (dfa->nodes[org_idx].type == ANCHOR)
1609 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1610 dfa->nodes[dup_idx].duplicated = 1;
1612 /* Store the index of the original node. */
1613 dfa->org_indices[dup_idx] = org_idx;
1618 static reg_errcode_t
1619 calc_inveclosure (dfa)
1623 for (idx = 0; idx < dfa->nodes_len; ++idx)
1624 re_node_set_init_empty (dfa->inveclosures + idx);
1626 for (src = 0; src < dfa->nodes_len; ++src)
1628 int *elems = dfa->eclosures[src].elems;
1629 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1631 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632 if (BE (ret == -1, 0))
1640 /* Calculate "eclosure" for all the node in DFA. */
1642 static reg_errcode_t
1646 int node_idx, incomplete;
1648 assert (dfa->nodes_len > 0);
1651 /* For each nodes, calculate epsilon closure. */
1652 for (node_idx = 0; ; ++node_idx)
1655 re_node_set eclosure_elem;
1656 if (node_idx == dfa->nodes_len)
1665 assert (dfa->eclosures[node_idx].nelem != -1);
1668 /* If we have already calculated, skip it. */
1669 if (dfa->eclosures[node_idx].nelem != 0)
1671 /* Calculate epsilon closure of `node_idx'. */
1672 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1673 if (BE (err != REG_NOERROR, 0))
1676 if (dfa->eclosures[node_idx].nelem == 0)
1679 re_node_set_free (&eclosure_elem);
1685 /* Calculate epsilon closure of NODE. */
1687 static reg_errcode_t
1688 calc_eclosure_iter (new_set, dfa, node, root)
1689 re_node_set *new_set;
1694 unsigned int constraint;
1696 re_node_set eclosure;
1698 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1699 if (BE (err != REG_NOERROR, 0))
1702 /* This indicates that we are calculating this node now.
1703 We reference this value to avoid infinite loop. */
1704 dfa->eclosures[node].nelem = -1;
1706 constraint = ((dfa->nodes[node].type == ANCHOR)
1707 ? dfa->nodes[node].opr.ctx_type : 0);
1708 /* If the current node has constraints, duplicate all nodes.
1709 Since they must inherit the constraints. */
1711 && dfa->edests[node].nelem
1712 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1714 int org_node, cur_node;
1715 org_node = cur_node = node;
1716 err = duplicate_node_closure (dfa, node, node, node, constraint);
1717 if (BE (err != REG_NOERROR, 0))
1721 /* Expand each epsilon destination nodes. */
1722 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1723 for (i = 0; i < dfa->edests[node].nelem; ++i)
1725 re_node_set eclosure_elem;
1726 int edest = dfa->edests[node].elems[i];
1727 /* If calculating the epsilon closure of `edest' is in progress,
1728 return intermediate result. */
1729 if (dfa->eclosures[edest].nelem == -1)
1734 /* If we haven't calculated the epsilon closure of `edest' yet,
1735 calculate now. Otherwise use calculated epsilon closure. */
1736 if (dfa->eclosures[edest].nelem == 0)
1738 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1739 if (BE (err != REG_NOERROR, 0))
1743 eclosure_elem = dfa->eclosures[edest];
1744 /* Merge the epsilon closure of `edest'. */
1745 re_node_set_merge (&eclosure, &eclosure_elem);
1746 /* If the epsilon closure of `edest' is incomplete,
1747 the epsilon closure of this node is also incomplete. */
1748 if (dfa->eclosures[edest].nelem == 0)
1751 re_node_set_free (&eclosure_elem);
1755 /* Epsilon closures include itself. */
1756 re_node_set_insert (&eclosure, node);
1757 if (incomplete && !root)
1758 dfa->eclosures[node].nelem = 0;
1760 dfa->eclosures[node] = eclosure;
1761 *new_set = eclosure;
1765 /* Functions for token which are used in the parser. */
1767 /* Fetch a token from INPUT.
1768 We must not use this function inside bracket expressions. */
1771 fetch_token (result, input, syntax)
1774 reg_syntax_t syntax;
1776 re_string_skip_bytes (input, peek_token (result, input, syntax));
1779 /* Peek a token from INPUT, and return the length of the token.
1780 We must not use this function inside bracket expressions. */
1783 peek_token (token, input, syntax)
1786 reg_syntax_t syntax;
1790 if (re_string_eoi (input))
1792 token->type = END_OF_RE;
1796 c = re_string_peek_byte (input, 0);
1799 token->word_char = 0;
1800 #ifdef RE_ENABLE_I18N
1801 token->mb_partial = 0;
1802 if (input->mb_cur_max > 1 &&
1803 !re_string_first_byte (input, re_string_cur_idx (input)))
1805 token->type = CHARACTER;
1806 token->mb_partial = 1;
1813 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1815 token->type = BACK_SLASH;
1819 c2 = re_string_peek_byte_case (input, 1);
1821 token->type = CHARACTER;
1822 #ifdef RE_ENABLE_I18N
1823 if (input->mb_cur_max > 1)
1825 wint_t wc = re_string_wchar_at (input,
1826 re_string_cur_idx (input) + 1);
1827 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1831 token->word_char = IS_WORD_CHAR (c2) != 0;
1836 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1837 token->type = OP_ALT;
1839 case '1': case '2': case '3': case '4': case '5':
1840 case '6': case '7': case '8': case '9':
1841 if (!(syntax & RE_NO_BK_REFS))
1843 token->type = OP_BACK_REF;
1844 token->opr.idx = c2 - '1';
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_FIRST;
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = WORD_LAST;
1862 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = ANCHOR;
1865 token->opr.ctx_type = WORD_DELIM;
1869 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = ANCHOR;
1872 token->opr.ctx_type = NOT_WORD_DELIM;
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_WORD;
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_NOTWORD;
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_SPACE;
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = OP_NOTSPACE;
1892 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = BUF_FIRST;
1899 if (!(syntax & RE_NO_GNU_OPS))
1901 token->type = ANCHOR;
1902 token->opr.ctx_type = BUF_LAST;
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_OPEN_SUBEXP;
1910 if (!(syntax & RE_NO_BK_PARENS))
1911 token->type = OP_CLOSE_SUBEXP;
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_PLUS;
1918 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1919 token->type = OP_DUP_QUESTION;
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_OPEN_DUP_NUM;
1926 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1927 token->type = OP_CLOSE_DUP_NUM;
1935 token->type = CHARACTER;
1936 #ifdef RE_ENABLE_I18N
1937 if (input->mb_cur_max > 1)
1939 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1940 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1944 token->word_char = IS_WORD_CHAR (token->opr.c);
1949 if (syntax & RE_NEWLINE_ALT)
1950 token->type = OP_ALT;
1953 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1954 token->type = OP_ALT;
1957 token->type = OP_DUP_ASTERISK;
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_PLUS;
1964 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1965 token->type = OP_DUP_QUESTION;
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969 token->type = OP_OPEN_DUP_NUM;
1972 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1973 token->type = OP_CLOSE_DUP_NUM;
1976 if (syntax & RE_NO_BK_PARENS)
1977 token->type = OP_OPEN_SUBEXP;
1980 if (syntax & RE_NO_BK_PARENS)
1981 token->type = OP_CLOSE_SUBEXP;
1984 token->type = OP_OPEN_BRACKET;
1987 token->type = OP_PERIOD;
1990 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1991 re_string_cur_idx (input) != 0)
1993 char prev = re_string_peek_byte (input, -1);
1994 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1997 token->type = ANCHOR;
1998 token->opr.ctx_type = LINE_FIRST;
2001 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2002 re_string_cur_idx (input) + 1 != re_string_length (input))
2005 re_string_skip_bytes (input, 1);
2006 peek_token (&next, input, syntax);
2007 re_string_skip_bytes (input, -1);
2008 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2011 token->type = ANCHOR;
2012 token->opr.ctx_type = LINE_LAST;
2020 /* Peek a token from INPUT, and return the length of the token.
2021 We must not use this function out of bracket expressions. */
2024 peek_token_bracket (token, input, syntax)
2027 reg_syntax_t syntax;
2030 if (re_string_eoi (input))
2032 token->type = END_OF_RE;
2035 c = re_string_peek_byte (input, 0);
2038 #ifdef RE_ENABLE_I18N
2039 if (input->mb_cur_max > 1 &&
2040 !re_string_first_byte (input, re_string_cur_idx (input)))
2042 token->type = CHARACTER;
2045 #endif /* RE_ENABLE_I18N */
2047 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2048 && re_string_cur_idx (input) + 1 < re_string_length (input))
2050 /* In this case, '\' escape a character. */
2052 re_string_skip_bytes (input, 1);
2053 c2 = re_string_peek_byte (input, 0);
2055 token->type = CHARACTER;
2058 if (c == '[') /* '[' is a special char in a bracket exps. */
2062 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2063 c2 = re_string_peek_byte (input, 1);
2071 token->type = OP_OPEN_COLL_ELEM;
2074 token->type = OP_OPEN_EQUIV_CLASS;
2077 if (syntax & RE_CHAR_CLASSES)
2079 token->type = OP_OPEN_CHAR_CLASS;
2082 /* else fall through. */
2084 token->type = CHARACTER;
2094 token->type = OP_CHARSET_RANGE;
2097 token->type = OP_CLOSE_BRACKET;
2100 token->type = OP_NON_MATCH_LIST;
2103 token->type = CHARACTER;
2108 /* Functions for parser. */
2110 /* Entry point of the parser.
2111 Parse the regular expression REGEXP and return the structure tree.
2112 If an error is occured, ERR is set by error code, and return NULL.
2113 This function build the following tree, from regular expression <reg_exp>:
2119 CAT means concatenation.
2120 EOR means end of regular expression. */
2123 parse (regexp, preg, syntax, err)
2124 re_string_t *regexp;
2126 reg_syntax_t syntax;
2129 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130 bin_tree_t *tree, *eor, *root;
2131 re_token_t current_token;
2132 dfa->syntax = syntax;
2133 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2134 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2135 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2137 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2139 root = create_tree (dfa, tree, eor, CONCAT);
2142 if (BE (eor == NULL || root == NULL, 0))
2150 /* This function build the following tree, from regular expression
2151 <branch1>|<branch2>:
2157 ALT means alternative, which represents the operator `|'. */
2160 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2161 re_string_t *regexp;
2164 reg_syntax_t syntax;
2168 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2169 bin_tree_t *tree, *branch = NULL;
2170 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2171 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2174 while (token->type == OP_ALT)
2176 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2177 if (token->type != OP_ALT && token->type != END_OF_RE
2178 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2180 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2181 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2186 tree = create_tree (dfa, tree, branch, OP_ALT);
2187 if (BE (tree == NULL, 0))
2196 /* This function build the following tree, from regular expression
2203 CAT means concatenation. */
2206 parse_branch (regexp, preg, token, syntax, nest, err)
2207 re_string_t *regexp;
2210 reg_syntax_t syntax;
2214 bin_tree_t *tree, *exp;
2215 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2216 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2217 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2220 while (token->type != OP_ALT && token->type != END_OF_RE
2221 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2223 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2224 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2228 if (tree != NULL && exp != NULL)
2230 tree = create_tree (dfa, tree, exp, CONCAT);
2237 else if (tree == NULL)
2239 /* Otherwise exp == NULL, we don't need to create new tree. */
2244 /* This function build the following tree, from regular expression a*:
2251 parse_expression (regexp, preg, token, syntax, nest, err)
2252 re_string_t *regexp;
2255 reg_syntax_t syntax;
2259 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2261 switch (token->type)
2264 tree = create_token_tree (dfa, NULL, NULL, token);
2265 if (BE (tree == NULL, 0))
2270 #ifdef RE_ENABLE_I18N
2271 if (dfa->mb_cur_max > 1)
2273 while (!re_string_eoi (regexp)
2274 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2276 bin_tree_t *mbc_remain;
2277 fetch_token (token, regexp, syntax);
2278 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2279 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2280 if (BE (mbc_remain == NULL || tree == NULL, 0))
2289 case OP_OPEN_SUBEXP:
2290 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2291 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2294 case OP_OPEN_BRACKET:
2295 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2296 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2305 dfa->used_bkref_map |= 1 << token->opr.idx;
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (BE (tree == NULL, 0))
2313 dfa->has_mb_node = 1;
2315 case OP_OPEN_DUP_NUM:
2316 if (syntax & RE_CONTEXT_INVALID_DUP)
2322 case OP_DUP_ASTERISK:
2324 case OP_DUP_QUESTION:
2325 if (syntax & RE_CONTEXT_INVALID_OPS)
2330 else if (syntax & RE_CONTEXT_INDEP_OPS)
2332 fetch_token (token, regexp, syntax);
2333 return parse_expression (regexp, preg, token, syntax, nest, err);
2335 /* else fall through */
2336 case OP_CLOSE_SUBEXP:
2337 if ((token->type == OP_CLOSE_SUBEXP) &&
2338 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2343 /* else fall through */
2344 case OP_CLOSE_DUP_NUM:
2345 /* We treat it as a normal character. */
2347 /* Then we can these characters as normal characters. */
2348 token->type = CHARACTER;
2349 /* mb_partial and word_char bits should be initialized already
2351 tree = create_token_tree (dfa, NULL, NULL, token);
2352 if (BE (tree == NULL, 0))
2359 if ((token->opr.ctx_type
2360 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2361 && dfa->word_ops_used == 0)
2362 init_word_char (dfa);
2363 if (token->opr.ctx_type == WORD_DELIM
2364 || token->opr.ctx_type == NOT_WORD_DELIM)
2366 bin_tree_t *tree_first, *tree_last;
2367 if (token->opr.ctx_type == WORD_DELIM)
2369 token->opr.ctx_type = WORD_FIRST;
2370 tree_first = create_token_tree (dfa, NULL, NULL, token);
2371 token->opr.ctx_type = WORD_LAST;
2375 token->opr.ctx_type = INSIDE_WORD;
2376 tree_first = create_token_tree (dfa, NULL, NULL, token);
2377 token->opr.ctx_type = INSIDE_NOTWORD;
2379 tree_last = create_token_tree (dfa, NULL, NULL, token);
2380 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2381 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2389 tree = create_token_tree (dfa, NULL, NULL, token);
2390 if (BE (tree == NULL, 0))
2396 /* We must return here, since ANCHORs can't be followed
2397 by repetition operators.
2398 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2399 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2400 fetch_token (token, regexp, syntax);
2403 tree = create_token_tree (dfa, NULL, NULL, token);
2404 if (BE (tree == NULL, 0))
2409 if (dfa->mb_cur_max > 1)
2410 dfa->has_mb_node = 1;
2414 tree = build_charclass_op (dfa, regexp->trans,
2415 (const unsigned char *) "alnum",
2416 (const unsigned char *) "_",
2417 token->type == OP_NOTWORD, err);
2418 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423 tree = build_charclass_op (dfa, regexp->trans,
2424 (const unsigned char *) "space",
2425 (const unsigned char *) "",
2426 token->type == OP_NOTSPACE, err);
2427 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2437 /* Must not happen? */
2443 fetch_token (token, regexp, syntax);
2445 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2446 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2448 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2449 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2451 /* In BRE consecutive duplications are not allowed. */
2452 if ((syntax & RE_CONTEXT_INVALID_DUP)
2453 && (token->type == OP_DUP_ASTERISK
2454 || token->type == OP_OPEN_DUP_NUM))
2464 /* This function build the following tree, from regular expression
2472 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2473 re_string_t *regexp;
2476 reg_syntax_t syntax;
2480 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2483 cur_nsub = preg->re_nsub++;
2485 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2487 /* The subexpression may be a null string. */
2488 if (token->type == OP_CLOSE_SUBEXP)
2492 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2493 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2495 if (BE (*err != REG_NOERROR, 0))
2498 dfa->completed_bkref_map |= 1 << cur_nsub;
2500 tree = create_tree (dfa, tree, NULL, SUBEXP);
2501 if (BE (tree == NULL, 0))
2506 tree->token.opr.idx = cur_nsub;
2510 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2513 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2515 re_string_t *regexp;
2518 reg_syntax_t syntax;
2521 bin_tree_t *tree = NULL, *old_tree = NULL;
2522 int i, start, end, start_idx = re_string_cur_idx (regexp);
2523 re_token_t start_token = *token;
2525 if (token->type == OP_OPEN_DUP_NUM)
2528 start = fetch_number (regexp, token, syntax);
2531 if (token->type == CHARACTER && token->opr.c == ',')
2532 start = 0; /* We treat "{,m}" as "{0,m}". */
2535 *err = REG_BADBR; /* <re>{} is invalid. */
2539 if (BE (start != -2, 1))
2541 /* We treat "{n}" as "{n,n}". */
2542 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2543 : ((token->type == CHARACTER && token->opr.c == ',')
2544 ? fetch_number (regexp, token, syntax) : -2));
2546 if (BE (start == -2 || end == -2, 0))
2548 /* Invalid sequence. */
2549 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2551 if (token->type == END_OF_RE)
2559 /* If the syntax bit is set, rollback. */
2560 re_string_set_index (regexp, start_idx);
2561 *token = start_token;
2562 token->type = CHARACTER;
2563 /* mb_partial and word_char bits should be already initialized by
2568 if (BE (end != -1 && start > end, 0))
2570 /* First number greater than second. */
2577 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2578 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2581 fetch_token (token, regexp, syntax);
2583 if (BE (elem == NULL, 0))
2585 if (BE (start == 0 && end == 0, 0))
2587 postorder (elem, free_tree, NULL);
2591 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2592 if (BE (start > 0, 0))
2595 for (i = 2; i <= start; ++i)
2597 elem = duplicate_tree (elem, dfa);
2598 tree = create_tree (dfa, tree, elem, CONCAT);
2599 if (BE (elem == NULL || tree == NULL, 0))
2600 goto parse_dup_op_espace;
2606 /* Duplicate ELEM before it is marked optional. */
2607 elem = duplicate_tree (elem, dfa);
2613 if (elem->token.type == SUBEXP)
2614 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2616 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2617 if (BE (tree == NULL, 0))
2618 goto parse_dup_op_espace;
2620 /* This loop is actually executed only when end != -1,
2621 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2622 already created the start+1-th copy. */
2623 for (i = start + 2; i <= end; ++i)
2625 elem = duplicate_tree (elem, dfa);
2626 tree = create_tree (dfa, tree, elem, CONCAT);
2627 if (BE (elem == NULL || tree == NULL, 0))
2628 goto parse_dup_op_espace;
2630 tree = create_tree (dfa, tree, NULL, OP_ALT);
2631 if (BE (tree == NULL, 0))
2632 goto parse_dup_op_espace;
2636 tree = create_tree (dfa, old_tree, tree, CONCAT);
2640 parse_dup_op_espace:
2645 /* Size of the names for collating symbol/equivalence_class/character_class.
2646 I'm not sure, but maybe enough. */
2647 #define BRACKET_NAME_BUF_SIZE 32
2650 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2651 Build the range expression which starts from START_ELEM, and ends
2652 at END_ELEM. The result are written to MBCSET and SBCSET.
2653 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2654 mbcset->range_ends, is a pointer argument sinse we may
2657 static reg_errcode_t
2658 # ifdef RE_ENABLE_I18N
2659 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2660 re_charset_t *mbcset;
2662 # else /* not RE_ENABLE_I18N */
2663 build_range_exp (sbcset, start_elem, end_elem)
2664 # endif /* not RE_ENABLE_I18N */
2665 re_bitset_ptr_t sbcset;
2666 bracket_elem_t *start_elem, *end_elem;
2668 unsigned int start_ch, end_ch;
2669 /* Equivalence Classes and Character Classes can't be a range start/end. */
2670 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2671 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2675 /* We can handle no multi character collating elements without libc
2677 if (BE ((start_elem->type == COLL_SYM
2678 && strlen ((char *) start_elem->opr.name) > 1)
2679 || (end_elem->type == COLL_SYM
2680 && strlen ((char *) end_elem->opr.name) > 1), 0))
2681 return REG_ECOLLATE;
2683 # ifdef RE_ENABLE_I18N
2685 wchar_t wc, start_wc, end_wc;
2686 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2688 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2689 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2691 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2692 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2694 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2695 ? __btowc (start_ch) : start_elem->opr.wch);
2696 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2697 ? __btowc (end_ch) : end_elem->opr.wch);
2698 if (start_wc == WEOF || end_wc == WEOF)
2699 return REG_ECOLLATE;
2700 cmp_buf[0] = start_wc;
2701 cmp_buf[4] = end_wc;
2702 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2705 /* Got valid collation sequence values, add them as a new entry.
2706 However, for !_LIBC we have no collation elements: if the
2707 character set is single byte, the single byte character set
2708 that we build below suffices. parse_bracket_exp passes
2709 no MBCSET if dfa->mb_cur_max == 1. */
2712 /* Check the space of the arrays. */
2713 if (BE (*range_alloc == mbcset->nranges, 0))
2715 /* There is not enough space, need realloc. */
2716 wchar_t *new_array_start, *new_array_end;
2719 /* +1 in case of mbcset->nranges is 0. */
2720 new_nranges = 2 * mbcset->nranges + 1;
2721 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2722 are NULL if *range_alloc == 0. */
2723 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2725 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2728 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2731 mbcset->range_starts = new_array_start;
2732 mbcset->range_ends = new_array_end;
2733 *range_alloc = new_nranges;
2736 mbcset->range_starts[mbcset->nranges] = start_wc;
2737 mbcset->range_ends[mbcset->nranges++] = end_wc;
2740 /* Build the table for single byte characters. */
2741 for (wc = 0; wc < SBC_MAX; ++wc)
2744 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2745 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2746 bitset_set (sbcset, wc);
2749 # else /* not RE_ENABLE_I18N */
2752 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2753 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2755 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2756 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2758 if (start_ch > end_ch)
2760 /* Build the table for single byte characters. */
2761 for (ch = 0; ch < SBC_MAX; ++ch)
2762 if (start_ch <= ch && ch <= end_ch)
2763 bitset_set (sbcset, ch);
2765 # endif /* not RE_ENABLE_I18N */
2768 #endif /* not _LIBC */
2771 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2772 Build the collating element which is represented by NAME.
2773 The result are written to MBCSET and SBCSET.
2774 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2775 pointer argument since we may update it. */
2777 static reg_errcode_t
2778 # ifdef RE_ENABLE_I18N
2779 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2780 re_charset_t *mbcset;
2781 int *coll_sym_alloc;
2782 # else /* not RE_ENABLE_I18N */
2783 build_collating_symbol (sbcset, name)
2784 # endif /* not RE_ENABLE_I18N */
2785 re_bitset_ptr_t sbcset;
2786 const unsigned char *name;
2788 size_t name_len = strlen ((const char *) name);
2789 if (BE (name_len != 1, 0))
2790 return REG_ECOLLATE;
2793 bitset_set (sbcset, name[0]);
2797 #endif /* not _LIBC */
2799 /* This function parse bracket expression like "[abc]", "[a-c]",
2803 parse_bracket_exp (regexp, dfa, token, syntax, err)
2804 re_string_t *regexp;
2807 reg_syntax_t syntax;
2811 const unsigned char *collseqmb;
2812 const char *collseqwc;
2815 const int32_t *symb_table;
2816 const unsigned char *extra;
2818 /* Local function for parse_bracket_exp used in _LIBC environement.
2819 Seek the collating symbol entry correspondings to NAME.
2820 Return the index of the symbol in the SYMB_TABLE. */
2823 __attribute ((always_inline))
2824 seek_collating_symbol_entry (name, name_len)
2825 const unsigned char *name;
2828 int32_t hash = elem_hash ((const char *) name, name_len);
2829 int32_t elem = hash % table_size;
2830 int32_t second = hash % (table_size - 2);
2831 while (symb_table[2 * elem] != 0)
2833 /* First compare the hashing value. */
2834 if (symb_table[2 * elem] == hash
2835 /* Compare the length of the name. */
2836 && name_len == extra[symb_table[2 * elem + 1]]
2837 /* Compare the name. */
2838 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2841 /* Yep, this is the entry. */
2851 /* Local function for parse_bracket_exp used in _LIBC environement.
2852 Look up the collation sequence value of BR_ELEM.
2853 Return the value if succeeded, UINT_MAX otherwise. */
2855 auto inline unsigned int
2856 __attribute ((always_inline))
2857 lookup_collation_sequence_value (br_elem)
2858 bracket_elem_t *br_elem;
2860 if (br_elem->type == SB_CHAR)
2863 if (MB_CUR_MAX == 1)
2866 return collseqmb[br_elem->opr.ch];
2869 wint_t wc = __btowc (br_elem->opr.ch);
2870 return __collseq_table_lookup (collseqwc, wc);
2873 else if (br_elem->type == MB_CHAR)
2875 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2877 else if (br_elem->type == COLL_SYM)
2879 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2883 elem = seek_collating_symbol_entry (br_elem->opr.name,
2885 if (symb_table[2 * elem] != 0)
2887 /* We found the entry. */
2888 idx = symb_table[2 * elem + 1];
2889 /* Skip the name of collating element name. */
2890 idx += 1 + extra[idx];
2891 /* Skip the byte sequence of the collating element. */
2892 idx += 1 + extra[idx];
2893 /* Adjust for the alignment. */
2894 idx = (idx + 3) & ~3;
2895 /* Skip the multibyte collation sequence value. */
2896 idx += sizeof (unsigned int);
2897 /* Skip the wide char sequence of the collating element. */
2898 idx += sizeof (unsigned int) *
2899 (1 + *(unsigned int *) (extra + idx));
2900 /* Return the collation sequence value. */
2901 return *(unsigned int *) (extra + idx);
2903 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2905 /* No valid character. Match it as a single byte
2907 return collseqmb[br_elem->opr.name[0]];
2910 else if (sym_name_len == 1)
2911 return collseqmb[br_elem->opr.name[0]];
2916 /* Local function for parse_bracket_exp used in _LIBC environement.
2917 Build the range expression which starts from START_ELEM, and ends
2918 at END_ELEM. The result are written to MBCSET and SBCSET.
2919 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2920 mbcset->range_ends, is a pointer argument sinse we may
2923 auto inline reg_errcode_t
2924 __attribute ((always_inline))
2925 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2926 re_charset_t *mbcset;
2928 re_bitset_ptr_t sbcset;
2929 bracket_elem_t *start_elem, *end_elem;
2932 uint32_t start_collseq;
2933 uint32_t end_collseq;
2935 /* Equivalence Classes and Character Classes can't be a range
2937 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2938 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2942 start_collseq = lookup_collation_sequence_value (start_elem);
2943 end_collseq = lookup_collation_sequence_value (end_elem);
2944 /* Check start/end collation sequence values. */
2945 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2946 return REG_ECOLLATE;
2947 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2950 /* Got valid collation sequence values, add them as a new entry.
2951 However, if we have no collation elements, and the character set
2952 is single byte, the single byte character set that we
2953 build below suffices. */
2954 if (nrules > 0 || dfa->mb_cur_max > 1)
2956 /* Check the space of the arrays. */
2957 if (BE (*range_alloc == mbcset->nranges, 0))
2959 /* There is not enough space, need realloc. */
2960 uint32_t *new_array_start;
2961 uint32_t *new_array_end;
2964 /* +1 in case of mbcset->nranges is 0. */
2965 new_nranges = 2 * mbcset->nranges + 1;
2966 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2968 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2971 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2974 mbcset->range_starts = new_array_start;
2975 mbcset->range_ends = new_array_end;
2976 *range_alloc = new_nranges;
2979 mbcset->range_starts[mbcset->nranges] = start_collseq;
2980 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2983 /* Build the table for single byte characters. */
2984 for (ch = 0; ch < SBC_MAX; ch++)
2986 uint32_t ch_collseq;
2988 if (MB_CUR_MAX == 1)
2991 ch_collseq = collseqmb[ch];
2993 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2994 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2995 bitset_set (sbcset, ch);
3000 /* Local function for parse_bracket_exp used in _LIBC environement.
3001 Build the collating element which is represented by NAME.
3002 The result are written to MBCSET and SBCSET.
3003 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3004 pointer argument sinse we may update it. */
3006 auto inline reg_errcode_t
3007 __attribute ((always_inline))
3008 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3009 re_charset_t *mbcset;
3010 int *coll_sym_alloc;
3011 re_bitset_ptr_t sbcset;
3012 const unsigned char *name;
3015 size_t name_len = strlen ((const char *) name);
3018 elem = seek_collating_symbol_entry (name, name_len);
3019 if (symb_table[2 * elem] != 0)
3021 /* We found the entry. */
3022 idx = symb_table[2 * elem + 1];
3023 /* Skip the name of collating element name. */
3024 idx += 1 + extra[idx];
3026 else if (symb_table[2 * elem] == 0 && name_len == 1)
3028 /* No valid character, treat it as a normal
3030 bitset_set (sbcset, name[0]);
3034 return REG_ECOLLATE;
3036 /* Got valid collation sequence, add it as a new entry. */
3037 /* Check the space of the arrays. */
3038 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3040 /* Not enough, realloc it. */
3041 /* +1 in case of mbcset->ncoll_syms is 0. */
3042 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3043 /* Use realloc since mbcset->coll_syms is NULL
3045 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3046 new_coll_sym_alloc);
3047 if (BE (new_coll_syms == NULL, 0))
3049 mbcset->coll_syms = new_coll_syms;
3050 *coll_sym_alloc = new_coll_sym_alloc;
3052 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3057 if (BE (name_len != 1, 0))
3058 return REG_ECOLLATE;
3061 bitset_set (sbcset, name[0]);
3068 re_token_t br_token;
3069 re_bitset_ptr_t sbcset;
3070 #ifdef RE_ENABLE_I18N
3071 re_charset_t *mbcset;
3072 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3073 int equiv_class_alloc = 0, char_class_alloc = 0;
3074 #endif /* not RE_ENABLE_I18N */
3076 bin_tree_t *work_tree;
3078 int first_round = 1;
3080 collseqmb = (const unsigned char *)
3081 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3082 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3088 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3089 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3090 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3091 _NL_COLLATE_SYMB_TABLEMB);
3092 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3093 _NL_COLLATE_SYMB_EXTRAMB);
3096 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3097 #ifdef RE_ENABLE_I18N
3098 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3099 #endif /* RE_ENABLE_I18N */
3100 #ifdef RE_ENABLE_I18N
3101 if (BE (sbcset == NULL || mbcset == NULL, 0))
3103 if (BE (sbcset == NULL, 0))
3104 #endif /* RE_ENABLE_I18N */
3110 token_len = peek_token_bracket (token, regexp, syntax);
3111 if (BE (token->type == END_OF_RE, 0))
3114 goto parse_bracket_exp_free_return;
3116 if (token->type == OP_NON_MATCH_LIST)
3118 #ifdef RE_ENABLE_I18N
3119 mbcset->non_match = 1;
3120 #endif /* not RE_ENABLE_I18N */
3122 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3123 bitset_set (sbcset, '\0');
3124 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3125 token_len = peek_token_bracket (token, regexp, syntax);
3126 if (BE (token->type == END_OF_RE, 0))
3129 goto parse_bracket_exp_free_return;
3133 /* We treat the first ']' as a normal character. */
3134 if (token->type == OP_CLOSE_BRACKET)
3135 token->type = CHARACTER;
3139 bracket_elem_t start_elem, end_elem;
3140 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3141 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3143 int token_len2 = 0, is_range_exp = 0;
3146 start_elem.opr.name = start_name_buf;
3147 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3148 syntax, first_round);
3149 if (BE (ret != REG_NOERROR, 0))
3152 goto parse_bracket_exp_free_return;
3156 /* Get information about the next token. We need it in any case. */
3157 token_len = peek_token_bracket (token, regexp, syntax);
3159 /* Do not check for ranges if we know they are not allowed. */
3160 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3162 if (BE (token->type == END_OF_RE, 0))
3165 goto parse_bracket_exp_free_return;
3167 if (token->type == OP_CHARSET_RANGE)
3169 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3170 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3171 if (BE (token2.type == END_OF_RE, 0))
3174 goto parse_bracket_exp_free_return;
3176 if (token2.type == OP_CLOSE_BRACKET)
3178 /* We treat the last '-' as a normal character. */
3179 re_string_skip_bytes (regexp, -token_len);
3180 token->type = CHARACTER;
3187 if (is_range_exp == 1)
3189 end_elem.opr.name = end_name_buf;
3190 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3192 if (BE (ret != REG_NOERROR, 0))
3195 goto parse_bracket_exp_free_return;
3198 token_len = peek_token_bracket (token, regexp, syntax);
3201 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3202 &start_elem, &end_elem);
3204 # ifdef RE_ENABLE_I18N
3205 *err = build_range_exp (sbcset,
3206 dfa->mb_cur_max > 1 ? mbcset : NULL,
3207 &range_alloc, &start_elem, &end_elem);
3209 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3211 #endif /* RE_ENABLE_I18N */
3212 if (BE (*err != REG_NOERROR, 0))
3213 goto parse_bracket_exp_free_return;
3217 switch (start_elem.type)
3220 bitset_set (sbcset, start_elem.opr.ch);
3222 #ifdef RE_ENABLE_I18N
3224 /* Check whether the array has enough space. */
3225 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3227 wchar_t *new_mbchars;
3228 /* Not enough, realloc it. */
3229 /* +1 in case of mbcset->nmbchars is 0. */
3230 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3231 /* Use realloc since array is NULL if *alloc == 0. */
3232 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3234 if (BE (new_mbchars == NULL, 0))
3235 goto parse_bracket_exp_espace;
3236 mbcset->mbchars = new_mbchars;
3238 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3240 #endif /* RE_ENABLE_I18N */
3242 *err = build_equiv_class (sbcset,
3243 #ifdef RE_ENABLE_I18N
3244 mbcset, &equiv_class_alloc,
3245 #endif /* RE_ENABLE_I18N */
3246 start_elem.opr.name);
3247 if (BE (*err != REG_NOERROR, 0))
3248 goto parse_bracket_exp_free_return;
3251 *err = build_collating_symbol (sbcset,
3252 #ifdef RE_ENABLE_I18N
3253 mbcset, &coll_sym_alloc,
3254 #endif /* RE_ENABLE_I18N */
3255 start_elem.opr.name);
3256 if (BE (*err != REG_NOERROR, 0))
3257 goto parse_bracket_exp_free_return;
3260 *err = build_charclass (regexp->trans, sbcset,
3261 #ifdef RE_ENABLE_I18N
3262 mbcset, &char_class_alloc,
3263 #endif /* RE_ENABLE_I18N */
3264 start_elem.opr.name, syntax);
3265 if (BE (*err != REG_NOERROR, 0))
3266 goto parse_bracket_exp_free_return;
3273 if (BE (token->type == END_OF_RE, 0))
3276 goto parse_bracket_exp_free_return;
3278 if (token->type == OP_CLOSE_BRACKET)
3282 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3284 /* If it is non-matching list. */
3286 bitset_not (sbcset);
3288 #ifdef RE_ENABLE_I18N
3289 /* Ensure only single byte characters are set. */
3290 if (dfa->mb_cur_max > 1)
3291 bitset_mask (sbcset, dfa->sb_char);
3293 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3294 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3295 || mbcset->non_match)))
3297 bin_tree_t *mbc_tree;
3299 /* Build a tree for complex bracket. */
3300 dfa->has_mb_node = 1;
3301 br_token.type = COMPLEX_BRACKET;
3302 br_token.opr.mbcset = mbcset;
3303 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304 if (BE (mbc_tree == NULL, 0))
3305 goto parse_bracket_exp_espace;
3306 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3307 if (sbcset[sbc_idx])
3309 /* If there are no bits set in sbcset, there is no point
3310 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3311 if (sbc_idx < BITSET_UINTS)
3313 /* Build a tree for simple bracket. */
3314 br_token.type = SIMPLE_BRACKET;
3315 br_token.opr.sbcset = sbcset;
3316 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317 if (BE (work_tree == NULL, 0))
3318 goto parse_bracket_exp_espace;
3320 /* Then join them by ALT node. */
3321 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3322 if (BE (work_tree == NULL, 0))
3323 goto parse_bracket_exp_espace;
3328 work_tree = mbc_tree;
3332 #endif /* not RE_ENABLE_I18N */
3334 #ifdef RE_ENABLE_I18N
3335 free_charset (mbcset);
3337 /* Build a tree for simple bracket. */
3338 br_token.type = SIMPLE_BRACKET;
3339 br_token.opr.sbcset = sbcset;
3340 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341 if (BE (work_tree == NULL, 0))
3342 goto parse_bracket_exp_espace;
3346 parse_bracket_exp_espace:
3348 parse_bracket_exp_free_return:
3350 #ifdef RE_ENABLE_I18N
3351 free_charset (mbcset);
3352 #endif /* RE_ENABLE_I18N */
3356 /* Parse an element in the bracket expression. */
3358 static reg_errcode_t
3359 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3361 bracket_elem_t *elem;
3362 re_string_t *regexp;
3366 reg_syntax_t syntax;
3369 #ifdef RE_ENABLE_I18N
3371 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3372 if (cur_char_size > 1)
3374 elem->type = MB_CHAR;
3375 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3376 re_string_skip_bytes (regexp, cur_char_size);
3379 #endif /* RE_ENABLE_I18N */
3380 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3381 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3382 || token->type == OP_OPEN_EQUIV_CLASS)
3383 return parse_bracket_symbol (elem, regexp, token);
3384 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3386 /* A '-' must only appear as anything but a range indicator before
3387 the closing bracket. Everything else is an error. */
3389 (void) peek_token_bracket (&token2, regexp, syntax);
3390 if (token2.type != OP_CLOSE_BRACKET)
3391 /* The actual error value is not standardized since this whole
3392 case is undefined. But ERANGE makes good sense. */
3395 elem->type = SB_CHAR;
3396 elem->opr.ch = token->opr.c;
3400 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3401 such as [:<character_class>:], [.<collating_element>.], and
3402 [=<equivalent_class>=]. */
3404 static reg_errcode_t
3405 parse_bracket_symbol (elem, regexp, token)
3406 bracket_elem_t *elem;
3407 re_string_t *regexp;
3410 unsigned char ch, delim = token->opr.c;
3412 if (re_string_eoi(regexp))
3416 if (i >= BRACKET_NAME_BUF_SIZE)
3418 if (token->type == OP_OPEN_CHAR_CLASS)
3419 ch = re_string_fetch_byte_case (regexp);
3421 ch = re_string_fetch_byte (regexp);
3422 if (re_string_eoi(regexp))
3424 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3426 elem->opr.name[i] = ch;
3428 re_string_skip_bytes (regexp, 1);
3429 elem->opr.name[i] = '\0';
3430 switch (token->type)
3432 case OP_OPEN_COLL_ELEM:
3433 elem->type = COLL_SYM;
3435 case OP_OPEN_EQUIV_CLASS:
3436 elem->type = EQUIV_CLASS;
3438 case OP_OPEN_CHAR_CLASS:
3439 elem->type = CHAR_CLASS;
3447 /* Helper function for parse_bracket_exp.
3448 Build the equivalence class which is represented by NAME.
3449 The result are written to MBCSET and SBCSET.
3450 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3451 is a pointer argument sinse we may update it. */
3453 static reg_errcode_t
3454 #ifdef RE_ENABLE_I18N
3455 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3456 re_charset_t *mbcset;
3457 int *equiv_class_alloc;
3458 #else /* not RE_ENABLE_I18N */
3459 build_equiv_class (sbcset, name)
3460 #endif /* not RE_ENABLE_I18N */
3461 re_bitset_ptr_t sbcset;
3462 const unsigned char *name;
3465 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3468 const int32_t *table, *indirect;
3469 const unsigned char *weights, *extra, *cp;
3470 unsigned char char_buf[2];
3474 /* This #include defines a local function! */
3475 # include <locale/weight.h>
3476 /* Calculate the index for equivalence class. */
3478 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3479 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3480 _NL_COLLATE_WEIGHTMB);
3481 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3482 _NL_COLLATE_EXTRAMB);
3483 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3484 _NL_COLLATE_INDIRECTMB);
3485 idx1 = findidx (&cp);
3486 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3487 /* This isn't a valid character. */
3488 return REG_ECOLLATE;
3490 /* Build single byte matcing table for this equivalence class. */
3491 char_buf[1] = (unsigned char) '\0';
3492 len = weights[idx1];
3493 for (ch = 0; ch < SBC_MAX; ++ch)
3497 idx2 = findidx (&cp);
3502 /* This isn't a valid character. */
3504 if (len == weights[idx2])
3507 while (cnt <= len &&
3508 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3512 bitset_set (sbcset, ch);
3515 /* Check whether the array has enough space. */
3516 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3518 /* Not enough, realloc it. */
3519 /* +1 in case of mbcset->nequiv_classes is 0. */
3520 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3521 /* Use realloc since the array is NULL if *alloc == 0. */
3522 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3524 new_equiv_class_alloc);
3525 if (BE (new_equiv_classes == NULL, 0))
3527 mbcset->equiv_classes = new_equiv_classes;
3528 *equiv_class_alloc = new_equiv_class_alloc;
3530 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3535 if (BE (strlen ((const char *) name) != 1, 0))
3536 return REG_ECOLLATE;
3537 bitset_set (sbcset, *name);
3542 /* Helper function for parse_bracket_exp.
3543 Build the character class which is represented by NAME.
3544 The result are written to MBCSET and SBCSET.
3545 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3546 is a pointer argument sinse we may update it. */
3548 static reg_errcode_t
3549 #ifdef RE_ENABLE_I18N
3550 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3551 re_charset_t *mbcset;
3552 int *char_class_alloc;
3553 #else /* not RE_ENABLE_I18N */
3554 build_charclass (trans, sbcset, class_name, syntax)
3555 #endif /* not RE_ENABLE_I18N */
3556 unsigned RE_TRANSLATE_TYPE trans;
3557 re_bitset_ptr_t sbcset;
3558 const unsigned char *class_name;
3559 reg_syntax_t syntax;
3562 const char *name = (const char *) class_name;
3564 /* In case of REG_ICASE "upper" and "lower" match the both of
3565 upper and lower cases. */
3566 if ((syntax & RE_ICASE)
3567 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3570 #ifdef RE_ENABLE_I18N
3571 /* Check the space of the arrays. */
3572 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3574 /* Not enough, realloc it. */
3575 /* +1 in case of mbcset->nchar_classes is 0. */
3576 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3577 /* Use realloc since array is NULL if *alloc == 0. */
3578 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3579 new_char_class_alloc);
3580 if (BE (new_char_classes == NULL, 0))
3582 mbcset->char_classes = new_char_classes;
3583 *char_class_alloc = new_char_class_alloc;
3585 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3586 #endif /* RE_ENABLE_I18N */
3588 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3589 for (i = 0; i < SBC_MAX; ++i) \
3591 if (ctype_func (i)) \
3593 int ch = trans ? trans[i] : i; \
3594 bitset_set (sbcset, ch); \
3598 if (strcmp (name, "alnum") == 0)
3599 BUILD_CHARCLASS_LOOP (isalnum)
3600 else if (strcmp (name, "cntrl") == 0)
3601 BUILD_CHARCLASS_LOOP (iscntrl)
3602 else if (strcmp (name, "lower") == 0)
3603 BUILD_CHARCLASS_LOOP (islower)
3604 else if (strcmp (name, "space") == 0)
3605 BUILD_CHARCLASS_LOOP (isspace)
3606 else if (strcmp (name, "alpha") == 0)
3607 BUILD_CHARCLASS_LOOP (isalpha)
3608 else if (strcmp (name, "digit") == 0)
3609 BUILD_CHARCLASS_LOOP (isdigit)
3610 else if (strcmp (name, "print") == 0)
3611 BUILD_CHARCLASS_LOOP (isprint)
3612 else if (strcmp (name, "upper") == 0)
3613 BUILD_CHARCLASS_LOOP (isupper)
3614 else if (strcmp (name, "blank") == 0)
3615 BUILD_CHARCLASS_LOOP (isblank)
3616 else if (strcmp (name, "graph") == 0)
3617 BUILD_CHARCLASS_LOOP (isgraph)
3618 else if (strcmp (name, "punct") == 0)
3619 BUILD_CHARCLASS_LOOP (ispunct)
3620 else if (strcmp (name, "xdigit") == 0)
3621 BUILD_CHARCLASS_LOOP (isxdigit)
3629 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3631 unsigned RE_TRANSLATE_TYPE trans;
3632 const unsigned char *class_name;
3633 const unsigned char *extra;
3637 re_bitset_ptr_t sbcset;
3638 #ifdef RE_ENABLE_I18N
3639 re_charset_t *mbcset;
3641 #endif /* not RE_ENABLE_I18N */
3643 re_token_t br_token;
3646 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3647 #ifdef RE_ENABLE_I18N
3648 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3649 #endif /* RE_ENABLE_I18N */
3651 #ifdef RE_ENABLE_I18N
3652 if (BE (sbcset == NULL || mbcset == NULL, 0))
3653 #else /* not RE_ENABLE_I18N */
3654 if (BE (sbcset == NULL, 0))
3655 #endif /* not RE_ENABLE_I18N */
3663 #ifdef RE_ENABLE_I18N
3665 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3666 bitset_set(cset->sbcset, '\0');
3668 mbcset->non_match = 1;
3669 #endif /* not RE_ENABLE_I18N */
3672 /* We don't care the syntax in this case. */
3673 ret = build_charclass (trans, sbcset,
3674 #ifdef RE_ENABLE_I18N
3676 #endif /* RE_ENABLE_I18N */
3679 if (BE (ret != REG_NOERROR, 0))
3682 #ifdef RE_ENABLE_I18N
3683 free_charset (mbcset);
3684 #endif /* RE_ENABLE_I18N */
3688 /* \w match '_' also. */
3689 for (; *extra; extra++)
3690 bitset_set (sbcset, *extra);
3692 /* If it is non-matching list. */
3694 bitset_not (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697 /* Ensure only single byte characters are set. */
3698 if (dfa->mb_cur_max > 1)
3699 bitset_mask (sbcset, dfa->sb_char);
3702 /* Build a tree for simple bracket. */
3703 br_token.type = SIMPLE_BRACKET;
3704 br_token.opr.sbcset = sbcset;
3705 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3706 if (BE (tree == NULL, 0))
3707 goto build_word_op_espace;
3709 #ifdef RE_ENABLE_I18N
3710 if (dfa->mb_cur_max > 1)
3712 bin_tree_t *mbc_tree;
3713 /* Build a tree for complex bracket. */
3714 br_token.type = COMPLEX_BRACKET;
3715 br_token.opr.mbcset = mbcset;
3716 dfa->has_mb_node = 1;
3717 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3718 if (BE (mbc_tree == NULL, 0))
3719 goto build_word_op_espace;
3720 /* Then join them by ALT node. */
3721 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3722 if (BE (mbc_tree != NULL, 1))
3727 free_charset (mbcset);
3730 #else /* not RE_ENABLE_I18N */
3732 #endif /* not RE_ENABLE_I18N */
3734 build_word_op_espace:
3736 #ifdef RE_ENABLE_I18N
3737 free_charset (mbcset);
3738 #endif /* RE_ENABLE_I18N */
3743 /* This is intended for the expressions like "a{1,3}".
3744 Fetch a number from `input', and return the number.
3745 Return -1, if the number field is empty like "{,1}".
3746 Return -2, If an error is occured. */
3749 fetch_number (input, token, syntax)
3752 reg_syntax_t syntax;
3758 fetch_token (token, input, syntax);
3760 if (BE (token->type == END_OF_RE, 0))
3762 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3764 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3765 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3766 num = (num > RE_DUP_MAX) ? -2 : num;
3771 #ifdef RE_ENABLE_I18N
3773 free_charset (re_charset_t *cset)
3775 re_free (cset->mbchars);
3777 re_free (cset->coll_syms);
3778 re_free (cset->equiv_classes);
3779 re_free (cset->range_starts);
3780 re_free (cset->range_ends);
3782 re_free (cset->char_classes);
3785 #endif /* RE_ENABLE_I18N */
3787 /* Functions for binary tree operation. */
3789 /* Create a tree node. */
3792 create_tree (dfa, left, right, type)
3796 re_token_type_t type;
3800 return create_token_tree (dfa, left, right, &t);
3804 create_token_tree (dfa, left, right, token)
3808 const re_token_t *token;
3811 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815 if (storage == NULL)
3817 storage->next = dfa->str_tree_storage;
3818 dfa->str_tree_storage = storage;
3819 dfa->str_tree_storage_idx = 0;
3821 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823 tree->parent = NULL;
3825 tree->right = right;
3826 tree->token = *token;
3827 tree->token.duplicated = 0;
3828 tree->token.opt_subexp = 0;
3831 tree->node_idx = -1;
3834 left->parent = tree;
3836 right->parent = tree;
3840 /* Mark the tree SRC as an optional subexpression.
3841 To be called from preorder or postorder. */
3843 static reg_errcode_t
3844 mark_opt_subexp (extra, node)
3848 int idx = (int) (long) extra;
3849 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3850 node->token.opt_subexp = 1;
3855 /* Free the allocated memory inside NODE. */
3858 free_token (re_token_t *node)
3860 #ifdef RE_ENABLE_I18N
3861 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3862 free_charset (node->opr.mbcset);
3864 #endif /* RE_ENABLE_I18N */
3865 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3866 re_free (node->opr.sbcset);
3869 /* Worker function for tree walking. Free the allocated memory inside NODE
3870 and its children. */
3872 static reg_errcode_t
3873 free_tree (void *extra, bin_tree_t *node)
3875 free_token (&node->token);
3880 /* Duplicate the node SRC, and return new node. This is a preorder
3881 visit similar to the one implemented by the generic visitor, but
3882 we need more infrastructure to maintain two parallel trees --- so,
3883 it's easier to duplicate. */
3886 duplicate_tree (root, dfa)
3887 const bin_tree_t *root;
3890 const bin_tree_t *node;
3891 bin_tree_t *dup_root;
3892 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3894 for (node = root; ; )
3896 /* Create a new tree and link it back to the current parent. */
3897 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3900 (*p_new)->parent = dup_node;
3901 (*p_new)->token.duplicated = 1;
3904 /* Go to the left node, or up and to the right. */
3908 p_new = &dup_node->left;
3912 const bin_tree_t *prev = NULL;
3913 while (node->right == prev || node->right == NULL)
3916 node = node->parent;
3917 dup_node = dup_node->parent;
3922 p_new = &dup_node->right;