1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, int ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 int ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 static Idx check_matching (re_match_context_t *mctx, int fl_longest_match,
61 static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
70 re_node_set *eps_via_nodes) internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
83 re_sift_context_t *sctx) internal_function;
84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
85 re_sift_context_t *sctx, Idx str_idx,
86 re_node_set *cur_dest) internal_function;
87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
88 re_sift_context_t *sctx,
90 re_node_set *dest_nodes) internal_function;
91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
92 re_node_set *dest_nodes,
93 const re_node_set *candidates) internal_function;
94 static int check_dst_limits (const re_match_context_t *mctx,
95 const re_node_set *limits,
96 Idx dst_node, Idx dst_idx, Idx src_node,
97 Idx src_idx) internal_function;
98 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
99 int boundaries, Idx subexp_idx,
100 Idx from_node, Idx bkref_idx) internal_function;
101 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
102 Idx limit, Idx subexp_idx,
103 Idx node, Idx str_idx,
104 Idx bkref_idx) internal_function;
105 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
106 re_node_set *dest_nodes,
107 const re_node_set *candidates,
109 struct re_backref_cache_entry *bkref_ents,
110 Idx str_idx) internal_function;
111 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
112 re_sift_context_t *sctx,
113 Idx str_idx, const re_node_set *candidates) internal_function;
114 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
115 re_dfastate_t **src, Idx num) internal_function;
116 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
117 re_match_context_t *mctx) internal_function;
118 static re_dfastate_t *transit_state (reg_errcode_t *err,
119 re_match_context_t *mctx,
120 re_dfastate_t *state) internal_function;
121 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *next_state) internal_function;
124 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
125 re_node_set *cur_nodes,
126 Idx str_idx) internal_function;
128 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *pstate) internal_function;
132 #ifdef RE_ENABLE_I18N
133 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
134 re_dfastate_t *pstate) internal_function;
135 #endif /* RE_ENABLE_I18N */
136 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
137 const re_node_set *nodes) internal_function;
138 static reg_errcode_t get_subexp (re_match_context_t *mctx,
139 Idx bkref_node, Idx bkref_str_idx) internal_function;
140 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
141 const re_sub_match_top_t *sub_top,
142 re_sub_match_last_t *sub_last,
143 Idx bkref_node, Idx bkref_str) internal_function;
144 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
145 Idx subexp_idx, int type) internal_function;
146 static reg_errcode_t check_arrival (re_match_context_t *mctx,
147 state_array_t *path, Idx top_node,
148 Idx top_str, Idx last_node, Idx last_str,
149 int type) internal_function;
150 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
152 re_node_set *cur_nodes,
153 re_node_set *next_nodes) internal_function;
154 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
155 re_node_set *cur_nodes,
156 Idx ex_subexp, int type) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
158 re_node_set *dst_nodes,
159 Idx target, Idx ex_subexp,
160 int type) internal_function;
161 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
162 re_node_set *cur_nodes, Idx cur_str,
163 Idx subexp_num, int type) internal_function;
164 static int build_trtable (re_dfa_t *dfa,
165 re_dfastate_t *state) internal_function;
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
168 const re_string_t *input, Idx idx) internal_function;
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len) internal_function;
173 #endif /* RE_ENABLE_I18N */
174 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch) internal_function;
178 static int check_node_accept (const re_match_context_t *mctx,
179 const re_token_t *node, Idx idx) internal_function;
180 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
182 /* Entry point for POSIX code. */
184 /* regexec searches for a given pattern, specified by PREG, in the
187 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
188 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
189 least NMATCH elements, and we set them to the offsets of the
190 corresponding matched substrings.
192 EFLAGS specifies `execution flags' which affect matching: if
193 REG_NOTBOL is set, then ^ does not match at the beginning of the
194 string; if REG_NOTEOL is set, then $ does not match at the end.
196 We return 0 if we find a match and REG_NOMATCH if not. */
199 regexec (const regex_t *__restrict preg, const char *__restrict string,
200 size_t nmatch, regmatch_t pmatch[], int eflags)
205 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
208 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
211 if (eflags & REG_STARTEND)
213 start = pmatch[0].rm_so;
214 length = pmatch[0].rm_eo;
219 length = strlen (string);
222 __libc_lock_lock (dfa->lock);
224 err = re_search_internal (preg, string, length, start, length,
225 length, 0, NULL, eflags);
227 err = re_search_internal (preg, string, length, start, length,
228 length, nmatch, pmatch, eflags);
229 __libc_lock_unlock (dfa->lock);
230 return err != REG_NOERROR;
234 # include <shlib-compat.h>
235 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
237 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
238 __typeof__ (__regexec) __compat_regexec;
241 attribute_compat_text_section
242 __compat_regexec (const regex_t *__restrict preg,
243 const char *__restrict string, size_t nmatch,
244 regmatch_t pmatch[], int eflags)
246 return regexec (preg, string, nmatch, pmatch,
247 eflags & (REG_NOTBOL | REG_NOTEOL));
249 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
253 /* Entry points for GNU code. */
255 /* re_match, re_search, re_match_2, re_search_2
257 The former two functions operate on STRING with length LENGTH,
258 while the later two operate on concatenation of STRING1 and STRING2
259 with lengths LENGTH1 and LENGTH2, respectively.
261 re_match() matches the compiled pattern in BUFP against the string,
262 starting at index START.
264 re_search() first tries matching at index START, then it tries to match
265 starting from index START + 1, and so on. The last start position tried
266 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
269 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
270 the first STOP characters of the concatenation of the strings should be
273 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
274 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
275 computed relative to the concatenation, not relative to the individual
278 On success, re_match* functions return the length of the match, re_search*
279 return the position of the start of the match. Return value -1 means no
280 match was found and -2 indicates an internal error. */
283 re_match (struct re_pattern_buffer *bufp, const char *string,
284 Idx length, Idx start, struct re_registers *regs)
286 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
289 weak_alias (__re_match, re_match)
293 re_search (struct re_pattern_buffer *bufp, const char *string,
294 Idx length, Idx start, regoff_t range, struct re_registers *regs)
296 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
299 weak_alias (__re_search, re_search)
303 re_match_2 (struct re_pattern_buffer *bufp,
304 const char *string1, Idx length1,
305 const char *string2, Idx length2,
306 Idx start, struct re_registers *regs, Idx stop)
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
309 start, 0, regs, stop, 1);
312 weak_alias (__re_match_2, re_match_2)
316 re_search_2 (struct re_pattern_buffer *bufp,
317 const char *string1, Idx length1,
318 const char *string2, Idx length2,
319 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
321 return re_search_2_stub (bufp, string1, length1, string2, length2,
322 start, range, regs, stop, 0);
325 weak_alias (__re_search_2, re_search_2)
330 re_search_2_stub (struct re_pattern_buffer *bufp,
331 const char *string1, Idx length1,
332 const char *string2, Idx length2,
333 Idx start, regoff_t range, struct re_registers *regs,
334 Idx stop, int ret_len)
338 Idx len = length1 + length2;
341 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
344 /* Concatenate the strings. */
348 char *s = re_malloc (char, len);
350 if (BE (s == NULL, 0))
352 memcpy (s, string1, length1);
353 memcpy (s + length1, string2, length2);
362 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
365 re_free ((char *) str);
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is nonzero the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
376 re_search_stub (struct re_pattern_buffer *bufp,
377 const char *string, Idx length,
378 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
381 reg_errcode_t result;
387 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
389 Idx last_start = start + range;
391 /* Check for out-of-range. */
392 if (BE (start < 0 || start > length, 0))
394 if (sizeof start < sizeof range)
396 regoff_t length_offset = length;
397 regoff_t start_offset = start;
398 if (BE (length_offset - start_offset < range, 0))
400 else if (BE (range < - start_offset, 0))
405 if (BE ((last_start < start) != (range < 0), 0))
407 /* Overflow occurred when computing last_start; substitute
408 the extreme value. */
409 last_start = range < 0 ? 0 : length;
413 if (BE (length < last_start, 0))
415 else if (BE (last_start < 0, 0))
420 __libc_lock_lock (dfa->lock);
422 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
423 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
425 /* Compile fastmap if we haven't yet. */
426 if (start < last_start && bufp->re_fastmap != NULL
427 && !bufp->re_fastmap_accurate)
428 re_compile_fastmap (bufp);
430 if (BE (bufp->re_no_sub, 0))
433 /* We need at least 1 register. */
436 else if (BE (bufp->re_regs_allocated == REG_FIXED
437 && regs->rm_num_regs < bufp->re_nsub + 1, 0))
439 nregs = regs->rm_num_regs;
440 if (BE (nregs < 1, 0))
442 /* Nothing can be copied to regs. */
448 nregs = bufp->re_nsub + 1;
449 pmatch = re_malloc (regmatch_t, nregs);
450 if (BE (pmatch == NULL, 0))
456 result = re_search_internal (bufp, string, length, start, last_start, stop,
457 nregs, pmatch, eflags);
461 /* I hope we needn't fill ther regs with -1's when no match was found. */
462 if (result != REG_NOERROR)
464 else if (regs != NULL)
466 /* If caller wants register contents data back, copy them. */
467 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
468 bufp->re_regs_allocated);
469 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
473 if (BE (rval == 0, 1))
477 assert (pmatch[0].rm_so == start);
478 rval = pmatch[0].rm_eo - start;
481 rval = pmatch[0].rm_so;
485 __libc_lock_unlock (dfa->lock);
491 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
494 int rval = REG_REALLOCATE;
496 Idx need_regs = nregs + 1;
497 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
500 /* Have the register data arrays been allocated? */
501 if (regs_allocated == REG_UNALLOCATED)
502 { /* No. So allocate them with malloc. */
503 regs->rm_start = re_malloc (regoff_t, need_regs);
504 regs->rm_end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
506 return REG_UNALLOCATED;
507 regs->rm_num_regs = need_regs;
509 else if (regs_allocated == REG_REALLOCATE)
510 { /* Yes. If we need more elements than were already
511 allocated, reallocate them. If we need fewer, just
513 if (BE (need_regs > regs->rm_num_regs, 0))
515 regoff_t *new_start =
516 re_realloc (regs->rm_start, regoff_t, need_regs);
517 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
518 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
519 return REG_UNALLOCATED;
520 regs->rm_start = new_start;
521 regs->rm_end = new_end;
522 regs->rm_num_regs = need_regs;
527 assert (regs_allocated == REG_FIXED);
528 /* This function may not be called with REG_FIXED and nregs too big. */
529 assert (regs->rm_num_regs >= nregs);
534 for (i = 0; i < nregs; ++i)
536 regs->rm_start[i] = pmatch[i].rm_so;
537 regs->rm_end[i] = pmatch[i].rm_eo;
539 for ( ; i < regs->rm_num_regs; ++i)
540 regs->rm_start[i] = regs->rm_end[i] = -1;
545 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
546 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
547 this memory for recording register information. STARTS and ENDS
548 must be allocated using the malloc library routine, and must each
549 be at least NUM_REGS * sizeof (regoff_t) bytes long.
551 If NUM_REGS == 0, then subsequent matches should allocate their own
554 Unless this function is called, the first search or match using
555 PATTERN_BUFFER will allocate its own register data, without
556 freeing the old data. */
559 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
560 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
564 bufp->re_regs_allocated = REG_REALLOCATE;
565 regs->rm_num_regs = num_regs;
566 regs->rm_start = starts;
571 bufp->re_regs_allocated = REG_UNALLOCATED;
572 regs->rm_num_regs = 0;
573 regs->rm_start = regs->rm_end = NULL;
577 weak_alias (__re_set_registers, re_set_registers)
580 /* Entry points compatible with 4.2 BSD regex library. We don't define
581 them unless specifically requested. */
583 #if defined _REGEX_RE_COMP || defined _LIBC
588 re_exec (const char *s)
590 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
592 #endif /* _REGEX_RE_COMP */
594 /* Internal entry point. */
596 /* Searches for a compiled pattern PREG in the string STRING, whose
597 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
598 meaning as with regexec. LAST_START is START + RANGE, where
599 START and RANGE have the same meaning as with re_search.
600 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
601 otherwise return the error code.
602 Note: We assume front end functions already check ranges.
603 (0 <= LAST_START && LAST_START <= LENGTH) */
607 re_search_internal (const regex_t *preg,
608 const char *string, Idx length,
609 Idx start, Idx last_start, Idx stop,
610 size_t nmatch, regmatch_t pmatch[],
614 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
615 Idx left_lim, right_lim;
617 int fl_longest_match, match_kind;
618 Idx match_first, match_last = REG_MISSING;
621 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
622 re_match_context_t mctx = { .dfa = dfa };
624 re_match_context_t mctx;
626 char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
627 && start != last_start && !preg->re_can_be_null)
628 ? preg->re_fastmap : NULL);
629 unsigned REG_TRANSLATE_TYPE t =
630 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
632 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
633 memset (&mctx, '\0', sizeof (re_match_context_t));
637 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
638 nmatch -= extra_nmatch;
640 /* Check if the DFA haven't been compiled. */
641 if (BE (preg->re_used == 0 || dfa->init_state == NULL
642 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
643 || dfa->init_state_begbuf == NULL, 0))
647 /* We assume front-end functions already check them. */
648 assert (0 <= last_start && last_start <= length);
651 /* If initial states with non-begbuf contexts have no elements,
652 the regex must be anchored. If preg->re_newline_anchor is set,
653 we'll never use init_state_nl, so do not check it. */
654 if (dfa->init_state->nodes.nelem == 0
655 && dfa->init_state_word->nodes.nelem == 0
656 && (dfa->init_state_nl->nodes.nelem == 0
657 || !preg->re_newline_anchor))
659 if (start != 0 && last_start != 0)
661 start = last_start = 0;
664 /* We must check the longest matching, if nmatch > 0. */
665 fl_longest_match = (nmatch != 0 || dfa->nbackref);
667 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
669 preg->re_syntax & REG_IGNORE_CASE, dfa);
670 if (BE (err != REG_NOERROR, 0))
672 mctx.input.stop = stop;
673 mctx.input.raw_stop = stop;
674 mctx.input.newline_anchor = preg->re_newline_anchor;
676 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
677 if (BE (err != REG_NOERROR, 0))
680 /* We will log all the DFA states through which the dfa pass,
681 if nmatch > 1, or this dfa has "multibyte node", which is a
682 back-reference or a node which can accept multibyte character or
683 multi character collating element. */
684 if (nmatch > 1 || dfa->has_mb_node)
686 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
687 if (BE (mctx.state_log == NULL, 0))
694 mctx.state_log = NULL;
697 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
698 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
700 /* Check incrementally whether of not the input string match. */
701 incr = (last_start < start) ? -1 : 1;
702 left_lim = (last_start < start) ? last_start : start;
703 right_lim = (last_start < start) ? start : last_start;
704 sb = dfa->mb_cur_max == 1;
707 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
708 | (start <= last_start ? 2 : 0)
709 | (t != NULL ? 1 : 0))
712 for (;; match_first += incr)
715 if (match_first < left_lim || right_lim < match_first)
718 /* Advance as rapidly as possible through the string, until we
719 find a plausible place to start matching. This may be done
720 with varying efficiency, so there are various possibilities:
721 only the most common of them are specialized, in order to
722 save on code size. We use a switch statement for speed. */
730 /* Fastmap with single-byte translation, match forward. */
731 while (BE (match_first < right_lim, 1)
732 && !fastmap[t[(unsigned char) string[match_first]]])
734 goto forward_match_found_start_or_reached_end;
737 /* Fastmap without translation, match forward. */
738 while (BE (match_first < right_lim, 1)
739 && !fastmap[(unsigned char) string[match_first]])
742 forward_match_found_start_or_reached_end:
743 if (BE (match_first == right_lim, 0))
745 ch = match_first >= length
746 ? 0 : (unsigned char) string[match_first];
747 if (!fastmap[t ? t[ch] : ch])
754 /* Fastmap without multi-byte translation, match backwards. */
755 while (match_first >= left_lim)
757 ch = match_first >= length
758 ? 0 : (unsigned char) string[match_first];
759 if (fastmap[t ? t[ch] : ch])
763 if (match_first < left_lim)
768 /* In this case, we can't determine easily the current byte,
769 since it might be a component byte of a multibyte
770 character. Then we use the constructed buffer instead. */
773 /* If MATCH_FIRST is out of the valid range, reconstruct the
775 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
776 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
778 err = re_string_reconstruct (&mctx.input, match_first,
780 if (BE (err != REG_NOERROR, 0))
783 offset = match_first - mctx.input.raw_mbs_idx;
785 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
786 Note that MATCH_FIRST must not be smaller than 0. */
787 ch = (match_first >= length
788 ? 0 : re_string_byte_at (&mctx.input, offset));
792 if (match_first < left_lim || match_first > right_lim)
801 /* Reconstruct the buffers so that the matcher can assume that
802 the matching starts from the beginning of the buffer. */
803 err = re_string_reconstruct (&mctx.input, match_first, eflags);
804 if (BE (err != REG_NOERROR, 0))
807 #ifdef RE_ENABLE_I18N
808 /* Don't consider this char as a possible match start if it part,
809 yet isn't the head, of a multibyte character. */
810 if (!sb && !re_string_first_byte (&mctx.input, 0))
814 /* It seems to be appropriate one, then use the matcher. */
815 /* We assume that the matching starts from 0. */
816 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
817 match_last = check_matching (&mctx, fl_longest_match,
818 start <= last_start ? &match_first : NULL);
819 if (match_last != REG_MISSING)
821 if (BE (match_last == REG_ERROR, 0))
828 mctx.match_last = match_last;
829 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
831 re_dfastate_t *pstate = mctx.state_log[match_last];
832 mctx.last_node = check_halt_state_context (&mctx, pstate,
835 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
838 err = prune_impossible_nodes (&mctx);
839 if (err == REG_NOERROR)
841 if (BE (err != REG_NOMATCH, 0))
843 match_last = REG_MISSING;
846 break; /* We found a match. */
850 match_ctx_clean (&mctx);
854 assert (match_last != REG_MISSING);
855 assert (err == REG_NOERROR);
858 /* Set pmatch[] if we need. */
863 /* Initialize registers. */
864 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
865 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
867 /* Set the points where matching start/end. */
869 pmatch[0].rm_eo = mctx.match_last;
870 /* FIXME: This function should fail if mctx.match_last exceeds
871 the maximum possible regoff_t value. We need a new error
872 code REG_OVERFLOW. */
874 if (!preg->re_no_sub && nmatch > 1)
876 err = set_regs (preg, &mctx, nmatch, pmatch,
877 dfa->has_plural_match && dfa->nbackref > 0);
878 if (BE (err != REG_NOERROR, 0))
882 /* At last, add the offset to the each registers, since we slided
883 the buffers so that we could assume that the matching starts
885 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
886 if (pmatch[reg_idx].rm_so != -1)
888 #ifdef RE_ENABLE_I18N
889 if (BE (mctx.input.offsets_needed != 0, 0))
891 pmatch[reg_idx].rm_so =
892 (pmatch[reg_idx].rm_so == mctx.input.valid_len
893 ? mctx.input.valid_raw_len
894 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
895 pmatch[reg_idx].rm_eo =
896 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
897 ? mctx.input.valid_raw_len
898 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
901 assert (mctx.input.offsets_needed == 0);
903 pmatch[reg_idx].rm_so += match_first;
904 pmatch[reg_idx].rm_eo += match_first;
906 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
908 pmatch[nmatch + reg_idx].rm_so = -1;
909 pmatch[nmatch + reg_idx].rm_eo = -1;
913 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
914 if (dfa->subexp_map[reg_idx] != reg_idx)
916 pmatch[reg_idx + 1].rm_so
917 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
918 pmatch[reg_idx + 1].rm_eo
919 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
924 re_free (mctx.state_log);
926 match_ctx_free (&mctx);
927 re_string_destruct (&mctx.input);
933 prune_impossible_nodes (re_match_context_t *mctx)
935 re_dfa_t *const dfa = mctx->dfa;
936 Idx halt_node, match_last;
938 re_dfastate_t **sifted_states;
939 re_dfastate_t **lim_states = NULL;
940 re_sift_context_t sctx;
942 assert (mctx->state_log != NULL);
944 match_last = mctx->match_last;
945 halt_node = mctx->last_node;
946 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
947 if (BE (sifted_states == NULL, 0))
954 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
955 if (BE (lim_states == NULL, 0))
962 memset (lim_states, '\0',
963 sizeof (re_dfastate_t *) * (match_last + 1));
964 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
966 ret = sift_states_backward (mctx, &sctx);
967 re_node_set_free (&sctx.limits);
968 if (BE (ret != REG_NOERROR, 0))
970 if (sifted_states[0] != NULL || lim_states[0] != NULL)
975 if (! REG_VALID_INDEX (match_last))
980 } while (mctx->state_log[match_last] == NULL
981 || !mctx->state_log[match_last]->halt);
982 halt_node = check_halt_state_context (mctx,
983 mctx->state_log[match_last],
986 ret = merge_state_array (dfa, sifted_states, lim_states,
988 re_free (lim_states);
990 if (BE (ret != REG_NOERROR, 0))
995 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
996 ret = sift_states_backward (mctx, &sctx);
997 re_node_set_free (&sctx.limits);
998 if (BE (ret != REG_NOERROR, 0))
1001 re_free (mctx->state_log);
1002 mctx->state_log = sifted_states;
1003 sifted_states = NULL;
1004 mctx->last_node = halt_node;
1005 mctx->match_last = match_last;
1008 re_free (sifted_states);
1009 re_free (lim_states);
1013 /* Acquire an initial state and return it.
1014 We must select appropriate initial state depending on the context,
1015 since initial states may have constraints like "\<", "^", etc.. */
1017 static inline re_dfastate_t *
1018 __attribute ((always_inline)) internal_function
1019 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1022 re_dfa_t *const dfa = mctx->dfa;
1023 if (dfa->init_state->has_constraint)
1025 unsigned int context;
1026 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1027 if (IS_WORD_CONTEXT (context))
1028 return dfa->init_state_word;
1029 else if (IS_ORDINARY_CONTEXT (context))
1030 return dfa->init_state;
1031 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1032 return dfa->init_state_begbuf;
1033 else if (IS_NEWLINE_CONTEXT (context))
1034 return dfa->init_state_nl;
1035 else if (IS_BEGBUF_CONTEXT (context))
1037 /* It is relatively rare case, then calculate on demand. */
1038 return re_acquire_state_context (err, dfa,
1039 dfa->init_state->entrance_nodes,
1043 /* Must not happen? */
1044 return dfa->init_state;
1047 return dfa->init_state;
1050 /* Check whether the regular expression match input string INPUT or not,
1051 and return the index where the matching end. Return REG_MISSING if
1052 there is no match, and return REG_ERROR in case of an error.
1053 FL_LONGEST_MATCH means we want the POSIX longest matching.
1054 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1055 next place where we may want to try matching.
1056 Note that the matcher assume that the maching starts from the current
1057 index of the buffer. */
1061 check_matching (re_match_context_t *mctx, int fl_longest_match,
1064 re_dfa_t *const dfa = mctx->dfa;
1067 Idx match_last = REG_MISSING;
1068 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1069 re_dfastate_t *cur_state;
1070 int at_init_state = p_match_first != NULL;
1071 Idx next_start_idx = cur_str_idx;
1074 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1075 /* An initial state must not be NULL (invalid). */
1076 if (BE (cur_state == NULL, 0))
1078 assert (err == REG_ESPACE);
1082 if (mctx->state_log != NULL)
1084 mctx->state_log[cur_str_idx] = cur_state;
1086 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1087 later. E.g. Processing back references. */
1088 if (BE (dfa->nbackref, 0))
1091 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1092 if (BE (err != REG_NOERROR, 0))
1095 if (cur_state->has_backref)
1097 err = transit_state_bkref (mctx, &cur_state->nodes);
1098 if (BE (err != REG_NOERROR, 0))
1104 /* If the RE accepts NULL string. */
1105 if (BE (cur_state->halt, 0))
1107 if (!cur_state->has_constraint
1108 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1110 if (!fl_longest_match)
1114 match_last = cur_str_idx;
1120 while (!re_string_eoi (&mctx->input))
1122 re_dfastate_t *old_state = cur_state;
1123 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1125 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1126 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1127 && mctx->input.valid_len < mctx->input.len))
1129 err = extend_buffers (mctx);
1130 if (BE (err != REG_NOERROR, 0))
1132 assert (err == REG_ESPACE);
1137 cur_state = transit_state (&err, mctx, cur_state);
1138 if (mctx->state_log != NULL)
1139 cur_state = merge_state_with_log (&err, mctx, cur_state);
1141 if (cur_state == NULL)
1143 /* Reached the invalid state or an error. Try to recover a valid
1144 state using the state log, if available and if we have not
1145 already found a valid (even if not the longest) match. */
1146 if (BE (err != REG_NOERROR, 0))
1149 if (mctx->state_log == NULL
1150 || (match && !fl_longest_match)
1151 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1155 if (BE (at_init_state, 0))
1157 if (old_state == cur_state)
1158 next_start_idx = next_char_idx;
1163 if (cur_state->halt)
1165 /* Reached a halt state.
1166 Check the halt state can satisfy the current context. */
1167 if (!cur_state->has_constraint
1168 || check_halt_state_context (mctx, cur_state,
1169 re_string_cur_idx (&mctx->input)))
1171 /* We found an appropriate halt state. */
1172 match_last = re_string_cur_idx (&mctx->input);
1175 /* We found a match, do not modify match_first below. */
1176 p_match_first = NULL;
1177 if (!fl_longest_match)
1184 *p_match_first += next_start_idx;
1189 /* Check NODE match the current context. */
1193 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1195 re_token_type_t type = dfa->nodes[node].type;
1196 unsigned int constraint = dfa->nodes[node].constraint;
1197 if (type != END_OF_RE)
1201 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1206 /* Check the halt state STATE match the current context.
1207 Return 0 if not match, if the node, STATE has, is a halt node and
1208 match the context, return the node. */
1212 check_halt_state_context (const re_match_context_t *mctx,
1213 const re_dfastate_t *state, Idx idx)
1216 unsigned int context;
1218 assert (state->halt);
1220 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1221 for (i = 0; i < state->nodes.nelem; ++i)
1222 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1223 return state->nodes.elems[i];
1227 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1228 corresponding to the DFA).
1229 Return the destination node, and update EPS_VIA_NODES;
1230 return REG_MISSING in case of errors. */
1234 proceed_next_node (const re_match_context_t *mctx,
1235 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1236 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1238 re_dfa_t *const dfa = mctx->dfa;
1241 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1243 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1244 re_node_set *edests = &dfa->edests[node];
1246 err = re_node_set_insert (eps_via_nodes, node);
1247 if (BE (err < 0, 0))
1249 /* Pick up a valid destination, or return REG_MISSING if none
1251 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1253 Idx candidate = edests->elems[i];
1254 if (!re_node_set_contains (cur_nodes, candidate))
1256 if (dest_node == REG_MISSING)
1257 dest_node = candidate;
1261 /* In order to avoid infinite loop like "(a*)*", return the second
1262 epsilon-transition if the first was already considered. */
1263 if (re_node_set_contains (eps_via_nodes, dest_node))
1266 /* Otherwise, push the second epsilon-transition on the fail stack. */
1268 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1272 /* We know we are going to exit. */
1281 re_token_type_t type = dfa->nodes[node].type;
1283 #ifdef RE_ENABLE_I18N
1284 if (dfa->nodes[node].accept_mb)
1285 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1287 #endif /* RE_ENABLE_I18N */
1288 if (type == OP_BACK_REF)
1290 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1291 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1294 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1298 char *buf = (char *) re_string_get_buffer (&mctx->input);
1299 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1308 err = re_node_set_insert (eps_via_nodes, node);
1309 if (BE (err < 0, 0))
1311 dest_node = dfa->edests[node].elems[0];
1312 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1319 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1321 Idx dest_node = dfa->nexts[node];
1322 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1323 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1324 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1327 re_node_set_empty (eps_via_nodes);
1334 static reg_errcode_t
1336 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1337 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1340 Idx num = fs->num++;
1341 if (fs->num == fs->alloc)
1343 struct re_fail_stack_ent_t *new_array =
1344 re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1345 if (new_array == NULL)
1348 fs->stack = new_array;
1350 fs->stack[num].idx = str_idx;
1351 fs->stack[num].node = dest_node;
1352 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1353 if (fs->stack[num].regs == NULL)
1355 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1356 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1362 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1363 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1365 Idx num = --fs->num;
1366 assert (REG_VALID_INDEX (num));
1367 *pidx = fs->stack[num].idx;
1368 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1369 re_node_set_free (eps_via_nodes);
1370 re_free (fs->stack[num].regs);
1371 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1372 return fs->stack[num].node;
1375 /* Set the positions where the subexpressions are starts/ends to registers
1377 Note: We assume that pmatch[0] is already set, and
1378 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1380 static reg_errcode_t
1382 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1383 size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1385 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1387 re_node_set eps_via_nodes;
1388 struct re_fail_stack_t *fs;
1389 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1390 regmatch_t *prev_idx_match;
1391 int prev_idx_match_malloced = 0;
1394 assert (nmatch > 1);
1395 assert (mctx->state_log != NULL);
1400 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1401 if (fs->stack == NULL)
1407 cur_node = dfa->init_node;
1408 re_node_set_init_empty (&eps_via_nodes);
1410 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1411 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1414 prev_idx_match = re_malloc (regmatch_t, nmatch);
1415 if (prev_idx_match == NULL)
1417 free_fail_stack_return (fs);
1420 prev_idx_match_malloced = 1;
1422 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1424 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1426 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1428 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1433 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1434 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1436 if (reg_idx == nmatch)
1438 re_node_set_free (&eps_via_nodes);
1439 if (prev_idx_match_malloced)
1440 re_free (prev_idx_match);
1441 return free_fail_stack_return (fs);
1443 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1448 re_node_set_free (&eps_via_nodes);
1449 if (prev_idx_match_malloced)
1450 re_free (prev_idx_match);
1455 /* Proceed to next node. */
1456 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1457 &eps_via_nodes, fs);
1459 if (BE (! REG_VALID_INDEX (cur_node), 0))
1461 if (BE (cur_node == REG_ERROR, 0))
1463 re_node_set_free (&eps_via_nodes);
1464 if (prev_idx_match_malloced)
1465 re_free (prev_idx_match);
1466 free_fail_stack_return (fs);
1470 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1481 re_node_set_free (&eps_via_nodes);
1482 if (prev_idx_match_malloced)
1483 re_free (prev_idx_match);
1484 return free_fail_stack_return (fs);
1487 static reg_errcode_t
1489 free_fail_stack_return (struct re_fail_stack_t *fs)
1494 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1496 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1497 re_free (fs->stack[fs_idx].regs);
1499 re_free (fs->stack);
1506 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1507 Idx cur_node, Idx cur_idx, Idx nmatch)
1509 int type = dfa->nodes[cur_node].type;
1510 if (type == OP_OPEN_SUBEXP)
1512 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1514 /* We are at the first node of this sub expression. */
1515 if (reg_num < nmatch)
1517 pmatch[reg_num].rm_so = cur_idx;
1518 pmatch[reg_num].rm_eo = -1;
1521 else if (type == OP_CLOSE_SUBEXP)
1523 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1524 if (reg_num < nmatch)
1526 /* We are at the last node of this sub expression. */
1527 if (pmatch[reg_num].rm_so < cur_idx)
1529 pmatch[reg_num].rm_eo = cur_idx;
1530 /* This is a non-empty match or we are not inside an optional
1531 subexpression. Accept this right away. */
1532 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1536 if (dfa->nodes[cur_node].opt_subexp
1537 && prev_idx_match[reg_num].rm_so != -1)
1538 /* We transited through an empty match for an optional
1539 subexpression, like (a?)*, and this is not the subexp's
1540 first match. Copy back the old content of the registers
1541 so that matches of an inner subexpression are undone as
1542 well, like in ((a?))*. */
1543 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1545 /* We completed a subexpression, but it may be part of
1546 an optional one, so do not update PREV_IDX_MATCH. */
1547 pmatch[reg_num].rm_eo = cur_idx;
1553 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1554 and sift the nodes in each states according to the following rules.
1555 Updated state_log will be wrote to STATE_LOG.
1557 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1558 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1559 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1560 the LAST_NODE, we throw away the node `a'.
1561 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1562 string `s' and transit to `b':
1563 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1565 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1566 thrown away, we throw away the node `a'.
1567 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1568 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1570 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1571 we throw away the node `a'. */
1573 #define STATE_NODE_CONTAINS(state,node) \
1574 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1576 static reg_errcode_t
1578 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1582 Idx str_idx = sctx->last_str_idx;
1583 re_node_set cur_dest;
1586 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1589 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1590 transit to the last_node and the last_node itself. */
1591 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1592 if (BE (err != REG_NOERROR, 0))
1594 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1595 if (BE (err != REG_NOERROR, 0))
1598 /* Then check each states in the state_log. */
1601 /* Update counters. */
1602 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1603 if (null_cnt > mctx->max_mb_elem_len)
1605 memset (sctx->sifted_states, '\0',
1606 sizeof (re_dfastate_t *) * str_idx);
1607 re_node_set_free (&cur_dest);
1610 re_node_set_empty (&cur_dest);
1613 if (mctx->state_log[str_idx])
1615 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1616 if (BE (err != REG_NOERROR, 0))
1620 /* Add all the nodes which satisfy the following conditions:
1621 - It can epsilon transit to a node in CUR_DEST.
1623 And update state_log. */
1624 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1625 if (BE (err != REG_NOERROR, 0))
1630 re_node_set_free (&cur_dest);
1634 static reg_errcode_t
1636 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1637 Idx str_idx, re_node_set *cur_dest)
1639 re_dfa_t *const dfa = mctx->dfa;
1640 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1643 /* Then build the next sifted state.
1644 We build the next sifted state on `cur_dest', and update
1645 `sifted_states[str_idx]' with `cur_dest'.
1647 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1648 `cur_src' points the node_set of the old `state_log[str_idx]'
1649 (with the epsilon nodes pre-filtered out). */
1650 for (i = 0; i < cur_src->nelem; i++)
1652 Idx prev_node = cur_src->elems[i];
1657 re_token_type_t type = dfa->nodes[prev_node].type;
1658 assert (!IS_EPSILON_NODE (type));
1660 #ifdef RE_ENABLE_I18N
1661 /* If the node may accept `multi byte'. */
1662 if (dfa->nodes[prev_node].accept_mb)
1663 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1664 str_idx, sctx->last_str_idx);
1665 #endif /* RE_ENABLE_I18N */
1667 /* We don't check backreferences here.
1668 See update_cur_sifted_state(). */
1670 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1671 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1672 dfa->nexts[prev_node]))
1678 if (sctx->limits.nelem)
1680 Idx to_idx = str_idx + naccepted;
1681 if (check_dst_limits (mctx, &sctx->limits,
1682 dfa->nexts[prev_node], to_idx,
1683 prev_node, str_idx))
1686 ret = re_node_set_insert (cur_dest, prev_node);
1687 if (BE (ret == -1, 0))
1694 /* Helper functions. */
1696 static reg_errcode_t
1698 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1700 Idx top = mctx->state_log_top;
1702 if (next_state_log_idx >= mctx->input.bufs_len
1703 || (next_state_log_idx >= mctx->input.valid_len
1704 && mctx->input.valid_len < mctx->input.len))
1707 err = extend_buffers (mctx);
1708 if (BE (err != REG_NOERROR, 0))
1712 if (top < next_state_log_idx)
1714 memset (mctx->state_log + top + 1, '\0',
1715 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1716 mctx->state_log_top = next_state_log_idx;
1721 static reg_errcode_t
1723 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1728 for (st_idx = 0; st_idx < num; ++st_idx)
1730 if (dst[st_idx] == NULL)
1731 dst[st_idx] = src[st_idx];
1732 else if (src[st_idx] != NULL)
1734 re_node_set merged_set;
1735 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1736 &src[st_idx]->nodes);
1737 if (BE (err != REG_NOERROR, 0))
1739 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1740 re_node_set_free (&merged_set);
1741 if (BE (err != REG_NOERROR, 0))
1748 static reg_errcode_t
1750 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1751 Idx str_idx, re_node_set *dest_nodes)
1753 re_dfa_t *const dfa = mctx->dfa;
1755 const re_node_set *candidates;
1756 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1757 : &mctx->state_log[str_idx]->nodes);
1759 if (dest_nodes->nelem == 0)
1760 sctx->sifted_states[str_idx] = NULL;
1765 /* At first, add the nodes which can epsilon transit to a node in
1767 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1768 if (BE (err != REG_NOERROR, 0))
1771 /* Then, check the limitations in the current sift_context. */
1772 if (sctx->limits.nelem)
1774 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1775 mctx->bkref_ents, str_idx);
1776 if (BE (err != REG_NOERROR, 0))
1781 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1782 if (BE (err != REG_NOERROR, 0))
1786 if (candidates && mctx->state_log[str_idx]->has_backref)
1788 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1789 if (BE (err != REG_NOERROR, 0))
1795 static reg_errcode_t
1797 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1798 const re_node_set *candidates)
1800 reg_errcode_t err = REG_NOERROR;
1803 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1804 if (BE (err != REG_NOERROR, 0))
1807 if (!state->inveclosure.alloc)
1809 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1810 if (BE (err != REG_NOERROR, 0))
1812 for (i = 0; i < dest_nodes->nelem; i++)
1813 re_node_set_merge (&state->inveclosure,
1814 dfa->inveclosures + dest_nodes->elems[i]);
1816 return re_node_set_add_intersect (dest_nodes, candidates,
1817 &state->inveclosure);
1820 static reg_errcode_t
1822 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1823 const re_node_set *candidates)
1827 re_node_set *inv_eclosure = dfa->inveclosures + node;
1828 re_node_set except_nodes;
1829 re_node_set_init_empty (&except_nodes);
1830 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1832 Idx cur_node = inv_eclosure->elems[ecl_idx];
1833 if (cur_node == node)
1835 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1837 Idx edst1 = dfa->edests[cur_node].elems[0];
1838 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1839 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1840 if ((!re_node_set_contains (inv_eclosure, edst1)
1841 && re_node_set_contains (dest_nodes, edst1))
1842 || (REG_VALID_NONZERO_INDEX (edst2)
1843 && !re_node_set_contains (inv_eclosure, edst2)
1844 && re_node_set_contains (dest_nodes, edst2)))
1846 err = re_node_set_add_intersect (&except_nodes, candidates,
1847 dfa->inveclosures + cur_node);
1848 if (BE (err != REG_NOERROR, 0))
1850 re_node_set_free (&except_nodes);
1856 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1858 Idx cur_node = inv_eclosure->elems[ecl_idx];
1859 if (!re_node_set_contains (&except_nodes, cur_node))
1861 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1862 re_node_set_remove_at (dest_nodes, idx);
1865 re_node_set_free (&except_nodes);
1871 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1872 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1874 re_dfa_t *const dfa = mctx->dfa;
1875 Idx lim_idx, src_pos, dst_pos;
1877 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1878 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1879 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1882 struct re_backref_cache_entry *ent;
1883 ent = mctx->bkref_ents + limits->elems[lim_idx];
1884 subexp_idx = dfa->nodes[ent->node].opr.idx;
1886 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1887 subexp_idx, dst_node, dst_idx,
1889 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1890 subexp_idx, src_node, src_idx,
1894 <src> <dst> ( <subexp> )
1895 ( <subexp> ) <src> <dst>
1896 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1897 if (src_pos == dst_pos)
1898 continue; /* This is unrelated limitation. */
1907 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1908 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1910 re_dfa_t *const dfa = mctx->dfa;
1911 re_node_set *eclosures = dfa->eclosures + from_node;
1914 /* Else, we are on the boundary: examine the nodes on the epsilon
1916 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1918 Idx node = eclosures->elems[node_idx];
1919 switch (dfa->nodes[node].type)
1922 if (bkref_idx != REG_MISSING)
1924 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1930 if (ent->node != node)
1934 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1935 && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1938 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1939 OP_CLOSE_SUBEXP cases below. But, if the
1940 destination node is the same node as the source
1941 node, don't recurse because it would cause an
1942 infinite loop: a regex that exhibits this behavior
1944 dst = dfa->edests[node].elems[0];
1945 if (dst == from_node)
1949 else /* if (boundaries & 2) */
1954 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1956 if (cpos == -1 /* && (boundaries & 1) */)
1958 if (cpos == 0 && (boundaries & 2))
1962 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1963 ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1965 while (ent++->more);
1969 case OP_OPEN_SUBEXP:
1970 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1974 case OP_CLOSE_SUBEXP:
1975 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1984 return (boundaries & 2) ? 1 : 0;
1989 check_dst_limits_calc_pos (const re_match_context_t *mctx,
1990 Idx limit, Idx subexp_idx,
1991 Idx from_node, Idx str_idx, Idx bkref_idx)
1993 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1996 /* If we are outside the range of the subexpression, return -1 or 1. */
1997 if (str_idx < lim->subexp_from)
2000 if (lim->subexp_to < str_idx)
2003 /* If we are within the subexpression, return 0. */
2004 boundaries = (str_idx == lim->subexp_from);
2005 boundaries |= (str_idx == lim->subexp_to) << 1;
2006 if (boundaries == 0)
2009 /* Else, examine epsilon closure. */
2010 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2011 from_node, bkref_idx);
2014 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2015 which are against limitations from DEST_NODES. */
2017 static reg_errcode_t
2019 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2020 const re_node_set *candidates, re_node_set *limits,
2021 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2024 Idx node_idx, lim_idx;
2026 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2029 struct re_backref_cache_entry *ent;
2030 ent = bkref_ents + limits->elems[lim_idx];
2032 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2033 continue; /* This is unrelated limitation. */
2035 subexp_idx = dfa->nodes[ent->node].opr.idx;
2036 if (ent->subexp_to == str_idx)
2038 Idx ops_node = REG_MISSING;
2039 Idx cls_node = REG_MISSING;
2040 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2042 Idx node = dest_nodes->elems[node_idx];
2043 re_token_type_t type = dfa->nodes[node].type;
2044 if (type == OP_OPEN_SUBEXP
2045 && subexp_idx == dfa->nodes[node].opr.idx)
2047 else if (type == OP_CLOSE_SUBEXP
2048 && subexp_idx == dfa->nodes[node].opr.idx)
2052 /* Check the limitation of the open subexpression. */
2053 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2054 if (REG_VALID_INDEX (ops_node))
2056 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2058 if (BE (err != REG_NOERROR, 0))
2062 /* Check the limitation of the close subexpression. */
2063 if (REG_VALID_INDEX (cls_node))
2064 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2066 Idx node = dest_nodes->elems[node_idx];
2067 if (!re_node_set_contains (dfa->inveclosures + node,
2069 && !re_node_set_contains (dfa->eclosures + node,
2072 /* It is against this limitation.
2073 Remove it form the current sifted state. */
2074 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2076 if (BE (err != REG_NOERROR, 0))
2082 else /* (ent->subexp_to != str_idx) */
2084 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2086 Idx node = dest_nodes->elems[node_idx];
2087 re_token_type_t type = dfa->nodes[node].type;
2088 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2090 if (subexp_idx != dfa->nodes[node].opr.idx)
2092 /* It is against this limitation.
2093 Remove it form the current sifted state. */
2094 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2096 if (BE (err != REG_NOERROR, 0))
2105 static reg_errcode_t
2107 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2108 Idx str_idx, const re_node_set *candidates)
2110 re_dfa_t *const dfa = mctx->dfa;
2113 re_sift_context_t local_sctx;
2114 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2116 if (first_idx == REG_MISSING)
2119 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2121 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2124 re_token_type_t type;
2125 struct re_backref_cache_entry *entry;
2126 node = candidates->elems[node_idx];
2127 type = dfa->nodes[node].type;
2128 /* Avoid infinite loop for the REs like "()\1+". */
2129 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2131 if (type != OP_BACK_REF)
2134 entry = mctx->bkref_ents + first_idx;
2135 enabled_idx = first_idx;
2138 Idx subexp_len, to_idx, dst_node, ret;
2139 re_dfastate_t *cur_state;
2141 if (entry->node != node)
2143 subexp_len = entry->subexp_to - entry->subexp_from;
2144 to_idx = str_idx + subexp_len;
2145 dst_node = (subexp_len ? dfa->nexts[node]
2146 : dfa->edests[node].elems[0]);
2148 if (to_idx > sctx->last_str_idx
2149 || sctx->sifted_states[to_idx] == NULL
2150 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2151 || check_dst_limits (mctx, &sctx->limits, node,
2152 str_idx, dst_node, to_idx))
2155 if (local_sctx.sifted_states == NULL)
2158 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2159 if (BE (err != REG_NOERROR, 0))
2162 local_sctx.last_node = node;
2163 local_sctx.last_str_idx = str_idx;
2164 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2165 if (BE (ret < 0, 0))
2170 cur_state = local_sctx.sifted_states[str_idx];
2171 err = sift_states_backward (mctx, &local_sctx);
2172 if (BE (err != REG_NOERROR, 0))
2174 if (sctx->limited_states != NULL)
2176 err = merge_state_array (dfa, sctx->limited_states,
2177 local_sctx.sifted_states,
2179 if (BE (err != REG_NOERROR, 0))
2182 local_sctx.sifted_states[str_idx] = cur_state;
2183 re_node_set_remove (&local_sctx.limits, enabled_idx);
2185 /* mctx->bkref_ents may have changed, reload the pointer. */
2186 entry = mctx->bkref_ents + enabled_idx;
2188 while (enabled_idx++, entry++->more);
2192 if (local_sctx.sifted_states != NULL)
2194 re_node_set_free (&local_sctx.limits);
2201 #ifdef RE_ENABLE_I18N
2204 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2205 Idx node_idx, Idx str_idx, Idx max_str_idx)
2207 re_dfa_t *const dfa = mctx->dfa;
2209 /* Check the node can accept `multi byte'. */
2210 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2211 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2212 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2213 dfa->nexts[node_idx]))
2214 /* The node can't accept the `multi byte', or the
2215 destination was already thrown away, then the node
2216 could't accept the current input `multi byte'. */
2218 /* Otherwise, it is sure that the node could accept
2219 `naccepted' bytes input. */
2222 #endif /* RE_ENABLE_I18N */
2225 /* Functions for state transition. */
2227 /* Return the next state to which the current state STATE will transit by
2228 accepting the current input byte, and update STATE_LOG if necessary.
2229 If STATE can accept a multibyte char/collating element/back reference
2230 update the destination of STATE_LOG. */
2232 static re_dfastate_t *
2234 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2235 re_dfastate_t *state)
2237 re_dfastate_t **trtable;
2240 #ifdef RE_ENABLE_I18N
2241 /* If the current state can accept multibyte. */
2242 if (BE (state->accept_mb, 0))
2244 *err = transit_state_mb (mctx, state);
2245 if (BE (*err != REG_NOERROR, 0))
2248 #endif /* RE_ENABLE_I18N */
2250 /* Then decide the next state with the single byte. */
2253 /* don't use transition table */
2254 return transit_state_sb (err, mctx, state);
2257 /* Use transition table */
2258 ch = re_string_fetch_byte (&mctx->input);
2261 trtable = state->trtable;
2262 if (BE (trtable != NULL, 1))
2265 trtable = state->word_trtable;
2266 if (BE (trtable != NULL, 1))
2268 unsigned int context;
2270 = re_string_context_at (&mctx->input,
2271 re_string_cur_idx (&mctx->input) - 1,
2273 if (IS_WORD_CONTEXT (context))
2274 return trtable[ch + SBC_MAX];
2279 if (!build_trtable (mctx->dfa, state))
2285 /* Retry, we now have a transition table. */
2289 /* Update the state_log if we need */
2292 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2293 re_dfastate_t *next_state)
2295 re_dfa_t *const dfa = mctx->dfa;
2296 Idx cur_idx = re_string_cur_idx (&mctx->input);
2298 if (cur_idx > mctx->state_log_top)
2300 mctx->state_log[cur_idx] = next_state;
2301 mctx->state_log_top = cur_idx;
2303 else if (mctx->state_log[cur_idx] == 0)
2305 mctx->state_log[cur_idx] = next_state;
2309 re_dfastate_t *pstate;
2310 unsigned int context;
2311 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2312 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2313 the destination of a multibyte char/collating element/
2314 back reference. Then the next state is the union set of
2315 these destinations and the results of the transition table. */
2316 pstate = mctx->state_log[cur_idx];
2317 log_nodes = pstate->entrance_nodes;
2318 if (next_state != NULL)
2320 table_nodes = next_state->entrance_nodes;
2321 *err = re_node_set_init_union (&next_nodes, table_nodes,
2323 if (BE (*err != REG_NOERROR, 0))
2327 next_nodes = *log_nodes;
2328 /* Note: We already add the nodes of the initial state,
2329 then we don't need to add them here. */
2331 context = re_string_context_at (&mctx->input,
2332 re_string_cur_idx (&mctx->input) - 1,
2334 next_state = mctx->state_log[cur_idx]
2335 = re_acquire_state_context (err, dfa, &next_nodes, context);
2336 /* We don't need to check errors here, since the return value of
2337 this function is next_state and ERR is already set. */
2339 if (table_nodes != NULL)
2340 re_node_set_free (&next_nodes);
2343 if (BE (dfa->nbackref, 0) && next_state != NULL)
2345 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2346 later. We must check them here, since the back references in the
2347 next state might use them. */
2348 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2350 if (BE (*err != REG_NOERROR, 0))
2353 /* If the next state has back references. */
2354 if (next_state->has_backref)
2356 *err = transit_state_bkref (mctx, &next_state->nodes);
2357 if (BE (*err != REG_NOERROR, 0))
2359 next_state = mctx->state_log[cur_idx];
2366 /* Skip bytes in the input that correspond to part of a
2367 multi-byte match, then look in the log for a state
2368 from which to restart matching. */
2369 static re_dfastate_t *
2371 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2373 re_dfastate_t *cur_state = NULL;
2376 Idx max = mctx->state_log_top;
2377 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2381 if (++cur_str_idx > max)
2383 re_string_skip_bytes (&mctx->input, 1);
2385 while (mctx->state_log[cur_str_idx] == NULL);
2387 cur_state = merge_state_with_log (err, mctx, NULL);
2389 while (*err == REG_NOERROR && cur_state == NULL);
2393 /* Helper functions for transit_state. */
2395 /* From the node set CUR_NODES, pick up the nodes whose types are
2396 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2397 expression. And register them to use them later for evaluating the
2398 correspoding back references. */
2400 static reg_errcode_t
2402 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2405 re_dfa_t *const dfa = mctx->dfa;
2409 /* TODO: This isn't efficient.
2410 Because there might be more than one nodes whose types are
2411 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2414 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2416 Idx node = cur_nodes->elems[node_idx];
2417 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2418 && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2419 && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2421 err = match_ctx_add_subtop (mctx, node, str_idx);
2422 if (BE (err != REG_NOERROR, 0))
2430 /* Return the next state to which the current state STATE will transit by
2431 accepting the current input byte. */
2433 static re_dfastate_t *
2434 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2435 re_dfastate_t *state)
2437 re_dfa_t *const dfa = mctx->dfa;
2438 re_node_set next_nodes;
2439 re_dfastate_t *next_state;
2440 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2441 unsigned int context;
2443 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2444 if (BE (*err != REG_NOERROR, 0))
2446 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2448 Idx cur_node = state->nodes.elems[node_cnt];
2449 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2451 *err = re_node_set_merge (&next_nodes,
2452 dfa->eclosures + dfa->nexts[cur_node]);
2453 if (BE (*err != REG_NOERROR, 0))
2455 re_node_set_free (&next_nodes);
2460 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2461 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2462 /* We don't need to check errors here, since the return value of
2463 this function is next_state and ERR is already set. */
2465 re_node_set_free (&next_nodes);
2466 re_string_skip_bytes (&mctx->input, 1);
2471 #ifdef RE_ENABLE_I18N
2472 static reg_errcode_t
2474 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2476 re_dfa_t *const dfa = mctx->dfa;
2480 for (i = 0; i < pstate->nodes.nelem; ++i)
2482 re_node_set dest_nodes, *new_nodes;
2483 Idx cur_node_idx = pstate->nodes.elems[i];
2486 unsigned int context;
2487 re_dfastate_t *dest_state;
2489 if (!dfa->nodes[cur_node_idx].accept_mb)
2492 if (dfa->nodes[cur_node_idx].constraint)
2494 context = re_string_context_at (&mctx->input,
2495 re_string_cur_idx (&mctx->input),
2497 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2502 /* How many bytes the node can accept? */
2503 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2504 re_string_cur_idx (&mctx->input));
2508 /* The node can accepts `naccepted' bytes. */
2509 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2510 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2511 : mctx->max_mb_elem_len);
2512 err = clean_state_log_if_needed (mctx, dest_idx);
2513 if (BE (err != REG_NOERROR, 0))
2516 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2518 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2520 dest_state = mctx->state_log[dest_idx];
2521 if (dest_state == NULL)
2522 dest_nodes = *new_nodes;
2525 err = re_node_set_init_union (&dest_nodes,
2526 dest_state->entrance_nodes, new_nodes);
2527 if (BE (err != REG_NOERROR, 0))
2530 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2531 mctx->state_log[dest_idx]
2532 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2533 if (dest_state != NULL)
2534 re_node_set_free (&dest_nodes);
2535 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2540 #endif /* RE_ENABLE_I18N */
2542 static reg_errcode_t
2544 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2546 re_dfa_t *const dfa = mctx->dfa;
2549 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2551 for (i = 0; i < nodes->nelem; ++i)
2553 Idx dest_str_idx, prev_nelem, bkc_idx;
2554 Idx node_idx = nodes->elems[i];
2555 unsigned int context;
2556 const re_token_t *node = dfa->nodes + node_idx;
2557 re_node_set *new_dest_nodes;
2559 /* Check whether `node' is a backreference or not. */
2560 if (node->type != OP_BACK_REF)
2563 if (node->constraint)
2565 context = re_string_context_at (&mctx->input, cur_str_idx,
2567 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2571 /* `node' is a backreference.
2572 Check the substring which the substring matched. */
2573 bkc_idx = mctx->nbkref_ents;
2574 err = get_subexp (mctx, node_idx, cur_str_idx);
2575 if (BE (err != REG_NOERROR, 0))
2578 /* And add the epsilon closures (which is `new_dest_nodes') of
2579 the backreference to appropriate state_log. */
2581 assert (dfa->nexts[node_idx] != REG_MISSING);
2583 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2586 re_dfastate_t *dest_state;
2587 struct re_backref_cache_entry *bkref_ent;
2588 bkref_ent = mctx->bkref_ents + bkc_idx;
2589 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2591 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2592 new_dest_nodes = (subexp_len == 0
2593 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2594 : dfa->eclosures + dfa->nexts[node_idx]);
2595 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2596 - bkref_ent->subexp_from);
2597 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2599 dest_state = mctx->state_log[dest_str_idx];
2600 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2601 : mctx->state_log[cur_str_idx]->nodes.nelem);
2602 /* Add `new_dest_node' to state_log. */
2603 if (dest_state == NULL)
2605 mctx->state_log[dest_str_idx]
2606 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2608 if (BE (mctx->state_log[dest_str_idx] == NULL
2609 && err != REG_NOERROR, 0))
2614 re_node_set dest_nodes;
2615 err = re_node_set_init_union (&dest_nodes,
2616 dest_state->entrance_nodes,
2618 if (BE (err != REG_NOERROR, 0))
2620 re_node_set_free (&dest_nodes);
2623 mctx->state_log[dest_str_idx]
2624 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2625 re_node_set_free (&dest_nodes);
2626 if (BE (mctx->state_log[dest_str_idx] == NULL
2627 && err != REG_NOERROR, 0))
2630 /* We need to check recursively if the backreference can epsilon
2633 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2635 err = check_subexp_matching_top (mctx, new_dest_nodes,
2637 if (BE (err != REG_NOERROR, 0))
2639 err = transit_state_bkref (mctx, new_dest_nodes);
2640 if (BE (err != REG_NOERROR, 0))
2650 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2651 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2652 Note that we might collect inappropriate candidates here.
2653 However, the cost of checking them strictly here is too high, then we
2654 delay these checking for prune_impossible_nodes(). */
2656 static reg_errcode_t
2658 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2660 re_dfa_t *const dfa = mctx->dfa;
2661 Idx subexp_num, sub_top_idx;
2662 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2663 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2664 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2665 if (cache_idx != REG_MISSING)
2667 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2669 if (entry->node == bkref_node)
2670 return REG_NOERROR; /* We already checked it. */
2671 while (entry++->more);
2674 subexp_num = dfa->nodes[bkref_node].opr.idx;
2676 /* For each sub expression */
2677 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2680 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2681 re_sub_match_last_t *sub_last;
2682 Idx sub_last_idx, sl_str, bkref_str_off;
2684 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2685 continue; /* It isn't related. */
2687 sl_str = sub_top->str_idx;
2688 bkref_str_off = bkref_str_idx;
2689 /* At first, check the last node of sub expressions we already
2691 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2693 regoff_t sl_str_diff;
2694 sub_last = sub_top->lasts[sub_last_idx];
2695 sl_str_diff = sub_last->str_idx - sl_str;
2696 /* The matched string by the sub expression match with the substring
2697 at the back reference? */
2698 if (sl_str_diff > 0)
2700 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2702 /* Not enough chars for a successful match. */
2703 if (bkref_str_off + sl_str_diff > mctx->input.len)
2706 err = clean_state_log_if_needed (mctx,
2709 if (BE (err != REG_NOERROR, 0))
2711 buf = (const char *) re_string_get_buffer (&mctx->input);
2713 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2714 break; /* We don't need to search this sub expression any more. */
2716 bkref_str_off += sl_str_diff;
2717 sl_str += sl_str_diff;
2718 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2721 /* Reload buf, since the preceding call might have reallocated
2723 buf = (const char *) re_string_get_buffer (&mctx->input);
2725 if (err == REG_NOMATCH)
2727 if (BE (err != REG_NOERROR, 0))
2731 if (sub_last_idx < sub_top->nlasts)
2733 if (sub_last_idx > 0)
2735 /* Then, search for the other last nodes of the sub expression. */
2736 for (; sl_str <= bkref_str_idx; ++sl_str)
2739 regoff_t sl_str_off;
2740 const re_node_set *nodes;
2741 sl_str_off = sl_str - sub_top->str_idx;
2742 /* The matched string by the sub expression match with the substring
2743 at the back reference? */
2746 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2748 /* If we are at the end of the input, we cannot match. */
2749 if (bkref_str_off >= mctx->input.len)
2752 err = extend_buffers (mctx);
2753 if (BE (err != REG_NOERROR, 0))
2756 buf = (const char *) re_string_get_buffer (&mctx->input);
2758 if (buf [bkref_str_off++] != buf[sl_str - 1])
2759 break; /* We don't need to search this sub expression
2762 if (mctx->state_log[sl_str] == NULL)
2764 /* Does this state have a ')' of the sub expression? */
2765 nodes = &mctx->state_log[sl_str]->nodes;
2766 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2767 if (cls_node == REG_MISSING)
2769 if (sub_top->path == NULL)
2771 sub_top->path = re_calloc (state_array_t,
2772 sl_str - sub_top->str_idx + 1);
2773 if (sub_top->path == NULL)
2776 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2777 in the current context? */
2778 err = check_arrival (mctx, sub_top->path, sub_top->node,
2779 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2780 if (err == REG_NOMATCH)
2782 if (BE (err != REG_NOERROR, 0))
2784 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2785 if (BE (sub_last == NULL, 0))
2787 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2789 if (err == REG_NOMATCH)
2796 /* Helper functions for get_subexp(). */
2798 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2799 If it can arrive, register the sub expression expressed with SUB_TOP
2802 static reg_errcode_t
2804 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2805 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2809 /* Can the subexpression arrive the back reference? */
2810 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2811 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2812 if (err != REG_NOERROR)
2814 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2816 if (BE (err != REG_NOERROR, 0))
2818 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2819 return clean_state_log_if_needed (mctx, to_idx);
2822 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2823 Search '(' if FL_OPEN, or search ')' otherwise.
2824 TODO: This function isn't efficient...
2825 Because there might be more than one nodes whose types are
2826 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2832 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2833 Idx subexp_idx, int type)
2836 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2838 Idx cls_node = nodes->elems[cls_idx];
2839 const re_token_t *node = dfa->nodes + cls_node;
2840 if (node->type == type
2841 && node->opr.idx == subexp_idx)
2847 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2848 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2850 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2852 static reg_errcode_t
2854 check_arrival (re_match_context_t *mctx, state_array_t *path,
2855 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2858 re_dfa_t *const dfa = mctx->dfa;
2860 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2861 re_dfastate_t *cur_state = NULL;
2862 re_node_set *cur_nodes, next_nodes;
2863 re_dfastate_t **backup_state_log;
2864 unsigned int context;
2866 subexp_num = dfa->nodes[top_node].opr.idx;
2867 /* Extend the buffer if we need. */
2868 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2870 re_dfastate_t **new_array;
2871 Idx old_alloc = path->alloc;
2872 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2873 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2874 if (new_array == NULL)
2876 path->alloc = old_alloc;
2879 path->array = new_array;
2880 memset (new_array + old_alloc, '\0',
2881 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2884 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2886 /* Temporary modify MCTX. */
2887 backup_state_log = mctx->state_log;
2888 backup_cur_idx = mctx->input.cur_idx;
2889 mctx->state_log = path->array;
2890 mctx->input.cur_idx = str_idx;
2892 /* Setup initial node set. */
2893 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2894 if (str_idx == top_str)
2896 err = re_node_set_init_1 (&next_nodes, top_node);
2897 if (BE (err != REG_NOERROR, 0))
2899 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2900 if (BE (err != REG_NOERROR, 0))
2902 re_node_set_free (&next_nodes);
2908 cur_state = mctx->state_log[str_idx];
2909 if (cur_state && cur_state->has_backref)
2911 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2912 if (BE ( err != REG_NOERROR, 0))
2916 re_node_set_init_empty (&next_nodes);
2918 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2920 if (next_nodes.nelem)
2922 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2924 if (BE ( err != REG_NOERROR, 0))
2926 re_node_set_free (&next_nodes);
2930 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2931 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2933 re_node_set_free (&next_nodes);
2936 mctx->state_log[str_idx] = cur_state;
2939 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2941 re_node_set_empty (&next_nodes);
2942 if (mctx->state_log[str_idx + 1])
2944 err = re_node_set_merge (&next_nodes,
2945 &mctx->state_log[str_idx + 1]->nodes);
2946 if (BE (err != REG_NOERROR, 0))
2948 re_node_set_free (&next_nodes);
2954 err = check_arrival_add_next_nodes (mctx, str_idx,
2955 &cur_state->non_eps_nodes, &next_nodes);
2956 if (BE (err != REG_NOERROR, 0))
2958 re_node_set_free (&next_nodes);
2963 if (next_nodes.nelem)
2965 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2966 if (BE (err != REG_NOERROR, 0))
2968 re_node_set_free (&next_nodes);
2971 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2973 if (BE ( err != REG_NOERROR, 0))
2975 re_node_set_free (&next_nodes);
2979 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2980 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2981 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2983 re_node_set_free (&next_nodes);
2986 mctx->state_log[str_idx] = cur_state;
2987 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2989 re_node_set_free (&next_nodes);
2990 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2991 : &mctx->state_log[last_str]->nodes);
2992 path->next_idx = str_idx;
2995 mctx->state_log = backup_state_log;
2996 mctx->input.cur_idx = backup_cur_idx;
2998 /* Then check the current node set has the node LAST_NODE. */
2999 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3005 /* Helper functions for check_arrival. */
3007 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3009 TODO: This function is similar to the functions transit_state*(),
3010 however this function has many additional works.
3011 Can't we unify them? */
3013 static reg_errcode_t
3015 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3016 re_node_set *cur_nodes,
3017 re_node_set *next_nodes)
3019 re_dfa_t *const dfa = mctx->dfa;
3023 re_node_set union_set;
3024 re_node_set_init_empty (&union_set);
3025 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3028 Idx cur_node = cur_nodes->elems[cur_idx];
3030 re_token_type_t type = dfa->nodes[cur_node].type;
3031 assert (!IS_EPSILON_NODE (type));
3033 #ifdef RE_ENABLE_I18N
3034 /* If the node may accept `multi byte'. */
3035 if (dfa->nodes[cur_node].accept_mb)
3037 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3041 re_dfastate_t *dest_state;
3042 Idx next_node = dfa->nexts[cur_node];
3043 Idx next_idx = str_idx + naccepted;
3044 dest_state = mctx->state_log[next_idx];
3045 re_node_set_empty (&union_set);
3048 err = re_node_set_merge (&union_set, &dest_state->nodes);
3049 if (BE (err != REG_NOERROR, 0))
3051 re_node_set_free (&union_set);
3055 result = re_node_set_insert (&union_set, next_node);
3056 if (BE (result < 0, 0))
3058 re_node_set_free (&union_set);
3061 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3063 if (BE (mctx->state_log[next_idx] == NULL
3064 && err != REG_NOERROR, 0))
3066 re_node_set_free (&union_set);
3071 #endif /* RE_ENABLE_I18N */
3073 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3075 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3076 if (BE (result < 0, 0))
3078 re_node_set_free (&union_set);
3083 re_node_set_free (&union_set);
3087 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3088 CUR_NODES, however exclude the nodes which are:
3089 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3090 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3093 static reg_errcode_t
3095 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3096 Idx ex_subexp, int type)
3099 Idx idx, outside_node;
3100 re_node_set new_nodes;
3102 assert (cur_nodes->nelem);
3104 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3105 if (BE (err != REG_NOERROR, 0))
3107 /* Create a new node set NEW_NODES with the nodes which are epsilon
3108 closures of the node in CUR_NODES. */
3110 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3112 Idx cur_node = cur_nodes->elems[idx];
3113 re_node_set *eclosure = dfa->eclosures + cur_node;
3114 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3115 if (outside_node == REG_MISSING)
3117 /* There are no problematic nodes, just merge them. */
3118 err = re_node_set_merge (&new_nodes, eclosure);
3119 if (BE (err != REG_NOERROR, 0))
3121 re_node_set_free (&new_nodes);
3127 /* There are problematic nodes, re-calculate incrementally. */
3128 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3130 if (BE (err != REG_NOERROR, 0))
3132 re_node_set_free (&new_nodes);
3137 re_node_set_free (cur_nodes);
3138 *cur_nodes = new_nodes;
3142 /* Helper function for check_arrival_expand_ecl.
3143 Check incrementally the epsilon closure of TARGET, and if it isn't
3144 problematic append it to DST_NODES. */
3146 static reg_errcode_t
3148 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3149 Idx target, Idx ex_subexp, int type)
3152 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3156 if (dfa->nodes[cur_node].type == type
3157 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3159 if (type == OP_CLOSE_SUBEXP)
3161 err = re_node_set_insert (dst_nodes, cur_node);
3162 if (BE (err == -1, 0))
3167 err = re_node_set_insert (dst_nodes, cur_node);
3168 if (BE (err == -1, 0))
3170 if (dfa->edests[cur_node].nelem == 0)
3172 if (dfa->edests[cur_node].nelem == 2)
3175 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3176 dfa->edests[cur_node].elems[1],
3178 if (BE (ret != REG_NOERROR, 0))
3181 cur_node = dfa->edests[cur_node].elems[0];
3187 /* For all the back references in the current state, calculate the
3188 destination of the back references by the appropriate entry
3189 in MCTX->BKREF_ENTS. */
3191 static reg_errcode_t
3193 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3194 Idx cur_str, Idx subexp_num, int type)
3196 re_dfa_t *const dfa = mctx->dfa;
3198 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3199 struct re_backref_cache_entry *ent;
3201 if (cache_idx_start == REG_MISSING)
3205 ent = mctx->bkref_ents + cache_idx_start;
3208 Idx to_idx, next_node;
3210 /* Is this entry ENT is appropriate? */
3211 if (!re_node_set_contains (cur_nodes, ent->node))
3214 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3215 /* Calculate the destination of the back reference, and append it
3216 to MCTX->STATE_LOG. */
3217 if (to_idx == cur_str)
3219 /* The backreference did epsilon transit, we must re-check all the
3220 node in the current state. */
3221 re_node_set new_dests;
3222 reg_errcode_t err2, err3;
3223 next_node = dfa->edests[ent->node].elems[0];
3224 if (re_node_set_contains (cur_nodes, next_node))
3226 err = re_node_set_init_1 (&new_dests, next_node);
3227 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3228 err3 = re_node_set_merge (cur_nodes, &new_dests);
3229 re_node_set_free (&new_dests);
3230 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3231 || err3 != REG_NOERROR, 0))
3233 err = (err != REG_NOERROR ? err
3234 : (err2 != REG_NOERROR ? err2 : err3));
3237 /* TODO: It is still inefficient... */
3242 re_node_set union_set;
3243 next_node = dfa->nexts[ent->node];
3244 if (mctx->state_log[to_idx])
3247 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3250 err = re_node_set_init_copy (&union_set,
3251 &mctx->state_log[to_idx]->nodes);
3252 ret = re_node_set_insert (&union_set, next_node);
3253 if (BE (err != REG_NOERROR || ret < 0, 0))
3255 re_node_set_free (&union_set);
3256 err = err != REG_NOERROR ? err : REG_ESPACE;
3262 err = re_node_set_init_1 (&union_set, next_node);
3263 if (BE (err != REG_NOERROR, 0))
3266 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3267 re_node_set_free (&union_set);
3268 if (BE (mctx->state_log[to_idx] == NULL
3269 && err != REG_NOERROR, 0))
3273 while (ent++->more);
3277 /* Build transition table for the state.
3278 Return 1 if succeeded, otherwise return NULL. */
3282 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3286 int ch, need_word_trtable = 0;
3287 unsigned int elem, mask;
3288 int dests_node_malloced = 0, dest_states_malloced = 0;
3289 Idx ndests; /* Number of the destination states from `state'. */
3290 re_dfastate_t **trtable;
3291 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3292 re_node_set follows, *dests_node;
3296 /* We build DFA states which corresponds to the destination nodes
3297 from `state'. `dests_node[i]' represents the nodes which i-th
3298 destination state contains, and `dests_ch[i]' represents the
3299 characters which i-th destination state accepts. */
3300 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3301 dests_node = (re_node_set *)
3302 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3305 dests_node = (re_node_set *)
3306 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3307 if (BE (dests_node == NULL, 0))
3309 dests_node_malloced = 1;
3311 dests_ch = (bitset *) (dests_node + SBC_MAX);
3313 /* Initialize transiton table. */
3314 state->word_trtable = state->trtable = NULL;
3316 /* At first, group all nodes belonging to `state' into several
3318 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3321 if (dests_node_malloced)
3323 /* Return 0 in case of an error, 1 otherwise. */
3326 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3332 err = re_node_set_alloc (&follows, ndests + 1);
3333 if (BE (err != REG_NOERROR, 0))
3336 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3337 + ndests * 3 * sizeof (re_dfastate_t *)))
3338 dest_states = (re_dfastate_t **)
3339 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3342 dest_states = (re_dfastate_t **)
3343 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3344 if (BE (dest_states == NULL, 0))
3347 if (dest_states_malloced)
3349 re_node_set_free (&follows);
3350 for (i = 0; i < ndests; ++i)
3351 re_node_set_free (dests_node + i);
3352 if (dests_node_malloced)
3356 dest_states_malloced = 1;
3358 dest_states_word = dest_states + ndests;
3359 dest_states_nl = dest_states_word + ndests;
3360 bitset_empty (acceptable);
3362 /* Then build the states for all destinations. */
3363 for (i = 0; i < ndests; ++i)
3366 re_node_set_empty (&follows);
3367 /* Merge the follows of this destination states. */
3368 for (j = 0; j < dests_node[i].nelem; ++j)
3370 next_node = dfa->nexts[dests_node[i].elems[j]];
3371 if (next_node != REG_MISSING)
3373 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3374 if (BE (err != REG_NOERROR, 0))
3378 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3379 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3381 /* If the new state has context constraint,
3382 build appropriate states for these contexts. */
3383 if (dest_states[i]->has_constraint)
3385 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3387 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3390 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3391 need_word_trtable = 1;
3393 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3395 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3400 dest_states_word[i] = dest_states[i];
3401 dest_states_nl[i] = dest_states[i];
3403 bitset_merge (acceptable, dests_ch[i]);
3406 if (!BE (need_word_trtable, 0))
3408 /* We don't care about whether the following character is a word
3409 character, or we are in a single-byte character set so we can
3410 discern by looking at the character code: allocate a
3411 256-entry transition table. */
3412 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3413 if (BE (trtable == NULL, 0))
3416 /* For all characters ch...: */
3417 for (i = 0; i < BITSET_UINTS; ++i)
3418 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3420 mask <<= 1, elem >>= 1, ++ch)
3421 if (BE (elem & 1, 0))
3423 /* There must be exactly one destination which accepts
3424 character ch. See group_nodes_into_DFAstates. */
3425 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3428 /* j-th destination accepts the word character ch. */
3429 if (dfa->word_char[i] & mask)
3430 trtable[ch] = dest_states_word[j];
3432 trtable[ch] = dest_states[j];
3437 /* We care about whether the following character is a word
3438 character, and we are in a multi-byte character set: discern
3439 by looking at the character code: build two 256-entry
3440 transition tables, one starting at trtable[0] and one
3441 starting at trtable[SBC_MAX]. */
3442 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3443 if (BE (trtable == NULL, 0))
3446 /* For all characters ch...: */
3447 for (i = 0; i < BITSET_UINTS; ++i)
3448 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3450 mask <<= 1, elem >>= 1, ++ch)
3451 if (BE (elem & 1, 0))
3453 /* There must be exactly one destination which accepts
3454 character ch. See group_nodes_into_DFAstates. */
3455 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3458 /* j-th destination accepts the word character ch. */
3459 trtable[ch] = dest_states[j];
3460 trtable[ch + SBC_MAX] = dest_states_word[j];
3465 if (bitset_contain (acceptable, NEWLINE_CHAR))
3467 /* The current state accepts newline character. */
3468 for (j = 0; j < ndests; ++j)
3469 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3471 /* k-th destination accepts newline character. */
3472 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3473 if (need_word_trtable)
3474 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3475 /* There must be only one destination which accepts
3476 newline. See group_nodes_into_DFAstates. */
3481 if (dest_states_malloced)
3484 re_node_set_free (&follows);
3485 for (i = 0; i < ndests; ++i)
3486 re_node_set_free (dests_node + i);
3488 if (dests_node_malloced)
3494 /* Group all nodes belonging to STATE into several destinations.
3495 Then for all destinations, set the nodes belonging to the destination
3496 to DESTS_NODE[i] and set the characters accepted by the destination
3497 to DEST_CH[i]. This function return the number of destinations. */
3501 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3502 re_node_set *dests_node, bitset *dests_ch)
3507 Idx ndests; /* Number of the destinations from `state'. */
3508 bitset accepts; /* Characters a node can accept. */
3509 const re_node_set *cur_nodes = &state->nodes;
3510 bitset_empty (accepts);
3513 /* For all the nodes belonging to `state', */
3514 for (i = 0; i < cur_nodes->nelem; ++i)
3516 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3517 re_token_type_t type = node->type;
3518 unsigned int constraint = node->constraint;
3520 /* Enumerate all single byte character this node can accept. */
3521 if (type == CHARACTER)
3522 bitset_set (accepts, node->opr.c);
3523 else if (type == SIMPLE_BRACKET)
3525 bitset_merge (accepts, node->opr.sbcset);
3527 else if (type == OP_PERIOD)
3529 #ifdef RE_ENABLE_I18N
3530 if (dfa->mb_cur_max > 1)
3531 bitset_merge (accepts, dfa->sb_char);
3534 bitset_set_all (accepts);
3535 if (!(dfa->syntax & REG_DOT_NEWLINE))
3536 bitset_clear (accepts, '\n');
3537 if (dfa->syntax & REG_DOT_NOT_NULL)
3538 bitset_clear (accepts, '\0');
3540 #ifdef RE_ENABLE_I18N
3541 else if (type == OP_UTF8_PERIOD)
3543 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3544 if (!(dfa->syntax & REG_DOT_NEWLINE))
3545 bitset_clear (accepts, '\n');
3546 if (dfa->syntax & REG_DOT_NOT_NULL)
3547 bitset_clear (accepts, '\0');
3553 /* Check the `accepts' and sift the characters which are not
3554 match it the context. */
3557 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3559 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3560 bitset_empty (accepts);
3561 if (accepts_newline)
3562 bitset_set (accepts, NEWLINE_CHAR);
3566 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3568 bitset_empty (accepts);
3572 if (constraint & NEXT_WORD_CONSTRAINT)
3574 unsigned int any_set = 0;
3575 if (type == CHARACTER && !node->word_char)
3577 bitset_empty (accepts);
3580 #ifdef RE_ENABLE_I18N
3581 if (dfa->mb_cur_max > 1)
3582 for (j = 0; j < BITSET_UINTS; ++j)
3583 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3586 for (j = 0; j < BITSET_UINTS; ++j)
3587 any_set |= (accepts[j] &= dfa->word_char[j]);
3591 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3593 unsigned int any_set = 0;
3594 if (type == CHARACTER && node->word_char)
3596 bitset_empty (accepts);
3599 #ifdef RE_ENABLE_I18N
3600 if (dfa->mb_cur_max > 1)
3601 for (j = 0; j < BITSET_UINTS; ++j)
3602 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3605 for (j = 0; j < BITSET_UINTS; ++j)
3606 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3612 /* Then divide `accepts' into DFA states, or create a new
3613 state. Above, we make sure that accepts is not empty. */
3614 for (j = 0; j < ndests; ++j)
3616 bitset intersec; /* Intersection sets, see below. */
3618 /* Flags, see below. */
3619 int has_intersec, not_subset, not_consumed;
3621 /* Optimization, skip if this state doesn't accept the character. */
3622 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3625 /* Enumerate the intersection set of this state and `accepts'. */
3627 for (k = 0; k < BITSET_UINTS; ++k)
3628 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3629 /* And skip if the intersection set is empty. */
3633 /* Then check if this state is a subset of `accepts'. */
3634 not_subset = not_consumed = 0;
3635 for (k = 0; k < BITSET_UINTS; ++k)
3637 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3638 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3641 /* If this state isn't a subset of `accepts', create a
3642 new group state, which has the `remains'. */
3645 bitset_copy (dests_ch[ndests], remains);
3646 bitset_copy (dests_ch[j], intersec);
3647 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3648 if (BE (err != REG_NOERROR, 0))
3653 /* Put the position in the current group. */
3654 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3655 if (BE (result < 0, 0))
3658 /* If all characters are consumed, go to next node. */
3662 /* Some characters remain, create a new group. */
3665 bitset_copy (dests_ch[ndests], accepts);
3666 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3667 if (BE (err != REG_NOERROR, 0))
3670 bitset_empty (accepts);
3675 for (j = 0; j < ndests; ++j)
3676 re_node_set_free (dests_node + j);
3680 #ifdef RE_ENABLE_I18N
3681 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3682 Return the number of the bytes the node accepts.
3683 STR_IDX is the current index of the input string.
3685 This function handles the nodes which can accept one character, or
3686 one collating element like '.', '[a-z]', opposite to the other nodes
3687 can only accept one byte. */
3691 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3692 const re_string_t *input, Idx str_idx)
3694 const re_token_t *node = dfa->nodes + node_idx;
3695 int char_len, elem_len;
3698 if (BE (node->type == OP_UTF8_PERIOD, 0))
3700 unsigned char c = re_string_byte_at (input, str_idx), d;
3701 if (BE (c < 0xc2, 1))
3704 if (str_idx + 2 > input->len)
3707 d = re_string_byte_at (input, str_idx + 1);
3709 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3713 if (c == 0xe0 && d < 0xa0)
3719 if (c == 0xf0 && d < 0x90)
3725 if (c == 0xf8 && d < 0x88)
3731 if (c == 0xfc && d < 0x84)
3737 if (str_idx + char_len > input->len)
3740 for (i = 1; i < char_len; ++i)
3742 d = re_string_byte_at (input, str_idx + i);
3743 if (d < 0x80 || d > 0xbf)
3749 char_len = re_string_char_size_at (input, str_idx);
3750 if (node->type == OP_PERIOD)
3754 /* FIXME: I don't think this if is needed, as both '\n'
3755 and '\0' are char_len == 1. */
3756 /* '.' accepts any one character except the following two cases. */
3757 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3758 re_string_byte_at (input, str_idx) == '\n') ||
3759 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3760 re_string_byte_at (input, str_idx) == '\0'))
3765 elem_len = re_string_elem_size_at (input, str_idx);
3766 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3769 if (node->type == COMPLEX_BRACKET)
3771 const re_charset_t *cset = node->opr.mbcset;
3773 const unsigned char *pin
3774 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3779 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3780 ? re_string_wchar_at (input, str_idx) : 0);
3782 /* match with multibyte character? */
3783 for (i = 0; i < cset->nmbchars; ++i)
3784 if (wc == cset->mbchars[i])
3786 match_len = char_len;
3787 goto check_node_accept_bytes_match;
3789 /* match with character_class? */
3790 for (i = 0; i < cset->nchar_classes; ++i)
3792 wctype_t wt = cset->char_classes[i];
3793 if (__iswctype (wc, wt))
3795 match_len = char_len;
3796 goto check_node_accept_bytes_match;
3801 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3804 unsigned int in_collseq = 0;
3805 const int32_t *table, *indirect;
3806 const unsigned char *weights, *extra;
3807 const char *collseqwc;
3809 /* This #include defines a local function! */
3810 # include <locale/weight.h>
3812 /* match with collating_symbol? */
3813 if (cset->ncoll_syms)
3814 extra = (const unsigned char *)
3815 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3816 for (i = 0; i < cset->ncoll_syms; ++i)
3818 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3819 /* Compare the length of input collating element and
3820 the length of current collating element. */
3821 if (*coll_sym != elem_len)
3823 /* Compare each bytes. */
3824 for (j = 0; j < *coll_sym; j++)
3825 if (pin[j] != coll_sym[1 + j])
3829 /* Match if every bytes is equal. */
3831 goto check_node_accept_bytes_match;
3837 if (elem_len <= char_len)
3839 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3840 in_collseq = __collseq_table_lookup (collseqwc, wc);
3843 in_collseq = find_collation_sequence_value (pin, elem_len);
3845 /* match with range expression? */
3846 for (i = 0; i < cset->nranges; ++i)
3847 if (cset->range_starts[i] <= in_collseq
3848 && in_collseq <= cset->range_ends[i])
3850 match_len = elem_len;
3851 goto check_node_accept_bytes_match;
3854 /* match with equivalence_class? */
3855 if (cset->nequiv_classes)
3857 const unsigned char *cp = pin;
3858 table = (const int32_t *)
3859 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3860 weights = (const unsigned char *)
3861 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3862 extra = (const unsigned char *)
3863 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3864 indirect = (const int32_t *)
3865 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3866 idx = findidx (&cp);
3868 for (i = 0; i < cset->nequiv_classes; ++i)
3870 int32_t equiv_class_idx = cset->equiv_classes[i];
3871 size_t weight_len = weights[idx];
3872 if (weight_len == weights[equiv_class_idx])
3875 while (cnt <= weight_len
3876 && (weights[equiv_class_idx + 1 + cnt]
3877 == weights[idx + 1 + cnt]))
3879 if (cnt > weight_len)
3881 match_len = elem_len;
3882 goto check_node_accept_bytes_match;
3891 /* match with range expression? */
3893 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3895 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3898 for (i = 0; i < cset->nranges; ++i)
3900 cmp_buf[0] = cset->range_starts[i];
3901 cmp_buf[4] = cset->range_ends[i];
3902 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3903 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3905 match_len = char_len;
3906 goto check_node_accept_bytes_match;
3910 check_node_accept_bytes_match:
3911 if (!cset->non_match)
3918 return (elem_len > char_len) ? elem_len : char_len;
3926 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3928 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3933 /* No valid character. Match it as a single byte character. */
3934 const unsigned char *collseq = (const unsigned char *)
3935 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3936 return collseq[mbs[0]];
3943 const unsigned char *extra = (const unsigned char *)
3944 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3945 int32_t extrasize = (const unsigned char *)
3946 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3948 for (idx = 0; idx < extrasize;)
3950 int mbs_cnt, found = 0;
3951 int32_t elem_mbs_len;
3952 /* Skip the name of collating element name. */
3953 idx = idx + extra[idx] + 1;
3954 elem_mbs_len = extra[idx++];
3955 if (mbs_len == elem_mbs_len)
3957 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3958 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3960 if (mbs_cnt == elem_mbs_len)
3961 /* Found the entry. */
3964 /* Skip the byte sequence of the collating element. */
3965 idx += elem_mbs_len;
3966 /* Adjust for the alignment. */
3967 idx = (idx + 3) & ~3;
3968 /* Skip the collation sequence value. */
3969 idx += sizeof (uint32_t);
3970 /* Skip the wide char sequence of the collating element. */
3971 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3972 /* If we found the entry, return the sequence value. */
3974 return *(uint32_t *) (extra + idx);
3975 /* Skip the collation sequence value. */
3976 idx += sizeof (uint32_t);
3982 #endif /* RE_ENABLE_I18N */
3984 /* Check whether the node accepts the byte which is IDX-th
3985 byte of the INPUT. */
3989 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3993 ch = re_string_byte_at (&mctx->input, idx);
3997 if (node->opr.c != ch)
4001 case SIMPLE_BRACKET:
4002 if (!bitset_contain (node->opr.sbcset, ch))
4006 #ifdef RE_ENABLE_I18N
4007 case OP_UTF8_PERIOD:
4013 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4014 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4022 if (node->constraint)
4024 /* The node has constraints. Check whether the current context
4025 satisfies the constraints. */
4026 unsigned int context = re_string_context_at (&mctx->input, idx,
4028 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4035 /* Extend the buffers, if the buffers have run out. */
4037 static reg_errcode_t
4039 extend_buffers (re_match_context_t *mctx)
4042 re_string_t *pstr = &mctx->input;
4044 /* Double the lengthes of the buffers. */
4045 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4046 if (BE (ret != REG_NOERROR, 0))
4049 if (mctx->state_log != NULL)
4051 /* And double the length of state_log. */
4052 /* XXX We have no indication of the size of this buffer. If this
4053 allocation fail we have no indication that the state_log array
4054 does not have the right size. */
4055 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4056 pstr->bufs_len + 1);
4057 if (BE (new_array == NULL, 0))
4059 mctx->state_log = new_array;
4062 /* Then reconstruct the buffers. */
4065 #ifdef RE_ENABLE_I18N
4066 if (pstr->mb_cur_max > 1)
4068 ret = build_wcs_upper_buffer (pstr);
4069 if (BE (ret != REG_NOERROR, 0))
4073 #endif /* RE_ENABLE_I18N */
4074 build_upper_buffer (pstr);
4078 #ifdef RE_ENABLE_I18N
4079 if (pstr->mb_cur_max > 1)
4080 build_wcs_buffer (pstr);
4082 #endif /* RE_ENABLE_I18N */
4084 if (pstr->trans != NULL)
4085 re_string_translate_buffer (pstr);
4092 /* Functions for matching context. */
4094 /* Initialize MCTX. */
4096 static reg_errcode_t
4098 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4100 mctx->eflags = eflags;
4101 mctx->match_last = REG_MISSING;
4104 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4105 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4106 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4109 /* Already zero-ed by the caller.
4111 mctx->bkref_ents = NULL;
4112 mctx->nbkref_ents = 0;
4113 mctx->nsub_tops = 0; */
4114 mctx->abkref_ents = n;
4115 mctx->max_mb_elem_len = 1;
4116 mctx->asub_tops = n;
4120 /* Clean the entries which depend on the current input in MCTX.
4121 This function must be invoked when the matcher changes the start index
4122 of the input, or changes the input string. */
4126 match_ctx_clean (re_match_context_t *mctx)
4129 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4132 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4133 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4135 re_sub_match_last_t *last = top->lasts[sl_idx];
4136 re_free (last->path.array);
4139 re_free (top->lasts);
4142 re_free (top->path->array);
4143 re_free (top->path);
4148 mctx->nsub_tops = 0;
4149 mctx->nbkref_ents = 0;
4152 /* Free all the memory associated with MCTX. */
4156 match_ctx_free (re_match_context_t *mctx)
4158 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4159 match_ctx_clean (mctx);
4160 re_free (mctx->sub_tops);
4161 re_free (mctx->bkref_ents);
4164 /* Add a new backreference entry to MCTX.
4165 Note that we assume that caller never call this function with duplicate
4166 entry, and call with STR_IDX which isn't smaller than any existing entry.
4169 static reg_errcode_t
4171 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4174 if (mctx->nbkref_ents >= mctx->abkref_ents)
4176 struct re_backref_cache_entry* new_entry;
4177 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4178 mctx->abkref_ents * 2);
4179 if (BE (new_entry == NULL, 0))
4181 re_free (mctx->bkref_ents);
4184 mctx->bkref_ents = new_entry;
4185 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4186 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4187 mctx->abkref_ents *= 2;
4189 if (mctx->nbkref_ents > 0
4190 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4191 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4193 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4194 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4195 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4196 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4198 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4199 If bit N is clear, means that this entry won't epsilon-transition to
4200 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4201 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4204 A backreference does not epsilon-transition unless it is empty, so set
4205 to all zeros if FROM != TO. */
4206 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4207 = (from == to ? -1 : 0);
4209 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4210 if (mctx->max_mb_elem_len < to - from)
4211 mctx->max_mb_elem_len = to - from;
4215 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4216 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4220 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4222 Idx left, right, mid, last;
4223 last = right = mctx->nbkref_ents;
4224 for (left = 0; left < right;)
4226 mid = (left + right) / 2;
4227 if (mctx->bkref_ents[mid].str_idx < str_idx)
4232 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4238 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4241 static reg_errcode_t
4243 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4246 assert (mctx->sub_tops != NULL);
4247 assert (mctx->asub_tops > 0);
4249 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4251 Idx new_asub_tops = mctx->asub_tops * 2;
4252 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4253 re_sub_match_top_t *,
4255 if (BE (new_array == NULL, 0))
4257 mctx->sub_tops = new_array;
4258 mctx->asub_tops = new_asub_tops;
4260 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4261 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4263 mctx->sub_tops[mctx->nsub_tops]->node = node;
4264 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4268 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4269 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4271 static re_sub_match_last_t *
4273 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4275 re_sub_match_last_t *new_entry;
4276 if (BE (subtop->nlasts == subtop->alasts, 0))
4278 Idx new_alasts = 2 * subtop->alasts + 1;
4279 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4280 re_sub_match_last_t *,
4282 if (BE (new_array == NULL, 0))
4284 subtop->lasts = new_array;
4285 subtop->alasts = new_alasts;
4287 new_entry = re_calloc (re_sub_match_last_t, 1);
4288 if (BE (new_entry != NULL, 1))
4290 subtop->lasts[subtop->nlasts] = new_entry;
4291 new_entry->node = node;
4292 new_entry->str_idx = str_idx;
4300 sift_ctx_init (re_sift_context_t *sctx,
4301 re_dfastate_t **sifted_sts,
4302 re_dfastate_t **limited_sts,
4303 Idx last_node, Idx last_str_idx)
4305 sctx->sifted_states = sifted_sts;
4306 sctx->limited_states = limited_sts;
4307 sctx->last_node = last_node;
4308 sctx->last_str_idx = last_str_idx;
4309 re_node_set_init_empty (&sctx->limits);